diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..2b00788b1 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignEscapedNewlines: Right +AlignOperands: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Custom +BraceWrapping: + AfterCaseLabel: false + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false +BreakBeforeBinaryOperators: None +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma +ColumnLimit: 100 +CompactNamespaces: false +ContinuationIndentWidth: 2 +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^$' + Priority: 4 + - Regex: '^` to handle host/device code divergence. +- NVIDIA/thrust#1752: Remove a leftover merge conflict from a documentation + file. Thanks to @tabedzki for this contribution. + +## Thrust 1.17.2 + +### Summary + +Thrust 1.17.2 is a minor bugfix release that provides an updated version of CUB. + +## Thrust 1.17.1 + +### Summary + +Thrust 1.17.1 is a minor bugfix release that provides an updated version of CUB. + +## Thrust 1.17.0 + +### Summary + +Thrust 1.17.0 is the final minor release of the 1.X series. 
This release +provides GDB pretty-printers for device vectors/references, a new `unique_count` +algorithm, and an easier way to create tagged Thrust iterators. Several +documentation fixes are included, which can be found on the new Thrust +documentation site at https://nvidia.github.io/thrust. We'll be migrating +existing documentation sources to this new location over the next few months. + +### New Features + +- NVIDIA/thrust#1586: Add new `thrust::make_tagged_iterator` convenience + function. Thanks to @karthikeyann for this contribution. +- NVIDIA/thrust#1619: Add `unique_count` algorithm. Thanks to @upsj for this + contribution. +- NVIDIA/thrust#1631: Add GDB pretty-printers for device vectors/references + to `scripts/gdb-pretty-printers.py`. Thanks to @upsj for this contribution. + +### Bug Fixes + +- NVIDIA/thrust#1671: Fixed `reduce_by_key` when called with 2^31 elements. + +### Other Enhancements + +- NVIDIA/thrust#1512: Use CUB to implement `adjacent_difference`. +- NVIDIA/thrust#1555: Use CUB to implement `scan_by_key`. +- NVIDIA/thrust#1611: Add new doxybook-based Thrust documentation + at https://nvidia.github.io/thrust. +- NVIDIA/thrust#1639: Fixed broken link in documentation. Thanks to @jrhemstad + for this contribution. +- NVIDIA/thrust#1644: Increase contrast of search input text in new doc site. + Thanks to @bdice for this contribution. +- NVIDIA/thrust#1647: Add `__forceinline__` annotations to a functor wrapper. + Thanks to @mkuron for this contribution. +- NVIDIA/thrust#1660: Fixed typo in documentation example for + `permutation_iterator`. +- NVIDIA/thrust#1669: Add a new `explicit_cuda_stream.cu` example that shows how + to use explicit CUDA streams and `par`/`par_nosync` execution policies. + +## Thrust 1.16.0 + +### Summary + +Thrust 1.16.0 provides a new “nosync” hint for the CUDA backend, as well as +numerous bugfixes and stability improvements. 
+ +#### New `thrust::cuda::par_nosync` Execution Policy + +Most of Thrust's parallel algorithms are fully synchronous and will block the +calling CPU thread until all work is completed. This design avoids many pitfalls +associated with asynchronous GPU programming, resulting in simpler and +less error-prone usage for new CUDA developers. Unfortunately, this improvement +in user experience comes at a performance cost that often frustrates more +experienced CUDA programmers. + +Prior to this release, the only synchronous-to-asynchronous migration path for +existing Thrust codebases involved significant refactoring, replacing calls +to `thrust` algorithms with a limited set of `future`-based `thrust::async` +algorithms or lower-level CUB kernels. The new `thrust::cuda::par_nosync` +execution policy provides a new, less-invasive entry point for asynchronous +computation. + +`par_nosync` is a hint to the Thrust execution engine that any non-essential +internal synchronizations should be skipped and that an explicit synchronization +will be performed by the caller before accessing results. + +While some Thrust algorithms require internal synchronization to safely compute +their results, many do not. For example, multiple `thrust::for_each` invocations +can be launched without waiting for earlier calls to complete: + +```cpp +// Queue three `for_each` kernels: +thrust::for_each(thrust::cuda::par_nosync, vec1.begin(), vec1.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec2.begin(), vec2.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec3.begin(), vec3.end(), Op{}); + +// Do other work while kernels execute: +do_something(); + +// Must explicitly synchronize before accessing `for_each` results: +cudaDeviceSynchronize(); +``` + +Thanks to @fkallen for this contribution. 
+ +### Deprecation Notices + +#### CUDA Dynamic Parallelism Support + +**A future version of Thrust will remove support for CUDA Dynamic Parallelism +(CDP).** + +This will only affect calls to Thrust algorithms made from CUDA device-side code +that currently launches a kernel; such calls will instead execute sequentially +on the calling GPU thread instead of launching a device-wide kernel. + +### Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. +- NVIDIA/thrust#1572: Removed several unnecessary header includes. Downstream + projects may need to update their includes if they were relying on this + behavior. + +### New Features + +- NVIDIA/thrust#1568: Add `thrust::cuda::par_nosync` policy. Thanks to @fkallen + for this contribution. + +### Enhancements + +- NVIDIA/thrust#1511: Use CUB's new `DeviceMergeSort` API and remove Thrust's + internal implementation. +- NVIDIA/thrust#1566: Improved performance of `thrust::shuffle`. Thanks to + @djns99 for this contribution. +- NVIDIA/thrust#1584: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in + Thrust's CMake install rules. Thanks to @robertmaynard for this contribution. + +### Bug Fixes + +- NVIDIA/thrust#1496: Fix some issues affecting `icc` builds. +- NVIDIA/thrust#1552: Fix some collisions with the `min`/`max` macros defined + in `windows.h`. +- NVIDIA/thrust#1582: Fix issue with function type alias on 32-bit MSVC builds. +- NVIDIA/thrust#1591: Workaround issue affecting compilation with `nvc++`. +- NVIDIA/thrust#1597: Fix some collisions with the `small` macro defined + in `windows.h`. +- NVIDIA/thrust#1599, NVIDIA/thrust#1603: Fix some issues with version handling + in Thrust's CMake packages. 
+- NVIDIA/thrust#1614: Clarify that scan algorithm results are non-deterministic + for pseudo-associative operators (e.g. floating-point addition). + +## Thrust 1.15.0 + +### Summary + +Thrust 1.15.0 provides numerous bugfixes, including non-numeric +`thrust::sequence` support, several MSVC-related compilation fixes, fewer +conversion warnings, `counting_iterator` initialization, and documentation +updates. + +### Deprecation Notices + +**A future version of Thrust will remove support for CUDA Dynamic Parallelism +(CDP).** + +This will only affect calls to Thrust algorithms made from CUDA device-side code +that currently launches a kernel; such calls will instead execute sequentially +on the calling GPU thread instead of launching a device-wide kernel. + +### Bug Fixes + +- NVIDIA/thrust#1507: Allow `thrust::sequence` to work with non-numeric types. + Thanks to Ben Jude (@bjude) for this contribution. +- NVIDIA/thrust#1509: Avoid macro collision when calling `max()` on MSVC. Thanks + to Thomas (@tomintheshell) for this contribution. +- NVIDIA/thrust#1514: Initialize all members in `counting_iterator`'s default + constructor. +- NVIDIA/thrust#1518: Fix `std::allocator_traits` on MSVC + C++17. +- NVIDIA/thrust#1530: Fix several `-Wconversion` warnings. Thanks to Matt + Stack (@matt-stack) for this contribution. +- NVIDIA/thrust#1539: Fixed typo in `thrust::for_each` documentation. Thanks to + Salman (@untamedImpala) for this contribution. +- NVIDIA/thrust#1548: Avoid name collision with `B0` macro in termios.h system + header. Thanks to Philip Deegan (@PhilipDeegan) for this contribution. + +## Thrust 1.14.0 (NVIDIA HPC SDK 21.9) + +Thrust 1.14.0 is a major release accompanying the NVIDIA HPC SDK 21.9. + +This release adds the ability to wrap the `thrust::` namespace in an external +namespace, providing a workaround for a variety of shared library linking +issues. Thrust also learned to detect when CUB's symbols are in a wrapped +namespace and properly import them. 
To enable this feature, use +`#define THRUST_CUB_WRAPPED_NAMESPACE foo` to wrap both Thrust and CUB in the +`foo::` namespace. See `thrust/detail/config/namespace.h` for details and more +namespace options. + +Several bugfixes are also included: The `tuple_size` and `tuple_element` helpers +now support cv-qualified types. `scan_by_key` uses less memory. +`thrust::iterator_traits` is better integrated with `std::iterator_traits`. +See below for more details and references. + +### Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. + +### New Features + +- NVIDIA/thrust#1464: Add preprocessor hooks that allow `thrust::` to be wrapped + in an external namespace, and support cases when CUB is wrapped in an external + namespace. + +### Bug Fixes + +- NVIDIA/thrust#1457: Support cv-qualified types in `thrust::tuple_size` and + `thrust::tuple_element`. Thanks to Jake Hemstad for this contribution. +- NVIDIA/thrust#1471: Fixed excessive memory allocation in `scan_by_key`. Thanks + to Lilo Huang for this contribution. +- NVIDIA/thrust#1476: Removed dead code from the `expand` example. Thanks to + Lilo Huang for this contribution. +- NVIDIA/thrust#1488: Fixed the path to the installed CUB headers in the CMake + `find_package` configuration files. +- NVIDIA/thrust#1491: Fallback to `std::iterator_traits` when no + `thrust::iterator_traits` specialization exists for an iterator type. Thanks + to Divye Gala for this contribution. + +## Thrust 1.13.1 (CUDA Toolkit 11.5) + +Thrust 1.13.1 is a minor release accompanying the CUDA Toolkit 11.5. + +This release provides a new hook for embedding the `thrust::` namespace inside a +custom namespace. This is intended to work around various issues related to +linking multiple shared libraries that use Thrust. 
The existing `CUB_NS_PREFIX` +and `CUB_NS_POSTFIX` macros already provided this capability for CUB; this +update provides a simpler mechanism that is extended to and integrated with +Thrust. Simply define `THRUST_CUB_WRAPPED_NAMESPACE` to a namespace name, and +both `thrust::` and `cub::` will be placed inside the new namespace. Using +different wrapped namespaces for each shared library will prevent issues like +those reported in NVIDIA/thrust#1401. + +### New Features + +- NVIDIA/thrust#1464: Add `THRUST_CUB_WRAPPED_NAMESPACE` hooks. + +### Bug Fixes + +- NVIDIA/thrust#1488: Fix path to installed CUB in Thrust's CMake config files. + +## Thrust 1.13.0 (NVIDIA HPC SDK 21.7) + +Thrust 1.13.0 is the major release accompanying the NVIDIA HPC SDK 21.7 release. +Notable changes include `bfloat16` radix sort support (via `thrust::sort`) and + memory handling fixes in the `reserve` method of Thrust's vectors. +The `CONTRIBUTING.md` file has been expanded to include instructions for + building CUB as a component of Thrust, and API documentation now refers to + [cppreference](https://cppreference.com) instead of SGI's old STL reference. + +### Breaking Changes + +- NVIDIA/thrust#1459: Remove deprecated aliases `thrust::host_space_tag` and + `thrust::device_space_tag`. Use the equivalent `thrust::host_system_tag` and + `thrust::device_system_tag` instead. + +### New Features + +- NVIDIA/cub#306: Add radix-sort support for `bfloat16` in `thrust::sort`. + Thanks to Xiang Gao (@zasdfgbnm) for this contribution. +- NVIDIA/thrust#1423: `thrust::transform_iterator` now supports non-copyable + types. Thanks to Jake Hemstad (@jrhemstad) for this contribution. +- NVIDIA/thrust#1459: Introduce a new `THRUST_IGNORE_DEPRECATED_API` macro that + disables deprecation warnings on Thrust and CUB APIs. + +### Bug Fixes + +- NVIDIA/cub#277: Fixed sanitizer warnings when `thrust::sort` calls + into `cub::DeviceRadixSort`. Thanks to Andy Adinets (@canonizer) for this + contribution. 
+- NVIDIA/thrust#1442: Reduce extraneous comparisons in `thrust::sort`'s merge + sort implementation. +- NVIDIA/thrust#1447: Fix memory leak and avoid overallocation when + calling `reserve` on Thrust's vector containers. Thanks to Kai Germaschewski + (@germasch) for this contribution. + +### Other Enhancements + +- NVIDIA/thrust#1405: Update links to standard C++ documentation from sgi to + cppreference. Thanks to Muhammad Adeel Hussain (@AdeilH) for this + contribution. +- NVIDIA/thrust#1432: Updated build instructions in `CONTRIBUTING.md` to include + details on building CUB's test suite as part of Thrust. + +## Thrust 1.12.1 (CUDA Toolkit 11.4) + +Thrust 1.12.1 is a trivial patch release that slightly changes the phrasing of +a deprecation message. + +## Thrust 1.12.0 (NVIDIA HPC SDK 21.3) + +Thrust 1.12.0 is the major release accompanying the NVIDIA HPC SDK 21.3 + and the CUDA Toolkit 11.4. +It includes a new `thrust::universal_vector`, which holds data that is + accessible from both host and device. This allows users to easily leverage + CUDA's unified memory with Thrust. +New asynchronous `thrust::async::exclusive_scan` and `inclusive_scan` algorithms + have been added, and the synchronous versions of these have been updated to + use `cub::DeviceScan` directly. +CUB radix sort for floating point types is now stable when both +0.0 and -0.0 + are present in the input. This affects some usages of `thrust::sort` and + `thrust::stable_sort`. +Many compilation warnings and subtle overflow bugs were fixed in the device + algorithms, including a long-standing bug that returned invalid temporary + storage requirements when `num_items` was close to (but not + exceeding) `INT32_MAX`. +This release deprecates support for Clang < 7.0 and MSVC < 2019 (aka + 19.20/16.0/14.20). + +### Breaking Changes + +- NVIDIA/thrust#1372: Deprecate Clang < 7 and MSVC < 2019. +- NVIDIA/thrust#1376: Standardize `thrust::scan_by_key` functors / accumulator + types. 
+ This may change the results from `scan_by_key` when input, output, and + initial value types are not the same type. + +### New Features + +- NVIDIA/thrust#1251: Add two new `thrust::async::` algorithms: `inclusive_scan` + and `exclusive_scan`. +- NVIDIA/thrust#1334: Add `thrust::universal_vector`, `universal_ptr`, + and `universal_allocator`. + +### Bug Fixes + +- NVIDIA/thrust#1347: Qualify calls to `make_reverse_iterator`. +- NVIDIA/thrust#1359: Enable stricter warning flags. This fixes several + outstanding issues: + - NVIDIA/cub#221: Overflow in `temp_storage_bytes` when `num_items` close to + (but not over) `INT32_MAX`. + - NVIDIA/cub#228: CUB uses non-standard C++ extensions that break strict + compilers. + - NVIDIA/cub#257: Warning when compiling `GridEvenShare` with unsigned + offsets. + - NVIDIA/thrust#974: Conversion warnings in `thrust::transform_reduce`. + - NVIDIA/thrust#1091: Conversion warnings in `thrust::counting_iterator`. +- NVIDIA/thrust#1373: Fix compilation error when a standard library type is + wrapped in `thrust::optional`. + Thanks to Vukasin Milovanovic for this contribution. +- NVIDIA/thrust#1388: Fix `signbit(double)` implementation on MSVC. +- NVIDIA/thrust#1389: Support building Thrust tests without CUDA enabled. + +### Other Enhancements + +- NVIDIA/thrust#1304: Use `cub::DeviceScan` to implement + `thrust::exclusive_scan` and `thrust::inclusive_scan`. +- NVIDIA/thrust#1362, NVIDIA/thrust#1370: Update smoke test naming. +- NVIDIA/thrust#1380: Fix typos in `set_operation` documentation. + Thanks to Hongyu Cai for this contribution. +- NVIDIA/thrust#1383: Include FreeBSD license in LICENSE.md for + `thrust::complex` implementation. +- NVIDIA/thrust#1384: Add missing precondition to `thrust::gather` + documentation. + +## Thrust 1.11.0 (CUDA Toolkit 11.3) + +Thrust 1.11.0 is a major release providing bugfixes and performance + enhancements. 
+It includes a new sort algorithm that provides up to 2x more performance + from `thrust::sort` when used with certain key types and hardware. +The new `thrust::shuffle` algorithm has been tweaked to improve the randomness + of the output. +Our CMake package and build system continue to see improvements with + better `add_subdirectory` support, installation rules, status messages, and + other features that make Thrust easier to use from CMake projects. +The release includes several other bugfixes and modernizations, and received + updates from 12 contributors. + +### New Features + +- NVIDIA/cub#204: New implementation for `thrust::sort` on CUDA when using + 32/64-bit numeric keys on Pascal and up (SM60+). + This improved radix sort algorithm provides up to 2x more performance. + Thanks to Andy Adinets for this contribution. +- NVIDIA/thrust#1310, NVIDIA/thrust#1312: Various tuple-related APIs have been + updated to use variadic templates. + Thanks to Andrew Corrigan for these contributions. +- NVIDIA/thrust#1297: Optionally add install rules when included with + CMake's `add_subdirectory`. + Thanks to Kai Germaschewski for this contribution. + +### Bug Fixes + +- NVIDIA/thrust#1309: Fix `thrust::shuffle` to produce better quality random + distributions. + Thanks to Rory Mitchell and Daniel Stokes for this contribution. +- NVIDIA/thrust#1337: Fix compile-time regression in `transform_inclusive_scan` + and `transform_exclusive_scan`. +- NVIDIA/thrust#1306: Fix binary search `middle` calculation to avoid overflows. + Thanks to Richard Barnes for this contribution. +- NVIDIA/thrust#1314: Use `size_t` for the index type parameter + in `thrust::tuple_element`. + Thanks to Andrew Corrigan for this contribution. +- NVIDIA/thrust#1329: Fix runtime error when copying an empty + `thrust::device_vector` in MSVC Debug builds. + Thanks to Ben Jude for this contribution. +- NVIDIA/thrust#1323: Fix and add test for cmake package install rules. 
+ Thanks to Keith Kraus and Kai Germaschewski for testing and discussion. +- NVIDIA/thrust#1338: Fix GCC version checks in `thrust::detail::is_pod` + implementation. + Thanks to Anatoliy Tomilov for this contribution. +- NVIDIA/thrust#1289: Partial fixes for Clang 10 as host compiler. + Filed an NVCC bug that will be fixed in a future version of the CUDA Toolkit + (NVBug 3136307). +- NVIDIA/thrust#1272: Fix ambiguous `iter_swap` call when + using `thrust::partition` with STL containers. + Thanks to Isaac Deutsch for this contribution. +- NVIDIA/thrust#1281: Update our bundled `FindTBB.cmake` module to support + latest MSVC. +- NVIDIA/thrust#1298: Use semantic versioning rules for our CMake package's + compatibility checks. + Thanks to Kai Germaschewski for this contribution. +- NVIDIA/thrust#1300: Use `FindPackageHandleStandardArgs` to print standard + status messages when our CMake package is found. + Thanks to Kai Germaschewski for this contribution. +- NVIDIA/thrust#1320: Use feature-testing instead of a language dialect check + for `thrust::remove_cvref`. + Thanks to Andrew Corrigan for this contribution. +- NVIDIA/thrust#1319: Suppress GPU deprecation warnings. + +### Other Enhancements + +- NVIDIA/cub#213: Removed some tuning policies for unsupported hardware (` + specialization. + - The `thrust::intermediate_type_from_function_and_iterators` helper is no + longer needed and has been removed. +- NVIDIA/thrust#1255: Always use `cudaStreamSynchronize` instead of + `cudaDeviceSynchronize` if the execution policy has a stream attached to it. + Thanks to Rong Ou for this contribution. +- NVIDIA/thrust#1201: Tests for correct handling of legacy and per-thread + default streams. + Thanks to Rong Ou for this contribution. + +### Bug Fixes + +- NVIDIA/thrust#1260: Fix `thrust::transform_inclusive_scan` with heterogeneous + types. + Thanks to Rong Ou for this contribution. 
+- NVIDIA/thrust#1258, NVC++ FS #28463: Ensure the CUDA radix sort backend + synchronizes before returning; otherwise, copies from temporary storage will + race with destruction of said temporary storage. +- NVIDIA/thrust#1264: Evaluate `CUDA_CUB_RET_IF_FAIL` macro argument only once. + Thanks to Jason Lowe for this contribution. +- NVIDIA/thrust#1262: Add missing `` header. +- NVIDIA/thrust#1250: Restore some `THRUST_DECLTYPE_RETURNS` macros in async + test implementations. +- NVIDIA/thrust#1249: Use `std::iota` in `CUDATestDriver::target_devices`. + Thanks to Michael Francis for this contribution. +- NVIDIA/thrust#1244: Check for macro collisions with system headers during + header testing. +- NVIDIA/thrust#1224: Remove unnecessary SFINAE contexts from asynchronous + algorithms. +- NVIDIA/thrust#1190: Make `out_of_memory_recovery` test trigger faster. +- NVIDIA/thrust#1187: Eliminate superfluous iterators specific to the CUDA + backend. +- NVIDIA/thrust#1181: Various fixes for GoUDA. + Thanks to Andrei Tchouprakov for this contribution. +- NVIDIA/thrust#1178, NVIDIA/thrust#1229: Use transparent functionals in + placeholder expressions, fixing issues with `thrust::device_reference` and + placeholder expressions and `thrust::find` with asymmetric equality + operators. +- NVIDIA/thrust#1153: Switch to placement new instead of assignment to + construct items in uninitialized memory. + Thanks to Hugh Winkler for this contribution. +- NVIDIA/thrust#1050: Fix compilation of asynchronous algorithms when RDC is + enabled. +- NVIDIA/thrust#1042: Correct return type of + `thrust::detail::predicate_to_integral` from `bool` to `IntegralType`. + Thanks to Andreas Hehn for this contribution. +- NVIDIA/thrust#1009: Avoid returning uninitialized allocators. + Thanks to Zhihao Yuan for this contribution. +- NVIDIA/thrust#990: Add missing `` include to + ``. + Thanks to Robert Maynard for this contribution. 
+- NVIDIA/thrust#966: Fix spurious MSVC conversion with loss of data warning in + sort algorithms. + Thanks to Zhihao Yuan for this contribution. +- Add more metadata to mock specializations for testing iterator in + `testing/copy.cu`. +- Add missing include to shuffle unit test. +- Specialize `thrust::wrapped_function` for `void` return types because MSVC is + not a fan of the pattern `return static_cast(expr);`. +- Replace deprecated `tbb/tbb_thread.h` with ``. +- Fix overcounting of initial value in TBB scans. +- Use `thrust::advance` instead of `+=` for generic iterators. +- Wrap the OMP flags in `-Xcompiler` for NVCC +- Extend `ASSERT_STATIC_ASSERT` skip for the OMP backend. +- Add missing header caught by `tbb.cuda` configs. +- Fix "unsafe API" warnings in examples on MSVC: `s/fopen/fstream/` +- Various C++17 fixes. + +## Thrust 1.9.10-1 (NVIDIA HPC SDK 20.7, CUDA Toolkit 11.1) + +Thrust 1.9.10-1 is the minor release accompanying the NVIDIA HPC SDK 20.7 release + and the CUDA Toolkit 11.1 release. + +### Bug Fixes + +- #1214, NVBug 200619442: Stop using `std::allocator` APIs deprecated in C++17. +- #1216, NVBug 200540293: Make `thrust::optional` work with Clang when used + with older libstdc++. +- #1207, NVBug 200618218: Don't force C++14 with older compilers that don't + support it. +- #1218: Wrap includes of `` and `` to avoid circular + inclusion with NVC++. + +## Thrust 1.9.10 (NVIDIA HPC SDK 20.5) + +Thrust 1.9.10 is the release accompanying the NVIDIA HPC SDK 20.5 release. +It adds CMake support for compilation with NVC++ and a number of minor bug fixes + for NVC++. +It also adds CMake `find_package` support, which replaces the broken 3rd-party + legacy `FindThrust.cmake` script. +C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated. +Starting with the upcoming 1.10.0 release, C++03 support will be dropped + entirely. 
+ +### Breaking Changes + +- #1082: Thrust now checks that it is compatible with the version of CUB found + in your include path, generating an error if it is not. + If you are using your own version of CUB, it may be too old. + It is recommended to simply delete your own version of CUB and use the + version of CUB that comes with Thrust. +- #1089: C++03 and C++11 are deprecated. + Using these dialects will generate a compile-time warning. + These warnings can be suppressed by defining + `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11 + deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP_11` (to suppress C++11 + deprecation warnings). + Suppression is only a short term solution. + We will be dropping support for C++03 in the 1.10.0 release and C++11 in the + near future. +- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated. + Using these compilers will generate a compile-time warning. + These warnings can be suppressed by defining + `THRUST_IGNORE_DEPRECATED_COMPILER`. + Suppression is only a short term solution. + We will be dropping support for these compilers in the near future. + +### New Features + +- #1130: CMake `find_package` support. + This is significant because there is a legacy `FindThrust.cmake` script + authored by a third party in widespread use in the community which has a + bug in how it parses Thrust version numbers which will cause it to + incorrectly parse 1.9.10. + This script only handles the first digit of each part of the Thrust version + number correctly: for example, Thrust 17.17.17 would be interpreted as + Thrust 1.1.1701717. + You can find directions for using the new CMake `find_package` support and + migrating away from the legacy `FindThrust.cmake` [here](https://github.com/NVIDIA/thrust/blob/main/thrust/cmake/README.md). +- #1129: Added `thrust::detail::single_device_tls_caching_allocator`, a + convenient way to get an MR caching allocator for device memory, which is + used by NVC++. 
+ +### Other Enhancements + +- #1129: Refactored RDC handling in CMake to be a global option and not create + two targets for each example and test. + +### Bug Fixes + +- #1129: Fix the legacy `thrust::return_temporary_buffer` API to support + passing a size. + This was necessary to enable usage of Thrust caching MR allocators with + synchronous Thrust algorithms. + This change has allowed NVC++'s C++17 Parallel Algorithms implementation to + switch to use Thrust caching MR allocators for device temporary storage, + which gives a 2x speedup on large multi-GPU systems such as V100 and A100 + DGX where `cudaMalloc` is very slow. +- #1128: Respect `CUDA_API_PER_THREAD_DEFAULT_STREAM`. + Thanks to Rong Ou for this contribution. +- #1131: Fix the one-policy overload of `thrust::async::copy` to not copy the + policy, resolving use-after-move issues. +- #1145: When cleaning up type names in `unittest::base_class_name`, only call + `std::string::replace` if we found the substring we are looking to replace. +- #1139: Don't use `cxx::__demangle` in NVC++. +- #1102: Don't use `thrust::detail::normal_distribution_nvcc` for Feta because + it uses `erfcinv`, a non-standard function that Feta doesn't have. + +## Thrust 1.9.9 (CUDA Toolkit 11.0) + +Thrust 1.9.9 adds support for NVC++, which uses Thrust to implement + GPU-accelerated C++17 Parallel Algorithms. +`thrust::zip_function` and `thrust::shuffle` were also added. +C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated. +Starting with the upcoming 1.10.0 release, C++03 support will be dropped + entirely. +All other deprecated platforms will be dropped in the near future. + +### Breaking Changes + +- #1082: Thrust now checks that it is compatible with the version of CUB found + in your include path, generating an error if it is not. + If you are using your own version of CUB, it may be too old. + It is recommended to simply delete your own version of CUB and use the + version of CUB that comes with Thrust. 
+- #1089: C++03 and C++11 are deprecated. + Using these dialects will generate a compile-time warning. + These warnings can be suppressed by defining + `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11 + deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP_11` (to suppress C++11 + deprecation warnings). + Suppression is only a short term solution. + We will be dropping support for C++03 in the 1.10.0 release and C++11 in the + near future. +- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated. + Using these compilers will generate a compile-time warning. + These warnings can be suppressed by defining + `THRUST_IGNORE_DEPRECATED_COMPILER`. + Suppression is only a short term solution. + We will be dropping support for these compilers in the near future. + +### New Features + +- #1086: Support for NVC++ aka "Feta". + The most significant change is in how we use `__CUDA_ARCH__`. + Now, there are four macros that must be used: + - `THRUST_IS_DEVICE_CODE`, which should be used in an `if` statement around + device-only code. + - `THRUST_INCLUDE_DEVICE_CODE`, which should be used in an `#if` preprocessor + directive inside of the `if` statement mentioned in the prior bullet. + - `THRUST_IS_HOST_CODE`, which should be used in an `if` statement around + host-only code. + - `THRUST_INCLUDE_HOST_CODE`, which should be used in an `#if` preprocessor + directive inside of the `if` statement mentioned in the prior bullet. +- #1085: `thrust::shuffle`. + Thanks to Rory Mitchell for this contribution. +- #1029: `thrust::zip_function`, a facility for zipping functions that take N + parameters instead of a tuple of N parameters as `thrust::zip_iterator` + does. + Thanks to Ben Jude for this contribution. +- #1068: `thrust::system::cuda::managed_memory_pointer`, a universal memory + strongly typed pointer compatible with the ISO C++ Standard Library. + +### Other Enhancements + +- #1029: Thrust is now built and tested with NVCC warnings treated as errors. 
+- #1029: MSVC C++11 support. +- #1029: `THRUST_DEPRECATED` abstraction for generating compile-time + deprecation warning messages. +- #1029: `thrust::pointer::pointer_to(reference)`. +- #1070: Unit test for `thrust::inclusive_scan` with user-defined types. + Thanks to Conor Hoekstra for this contribution. + +### Bug Fixes + +- #1088: Allow `thrust::replace` to take functions that have non-`const` + `operator()`. +- #1094: Add missing `constexpr` to `par_t` constructors. + Thanks to Patrick Stotko for this contribution. +- #1077: Remove `__device__` from CUDA MR-based device allocators to fix + obscure "host function called from host device function" warning that occurs + when you use the new Thrust MR-based allocators. +- #1029: Remove inconsistently-used `THRUST_BEGIN`/`END_NS` macros. +- #1029: Fix C++ dialect detection on newer MSVC. +- #1029: Use `_Pragma`/`__pragma` instead of `#pragma` in macros. +- #1029: Replace raw `__cplusplus` checks with the appropriate Thrust macros. +- #1105: Add a missing `` include. +- #1103: Fix regression of `thrust::detail::temporary_allocator` with non-CUDA + back ends. +- #1111: Use Thrust's random number engine instead of `std::`s in device code. +- #1108: Get rid of a GCC 9 warning about deprecated generation of copy ctors. + +## Thrust 1.9.8-1 (NVIDIA HPC SDK 20.3) + +Thrust 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3 + release. +It contains modifications necessary to serve as the implementation of NVC++'s + GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 11.0 + release. + +## Thrust 1.9.8 (CUDA Toolkit 11.0 Early Access) + +Thrust 1.9.8, which is included in the CUDA Toolkit 11.0 release, removes + Thrust's internal derivative of CUB, upstreams all relevant changes to CUB, + and adds CUB as a Git submodule. +It will now be necessary to do `git clone --recursive` when checking out + Thrust, and to update the CUB submodule when pulling in new Thrust changes. 
+Additionally, CUB is now included as a first class citizen in the CUDA toolkit. +Thrust 1.9.8 also fixes bugs preventing most Thrust algorithms from working + with more than `2^31-1` elements. +Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of + Thrust) work with large element counts. + +### Breaking Changes + +- Thrust will now use the version of CUB in your include path instead of its own + internal copy. + If you are using your own version of CUB, it may be older and incompatible + with Thrust. + It is recommended to simply delete your own version of CUB and use the + version of CUB that comes with Thrust. + +### Other Enhancements + +- Refactor Thrust and CUB to support 64-bit indices in most algorithms. + In most cases, Thrust now selects between kernels that use 32-bit indices and + 64-bit indices at runtime depending on the size of the input. + This means large element counts work, but small element counts do not have to + pay for the register usage of 64-bit indices if they are not needed. + Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of + Thrust) work with more than `2^31-1` elements. + Notably, `thrust::sort` is still limited to less than `2^31-1` elements. +- CUB is now a submodule and the internal copy of CUB has been removed. +- #1051: Stop specifying the `__launch_bounds__` minimum blocks parameter + because it messes up register allocation and increases register pressure, + and we don't actually know at compile time how many blocks we will use + (aside from single tile kernels). + +### Bug Fixes + +- #1020: After making a CUDA API call, always clear the global CUDA error state + by calling `cudaGetLastError`. +- #1021: Avoid calling destroy in the destructor of a Thrust vector if the + vector is empty. +- #1046: Actually throw `thrust::bad_alloc` when `thrust::system::cuda::malloc` + fails instead of just constructing a temporary and doing nothing with it. 
+- Add missing copy constructor or copy assignment operator to all classes that + GCC 9's `-Wdeprecated-copy` complains about +- Add missing move operations to `thrust::system::cuda::vector`. +- #1015: Check that the backend is CUDA before using CUDA-specifics in + `thrust::detail::temporary_allocator`. + Thanks to Hugh Winkler for this contribution. +- #1055: More correctly detect the presence of aligned/sized `new`/`delete`. +- #1043: Fix ill-formed specialization of `thrust::system::is_error_code_enum` + for `thrust::event_errc`. + Thanks to Toru Niina for this contribution. +- #1027: Add tests for `thrust::tuple_for_each` and `thrust::tuple_subset`. + Thanks to Ben Jude for this contribution. +- #1027: Use correct macro in `thrust::tuple_for_each`. + Thanks to Ben Jude for this contribution. +- #1026: Use correct MSVC version formatting in CMake. + Thanks to Ben Jude for this contribution. +- Workaround an NVCC issue with type aliases with template template arguments + containing a parameter pack. +- Remove unused functions from the CUDA backend which call slow CUDA attribute + query APIs. +- Replace `CUB_RUNTIME_FUNCTION` with `THRUST_RUNTIME_FUNCTION`. +- Correct typo in `thrust::transform` documentation. + Thanks to Eden Yefet for this contribution. + +### Known Issues + +- `thrust::sort` remains limited to `2^31-1` elements for now. + +## Thrust 1.9.7-1 (CUDA Toolkit 10.2 for Tegra) + +Thrust 1.9.7-1 is a minor release accompanying the CUDA Toolkit 10.2 release + for Tegra. +It is nearly identical to 1.9.7. + +### Bug Fixes + +- Remove support for GCC's broken nodiscard-like attribute. + +## Thrust 1.9.7 (CUDA Toolkit 10.2) + +Thrust 1.9.7 is a minor release accompanying the CUDA Toolkit 10.2 release. 
+Unfortunately, although the version and patch numbers are identical, one bug + fix present in Thrust 1.9.7 (NVBug 2646034: Fix incorrect dependency handling + for stream acquisition in `thrust::future`) was not included in the CUDA + Toolkit 10.2 preview release for AArch64 SBSA. +The tag `cuda-10.2aarch64sbsa` contains the exact version of Thrust present + in the CUDA Toolkit 10.2 preview release for AArch64 SBSA. + +### Bug Fixes + +- #967, NVBug 2448170: Fix the CUDA backend `thrust::for_each` so that it + supports large input sizes with 64-bit indices. +- NVBug 2646034: Fix incorrect dependency handling for stream acquisition in + `thrust::future`. + - Not present in the CUDA Toolkit 10.2 preview release for AArch64 SBSA. +- #968, NVBug 2612102: Fix the `thrust::mr::polymorphic_adaptor` to actually + use its template parameter. + +## Thrust 1.9.6-1 (NVIDIA HPC SDK 20.3) + +Thrust 1.9.6-1 is a variant of 1.9.6 accompanying the NVIDIA HPC SDK 20.3 + release. +It contains modifications necessary to serve as the implementation of NVC++'s + GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 10.1 + Update 2 release. + +## Thrust 1.9.6 (CUDA Toolkit 10.1 Update 2) + +Thrust 1.9.6 is a minor release accompanying the CUDA Toolkit 10.1 Update 2 + release. + +### Bug Fixes + +- NVBug 2509847: Inconsistent alignment of `thrust::complex` +- NVBug 2586774: Compilation failure with Clang + older libstdc++ that doesn't + have `std::is_trivially_copyable` +- NVBug 200488234: CUDA header files contain Unicode characters which leads + to compilation errors on Windows +- #949, #973, NVBug 2422333, NVBug 2522259, NVBug 2528822: + `thrust::detail::aligned_reinterpret_cast` must be annotated with + `__host__ __device__`.
+- NVBug 2599629: Missing include in the OpenMP sort implementation +- NVBug 200513211: Truncation warning in test code under VC142 + +## Thrust 1.9.5 (CUDA Toolkit 10.1 Update 1) + +Thrust 1.9.5 is a minor release accompanying the CUDA Toolkit 10.1 Update 1 + release. + +### Bug Fixes + +- NVBug 2502854: Fixed assignment of + `thrust::device_vector>` between host and device. + +## Thrust 1.9.4 (CUDA Toolkit 10.1) + +Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new + allocator system including caching allocators and unified memory support, as + well as a variety of other enhancements, mostly related to + C++11/C++14/C++17/C++20 support. +The new asynchronous algorithms in the `thrust::async` namespace return + `thrust::event` or `thrust::future` objects, which can be waited upon to + synchronize with the completion of the parallel operation. + +### Breaking Changes + +Synchronous Thrust algorithms now block until all of their operations have + completed. +Use the new asynchronous Thrust algorithms for non-blocking behavior. + +### New Features + +- `thrust::event` and `thrust::future`, uniquely-owned asynchronous handles + consisting of a state (ready or not ready), content (some value; for + `thrust::future` only), and an optional set of objects that should be + destroyed only when the future's value is ready and has been consumed. + - The design is loosely based on C++11's `std::future`. + - They can be `.wait`'d on, and the value of a future can be waited on and + retrieved with `.get` or `.extract`. + - Multiple `thrust::event`s and `thrust::future`s can be combined with + `thrust::when_all`. + - `thrust::future`s can be converted to `thrust::event`s. + - Currently, these primitives are only implemented for the CUDA backend and + are C++11 only. +- New asynchronous algorithms that return `thrust::event`/`thrust::future`s, + implemented as C++20 range style customization points: + - `thrust::async::reduce`. 
+ - `thrust::async::reduce_into`, which takes a target location to store the + reduction result into. + - `thrust::async::copy`, including a two-policy overload that allows + explicit cross system copies which execution policy properties can be + attached to. + - `thrust::async::transform`. + - `thrust::async::for_each`. + - `thrust::async::stable_sort`. + - `thrust::async::sort`. + - By default the asynchronous algorithms use the new caching allocators. + Deallocation of temporary storage is deferred until the destruction of + the returned `thrust::future`. The content of `thrust::future`s is + stored in either device or universal memory and transferred to the host + only upon request to prevent unnecessary data migration. + - Asynchronous algorithms are currently only implemented for the CUDA + system and are C++11 only. +- `exec.after(f, g, ...)`, a new execution policy method that takes a set of + `thrust::event`/`thrust::future`s and returns an execution policy that + operations on that execution policy should depend upon. +- New logic and mindset for the type requirements for cross-system sequence + copies (currently only used by `thrust::async::copy`), based on: + - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR` + for detecting/indicating that an iterator points to contiguous storage. + - `thrust::is_trivially_relocatable` and + `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a + type is `memcpy`able (based on principles from + [P1144](https://wg21.link/P1144)). + - The new approach reduces buffering, increases performance, and increases + correctness. + - The fast path is now enabled when copying CUDA `__half` and vector types with + `thrust::async::copy`. +- All Thrust synchronous algorithms for the CUDA backend now actually + synchronize. 
Previously, any algorithm that did not allocate temporary + storage (counterexample: `thrust::sort`) and did not have a + computation-dependent result (counterexample: `thrust::reduce`) would + actually be launched asynchronously. Additionally, synchronous algorithms + that allocated temporary storage would become asynchronous if a custom + allocator was supplied that did not synchronize on allocation/deallocation, + unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`, + `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some + cases this may be a performance regression; if you need asynchrony, use the + new asynchronous algorithms. +- Thrust's allocator framework has been rewritten. It now uses a memory + resource system, similar to C++17's `std::pmr` but supporting static + polymorphism. Memory resources are objects that allocate untyped storage and + allocators are cheap handles to memory resources in this new model. The new + facilities live in ``. + - `thrust::mr::memory_resource`, the memory resource base class, + which takes a (possibly tagged) pointer to `void` type as a parameter. + - `thrust::mr::allocator`, an allocator backed by a memory + resource object. + - `thrust::mr::polymorphic_adaptor_resource`, a type-erased memory + resource adaptor. + - `thrust::mr::polymorphic_allocator`, a C++17-style polymorphic allocator + backed by a type-erased memory resource object. + - New tunable C++17-style caching memory resources, + `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to + cache both small object allocations and large repetitive temporary + allocations. The disjoint variants use separate storage for management of + the pool, which is necessary if the memory being allocated cannot be + accessed on the host (e.g. device memory). + - System-specific allocators were rewritten to use the new memory resource + framework. + - New `thrust::device_memory_resource` for allocating device memory. 
+ - New `thrust::universal_memory_resource` for allocating memory that can be + accessed from both the host and device (e.g. `cudaMallocManaged`). + - New `thrust::universal_host_pinned_memory_resource` for allocating memory + that can be accessed from the host and the device but always resides in + host memory (e.g. `cudaMallocHost`). + - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which + lazily create and retrieve a per-device singleton memory resource. + - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for + `thrust::allocator_traits`. + - `thrust::device_make_unique`, a factory function for creating a + `std::unique_ptr` to a newly allocated object in device memory. + - ``, a C++11 implementation of the C++17 + uninitialized memory algorithms. + - `thrust::allocate_unique` and friends, based on the proposed C++23 + [`std::allocate_unique`](https://wg21.link/P0211). +- New type traits and metaprogramming facilities. Type traits are slowly being + migrated out of `thrust::detail::` and ``; their new home + will be `thrust::` and ``. + - `thrust::is_execution_policy`. + - `thrust::is_operator_less_or_greater_function_object`, which detects + `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`. + - `thrust::is_operator_plus_function_object`, which detects `thrust::plus` + and `std::plus`. + - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's + `std::remove_cvref(_t)?`. + - `thrust::void_t`, and various other new type traits. + - `thrust::integer_sequence` and friends, a C++11 implementation of C++14's + `std::integer_sequence`. + - `thrust::conjunction`, `thrust::disjunction`, and `thrust::negation`, a + C++11 implementation of C++17's logical metafunctions. + - Some Thrust type traits (such as `thrust::is_constructible`) have been + redefined in terms of C++11's type traits when they are available. +- ``, new `std::tuple` algorithms: + - `thrust::tuple_transform`. + - `thrust::tuple_for_each`.
+ - `thrust::tuple_subset`. +- Miscellaneous new `std::`-like facilities: + - `thrust::optional`, a C++11 implementation of C++17's `std::optional`. + - `thrust::addressof`, an implementation of C++11's `std::addressof`. + - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next` + and `std::prev`. + - `thrust::square`, a `` style unary function object that + multiplies its argument by itself. + - `` and `thrust::numeric_limits`, a customized version of + `` and `std::numeric_limits`. +- ``, new general purpose preprocessor facilities: + - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens. + - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion. + - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading. + - `THRUST_PP_BOOL`, boolean conversion. + - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement. + - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument. + - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after + the first. + - `THRUST_PP_IIF`, bitwise conditional. + - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and + detecting comma tokens. + - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary + `__VA_ARGS__`. + - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function. +- New C++11 compatibility macros: + - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best + equivalent otherwise. + - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best + equivalent otherwise. + - `THRUST_OVERRIDE`, expands to `override` when available and the best + equivalent otherwise. + - `THRUST_DEFAULT`, expands to `= default;` when available and the best + equivalent otherwise. + - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best + equivalent otherwise. + - `THRUST_FINAL`, expands to `final` when available and the best equivalent + otherwise. 
+ - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and + the best equivalent otherwise. +- ``, new C++11-only type deduction helpers: + - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable + conditional `noexcept` qualifiers and trailing return types. + - `THRUST_FWD(x)`, expands to `::std::forward(x)`. + - `THRUST_MVCAP`, expands to a lambda move capture. + - `THRUST_RETOF`, expands to a decltype computing the return type of an + invocable. +- New CMake build system. + +### New Examples + +- `mr_basic` demonstrates how to use the new memory resource allocator system. + +### Other Enhancements + +- Tagged pointer enhancements: + - New `thrust::pointer_traits` specialization for `void const*`. + - `nullptr` support to Thrust tagged pointers. + - New `explicit operator bool` for Thrust tagged pointers when using C++11 + for `std::unique_ptr` interoperability. + - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast` + for casting Thrust tagged pointers. +- Iterator enhancements: + - `thrust::iterator_system` is now SFINAE friendly. + - Removed cv qualifiers from iterator types when using + `thrust::iterator_system`. +- Static assert enhancements: + - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be + used as the error message when possible. + - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when + it's available. + - Introduce a way to test for static assertions. +- Testing enhancements: + - Additional scalar and sequence types, including non-builtin types and + vectors with unified memory allocators, have been added to the list of + types used by generic unit tests. + - The generation of random input data has been improved to increase the range + of values used and catch more corner cases. 
+ - New `unittest::truncate_to_max_representable` utility for avoiding the + generation of ranges that cannot be represented by the underlying element + type in generic unit test code. + - The test driver now synchronizes with CUDA devices and checks for errors + after each test, when switching devices, and after each raw kernel launch. + - The `warningtester` uber header is now compiled with NVCC to avoid needing + to disable CUDA-specific code with the preprocessor. + - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s. + - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro. + - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro. + - `thrust::system_error` in the CUDA backend now prints out its `cudaError_t` + enumerator in addition to the diagnostic message. + - Stopped using conditionally signed types like `char`. + +### Bug Fixes + +- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas + with `thrust::reduce` on MSVC. +- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill` + isn't operating on const iterators. +- #919: Fix compilation failure with `thrust::zip_iterator` and + `thrust::complex`. +- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's + `thrust::reduce` to use two functions (one with the pragma for disabling + exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes + a regression with device compilation that started in CUDA Toolkit 9.2. +- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a + `thrust::complex::operator=` to satisfy GoUDA. +- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element + type being default constructible. +- NVBug 2289115: Remove flaky `simple_cuda_streams` example. +- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an + allocator parameter. +- NVBug 2455740: Update the `range_view` example to not use device-side launch.
+- NVBug 2455943: Ensure that sized unit tests that use + `thrust::counting_iterator` perform proper truncation. +- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests. + +## Thrust 1.9.3 (CUDA Toolkit 10.0) + +Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust. + +### Bug Fixes + +- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix + `thrust::device_reference` swapping. +- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and + refactor temporary memory allocation in the CUDA backend to be exception + and leak safe. +- #886, #894, #914: Various documentation typo fixes. +- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC. +- #878: Optimize `thrust::min/max_element` to only use + `thrust::detail::get_iterator_value` for non-numeric types. +- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison + operators `const`. +- NVBug 2092152: Remove all includes of ``. +- #911: Fix default comparator element type for `thrust::merge_by_key`. + +### Acknowledgments + +- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces. +- Thanks to Francisco Facioni for contributing optimizations for + `thrust::min/max_element`. + +## Thrust 1.9.2 (CUDA Toolkit 9.2) + +Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test + improvements. +CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on + small data types and `thrust::reduce`. +Changes were applied to `complex` to optimize memory access. +Thrust now compiles with compiler warnings enabled and treated as errors. +Additionally, the unit test suite and framework was enhanced to increase + coverage. + +### Breaking Changes + +- The `fallback_allocator` example was removed, as it was buggy and difficult + to support. + +### New Features + +- ``, utilities for memory alignment: + - `thrust::aligned_reinterpret_cast`. 
+ - `thrust::aligned_storage_size`, which computes the amount of storage needed + for an object of a particular size and alignment. + - `thrust::alignment_of`, a C++03 implementation of C++11's + `std::alignment_of`. + - `thrust::aligned_storage`, a C++03 implementation of C++11's + `std::aligned_storage`. + - `thrust::max_align_t`, a C++03 implementation of C++11's + `std::max_align_t`. + +### Bug Fixes + +- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug + 2058778: Various compiler warning issues. +- NVBug 200355591: `thrust::reduce` performance issues. +- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be + overlooked but `deallocate` to be called with GCC <= 4.3. +- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`. + +## Thrust 1.9.1-2 (CUDA Toolkit 9.1) + +Thrust 1.9.1-2 integrates version 1.7.4 of CUB and introduces a new CUDA backend + for `thrust::reduce` based on CUB. + +### Bug Fixes + +- NVBug 1965743: Remove unnecessary static qualifiers. +- NVBug 1940974: Fix regression causing a compilation error when using + `thrust::merge_by_key` with `thrust::constant_iterator`s. +- NVBug 1904217: Allow callables that take non-const refs to be used with + `thrust::reduce` and `thrust::*_scan`. + +## Thrust 1.9.0-5 (CUDA Toolkit 9.0) + +Thrust 1.9.0-5 replaces the original CUDA backend (bulk) with a new one + written using CUB, a high performance CUDA collectives library. +This brings a substantial performance improvement to the CUDA backend across + the board. + +### Breaking Changes + +- Any code depending on CUDA backend implementation details will likely be + broken. + +### New Features + +- New CUDA backend based on CUB which delivers substantially higher performance. +- `thrust::transform_output_iterator`, a fancy iterator that applies a function + to the output before storing the result. 
+ +### New Examples + +- `transform_output_iterator` demonstrates use of the new fancy iterator + `thrust::transform_output_iterator`. + +### Other Enhancements + +- When C++11 is enabled, functors do not have to inherit from + `thrust::(unary|binary)_function` anymore to be used with + `thrust::transform_iterator`. +- Added C++11 only move constructors and move assignment operators for + `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`, + `thrust::device_vector`, and friends. + +### Bug Fixes + +- `sin(thrust::complex)` no longer has precision loss to float. + +### Acknowledgments + +- Thanks to Manuel Schiller for contributing a C++11 based enhancement + regarding the deduction of functor return types, improving the performance + of `thrust::unique` and implementing `thrust::transform_output_iterator`. +- Thanks to Thibault Notargiacomo for the implementation of move semantics for + the `thrust::vector_base`-based classes. +- Thanks to Duane Merrill for developing CUB and helping to integrate it into + Thrust's backend. + +## Thrust 1.8.3 (CUDA Toolkit 8.0) + +Thrust 1.8.3 is a small bug fix release. + +### New Examples + +- `range_view` demonstrates the use of a view (a non-owning wrapper for an + iterator range with a container-like interface). + +### Bug Fixes + +- `thrust::(min|max|minmax)_element` can now accept raw device pointers when + an explicit device execution policy is used. +- `thrust::clear` operations on vector types no longer require the element + type to have a default constructor. + +## Thrust 1.8.2 (CUDA Toolkit 7.5) + +Thrust 1.8.2 is a small bug fix release. + +### Bug Fixes + +- Avoid warnings and errors concerning user functions called from + `__host__ __device__` functions. +- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend. +- #651: `thrust::copy` between host and device now accepts execution policies + with streams attached, i.e. `thrust::cuda::par.on(stream)`.
+- #664: `thrust::for_each` and algorithms based on it no longer ignore streams + attached to execution policies. + +### Known Issues + +- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute + Capability 5.0 devices. + +## Thrust 1.8.1 (CUDA Toolkit 7.0) + +Thrust 1.8.1 is a small bug fix release. + +### Bug Fixes + +- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on + large inputs. + +### Known Issues + +- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute + Capability 5.0 devices. + +## Thrust 1.8.0 + +Thrust 1.8.0 introduces support for algorithm invocation from CUDA device + code, support for CUDA streams, and algorithm performance improvements. +Users may now invoke Thrust algorithms from CUDA device code, providing a + parallel algorithms library to CUDA programmers authoring custom kernels, as + well as allowing Thrust programmers to nest their algorithm calls within + functors. +The `thrust::seq` execution policy allows users to require sequential algorithm + execution in the calling thread and makes a sequential algorithms library + available to individual CUDA threads. +The `.on(stream)` syntax allows users to request a CUDA stream for kernels + launched during algorithm execution. +Finally, new CUDA algorithm implementations provide substantial performance + improvements. + +### New Features + +- Algorithms in CUDA Device Code: + - Thrust algorithms may now be invoked from CUDA `__device__` and + `__host__ __device__` functions. + Algorithms invoked in this manner must be invoked with an execution + policy as the first parameter. + The following execution policies are supported in CUDA `__device__` code: + - `thrust::seq` + - `thrust::cuda::par` + - `thrust::device`, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA. + - Device-side algorithm execution may not be parallelized unless CUDA Dynamic + Parallelism is available.
+- Execution Policies: + - CUDA Streams + - The `thrust::cuda::par.on(stream)` syntax allows users to request that + CUDA kernels launched during algorithm execution should occur on a given + stream. + - Algorithms executed with a CUDA stream in this manner may still + synchronize with other streams when allocating temporary storage or + returning results to the CPU. + - `thrust::seq`, which allows users to require that an algorithm execute + sequentially in the calling thread. +- `thrust::complex`, a complex number data type. + +### New Examples + +- simple_cuda_streams demonstrates how to request a CUDA stream during + algorithm execution. +- async_reduce demonstrates ways to achieve algorithm invocations which are + asynchronous with the calling thread. + +### Other Enhancements + +- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for + large problem sizes. +- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes. +- CUDA sort performance for primitive types is 50% faster on Tesla K20c for + large problem sizes. +- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem + sizes. +- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes. +- fallback_allocator example is simpler. + +### Bug Fixes + +- #364: Iterators with unrelated system tags may be used with algorithms invoked + with an execution policy +- #371: Do not redefine `__CUDA_ARCH__`. +- #379: Fix crash when dereferencing transform_iterator on the host. +- #391: Avoid use of uppercase variable names. +- #392: Fix `thrust::copy` between `cusp::complex` and `std::complex`. +- #396: Program compiled with gcc < 4.3 hangs during comparison sort. +- #406: `fallback_allocator.cu` example checks device for unified addressing support. +- #417: Avoid using `std::less` in binary search algorithms. +- #418: Avoid various warnings. +- #443: Including version.h no longer configures default systems. 
+- #578: NVCC produces warnings when sequential algorithms are used with CPU systems. + +### Known Issues + +- When invoked with primitive data types, thrust::sort, thrust::sort_by_key, + thrust::stable_sort, & thrust::stable_sort_by_key may +- Sometimes linking fails when compiling with `-rdc=true` with NVCC. +- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last + element in a segment of equivalent keys instead of the first. + +### Acknowledgments + +- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan + implementations. +- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation. +- Thanks to Filipe Maia for contributing the implementation of thrust::complex. + +## Thrust 1.7.2 (CUDA Toolkit 6.5) + +Thrust 1.7.2 is a minor bug fix release. + +### Bug Fixes + +- Avoid use of `std::min` in generic find implementation. + +## Thrust 1.7.1 (CUDA Toolkit 6.0) + +Thrust 1.7.1 is a minor bug fix release. + +### Bug Fixes + +- Eliminate identifiers in `set_operations.cu` example with leading underscore. +- Eliminate unused variable warning in CUDA `reduce_by_key` implementation. +- Avoid deriving function objects from `std::unary_function` and + `std::binary_function`. + +## Thrust 1.7.0 (CUDA Toolkit 5.5) + +Thrust 1.7.0 introduces a new interface for controlling algorithm execution as + well as several new algorithms and performance improvements. +With this new interface, users may directly control how algorithms execute as + well as details such as the allocation of temporary storage. +Key/value versions of thrust::merge and the set operation algorithms have been + added, as well as stencil versions of partitioning algorithms. +thrust::tabulate has been introduced to tabulate the values of functions taking + integers. +For 32b types, new CUDA merge and set operations provide 2-15x faster + performance while a new CUDA comparison sort provides 1.3-4x faster + performance.
+Finally, a new TBB reduce_by_key implementation provides 80% faster + performance. + +### Breaking Changes + +- Dispatch: + - Custom user backend systems' tag types must now inherit from the + corresponding system's execution_policy template (e.g. + thrust::cuda::execution_policy) instead of the tag struct (e.g. + thrust::cuda::tag). Otherwise, algorithm specializations will silently go + unfound during dispatch. See examples/minimal_custom_backend.cu and + examples/cuda/fallback_allocator.cu for usage examples. + - thrust::advance and thrust::distance are no longer dispatched based on + iterator system type and thus may no longer be customized. +- Iterators: + - iterator_facade and iterator_adaptor's Pointer template parameters have + been eliminated. + - iterator_adaptor has been moved into the thrust namespace (previously + thrust::experimental::iterator_adaptor). + - iterator_facade has been moved into the thrust namespace (previously + thrust::experimental::iterator_facade). + - iterator_core_access has been moved into the thrust namespace (previously + thrust::experimental::iterator_core_access). + - All iterators' nested pointer typedef (the type of the result of + operator->) is now void instead of a pointer type to indicate that such + expressions are currently impossible. + - Floating point counting_iterators' nested difference_type typedef is now a + signed integral type instead of a floating point type. +- Other: + - normal_distribution has been moved into the thrust::random namespace + (previously thrust::random::experimental::normal_distribution). + - Placeholder expressions may no longer include the comma operator. + +### New Features +- Execution Policies: + - Users may directly control the dispatch of algorithm invocations with + optional execution policy arguments. 
+ For example, instead of wrapping raw pointers allocated by cudaMalloc with + thrust::device_ptr, the thrust::device execution_policy may be passed as + an argument to an algorithm invocation to enable CUDA execution. + - The following execution policies are supported in this version: + - `thrust::host` + - `thrust::device` + - `thrust::cpp::par` + - `thrust::cuda::par` + - `thrust::omp::par` + - `thrust::tbb::par` +- Algorithms: + - `thrust::merge_by_key` + - `thrust::partition` with stencil + - `thrust::partition_copy` with stencil + - `thrust::set_difference_by_key` + - `thrust::set_intersection_by_key` + - `thrust::set_symmetric_difference_by_key` + - `thrust::set_union_by_key` + - `thrust::stable_partition with stencil` + - `thrust::stable_partition_copy with stencil` + - `thrust::tabulate` +- Memory Allocation: + - `thrust::malloc` + - `thrust::free` + - `thrust::get_temporary_buffer` + - `thrust::return_temporary_buffer` + +### New Examples + +- uninitialized_vector demonstrates how to use a custom allocator to avoid the + automatic initialization of elements in thrust::device_vector. + +### Other Enhancements + +- Authors of custom backend systems may manipulate arbitrary state during + algorithm dispatch by incorporating it into their execution_policy parameter. +- Users may control the allocation of temporary storage during algorithm + execution by passing standard allocators as parameters via execution policies + such as thrust::device. +- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the + device backend. +- CUDA merge performance is 2-15x faster. +- CUDA comparison sort performance is 1.3-4x faster. +- CUDA set operation performance is 1.5-15x faster. +- TBB reduce_by_key performance is 80% faster. +- Several algorithms have been parallelized with TBB. +- Support for user allocators in vectors has been improved. +- The sparse_vector example is now implemented with merge_by_key instead of + sort_by_key. 
+- Warnings have been eliminated in various contexts. +- Warnings about __host__ or __device__-only functions called from __host__ + __device__ functions have been eliminated in various contexts. +- Documentation about algorithm requirements has been improved. +- Simplified the minimal_custom_backend example. +- Simplified the cuda/custom_temporary_allocation example. +- Simplified the cuda/fallback_allocator example. + +### Bug Fixes + +- #248: Fix broken `thrust::counting_iterator` behavior with OpenMP. +- #231, #209: Fix set operation failures with CUDA. +- #187: Fix incorrect occupancy calculation with CUDA. +- #153: Fix broken multi GPU behavior with CUDA. +- #142: Eliminate warning produced by `thrust::random::taus88` and MSVC 2010. +- #208: Correctly initialize elements in temporary storage when necessary. +- #16: Fix compilation error when sorting bool with CUDA. +- #10: Fix ambiguous overloads of `thrust::reinterpret_tag`. + +### Known Issues + +- GCC 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly + causing infinite recursion in examples such as + cuda/custom_temporary_allocation. + +### Acknowledgments + +- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing + a faster merge implementation for CUDA. +- Thanks to Sean Baxter for contributing a faster set operation implementation + for CUDA. +- Thanks to Cliff Woolley for contributing a correct occupancy calculation + algorithm. + +## Thrust 1.6.0 + +Thrust 1.6.0 provides an interface for customization and extension and a new + backend system based on the Threading Building Blocks library. +With this new interface, programmers may customize the behavior of specific + algorithms as well as control the allocation of temporary storage or invent + entirely new backends. +These enhancements also allow multiple different backend systems + such as CUDA and OpenMP to coexist within a single program. 
+Support for TBB allows Thrust programs to integrate more naturally into + applications which may already employ the TBB task scheduler. + +### Breaking Changes + +- The header `<thrust/experimental/cuda/pinned_allocator.h>` has been moved to + `<thrust/system/cuda/experimental/pinned_allocator.h>` +- thrust::experimental::cuda::pinned_allocator has been moved to + thrust::cuda::experimental::pinned_allocator +- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM +- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA +- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP +- thrust::host_space_tag has been renamed thrust::host_system_tag +- thrust::device_space_tag has been renamed thrust::device_system_tag +- thrust::any_space_tag has been renamed thrust::any_system_tag +- thrust::iterator_space has been renamed thrust::iterator_system + +### New Features + +- Backend Systems + - Threading Building Blocks (TBB) is now supported +- Algorithms + - `thrust::for_each_n` + - `thrust::raw_reference_cast` +- Types + - `thrust::pointer` + - `thrust::reference` + +### New Examples + +- `cuda/custom_temporary_allocation` +- `cuda/fallback_allocator` +- `device_ptr` +- `expand` +- `minimal_custom_backend` +- `raw_reference_cast` +- `set_operations` + +### Other Enhancements + +- `thrust::for_each` now returns the end of the input range similar to most + other algorithms. +- `thrust::pair` and `thrust::tuple` have swap functionality. +- All CUDA algorithms now support large data types. +- Iterators may be dereferenced in user `__device__` or `__global__` functions. +- The safe use of different backend systems is now possible within a single + binary + +### Bug Fixes + +- #469 `min_element` and `max_element` algorithms no longer require a const comparison operator + +### Known Issues + +- NVCC may crash when parsing TBB headers on Windows. + +## Thrust 1.5.3 (CUDA Toolkit 5.0) + +Thrust 1.5.3 is a minor bug fix release. 
+ +### Bug Fixes + +- Avoid warnings about potential race due to `__shared__` non-POD variable + +## Thrust 1.5.2 (CUDA Toolkit 4.2) + +Thrust 1.5.2 is a minor bug fix release. + +### Bug Fixes + +- Fixed warning about C-style initialization of structures + +## Thrust 1.5.1 (CUDA Toolkit 4.1) + +Thrust 1.5.1 is a minor bug fix release. + +### Bug Fixes + +- Sorting data referenced by permutation_iterators on CUDA produces invalid results + +## Thrust 1.5.0 + +Thrust 1.5.0 introduces new programmer productivity and performance + enhancements. +New functionality for creating anonymous "lambda" functions has been added. +A faster host sort provides 2-10x faster performance for sorting arithmetic + types on (single-threaded) CPUs. +A new OpenMP sort provides 2.5x-3.0x speedup over the host sort using a + quad-core CPU. +When sorting arithmetic types with the OpenMP backend the combined performance + improvement is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to + 14.2x (8-bit types). +A new CUDA `reduce_by_key` implementation provides 2-3x faster + performance. + +### Breaking Changes +- `device_ptr<void>` no longer unsafely converts to `device_ptr<T>` without an + explicit cast. + Use the expression `device_pointer_cast(static_cast<int*>(void_ptr.get()))` to + convert, for example, `device_ptr<void>` to `device_ptr<int>`. + +### New Features + +- Algorithms: + - Stencil-less `thrust::transform_if`. 
+- Lambda placeholders + +### New Examples +- lambda + +### Other Enhancements + +- Host sort is 2-10x faster for arithmetic types +- OMP sort provides speedup over host sort +- `reduce_by_key` is 2-3x faster +- `reduce_by_key` no longer requires O(N) temporary storage +- CUDA scan algorithms are 10-40% faster +- `host_vector` and `device_vector` are now documented +- out-of-memory exceptions now provide detailed information from CUDART +- improved histogram example +- `device_reference` now has a specialized swap +- `reduce_by_key` and scan algorithms are compatible with `discard_iterator` + +### Bug Fixes + +- #44: Allow `thrust::host_vector` to compile when `value_type` uses + `__align__`. +- #198: Allow `thrust::adjacent_difference` to permit safe in-situ operation. +- #303: Make thrust thread-safe. +- #313: Avoid race conditions in `thrust::device_vector::insert`. +- #314: Avoid unintended ADL invocation when dispatching copy. +- #365: Fix merge and set operation failures. + +### Known Issues + +- None + +### Acknowledgments + +- Thanks to Manjunath Kudlur for contributing his Carbon library, from which + the lambda functionality is derived. +- Thanks to Jean-Francois Bastien for suggesting a fix for #303. + +## Thrust 1.4.0 (CUDA Toolkit 4.0) + +Thrust 1.4.0 is the first release of Thrust to be included in the CUDA Toolkit. +Additionally, it brings many feature and performance improvements. +New set theoretic algorithms operating on sorted sequences have been added. +Additionally, a new fancy iterator allows discarding redundant or otherwise + unnecessary output from algorithms, conserving memory storage and bandwidth. 
+ +### Breaking Changes + +- Eliminations + - `thrust/is_sorted.h` + - `thrust/utility.h` + - `thrust/set_intersection.h` + - `thrust/experimental/cuda/ogl_interop_allocator.h` and the functionality + therein + - `thrust::deprecated::copy_when` + - `thrust::deprecated::absolute_value` + - `thrust::deprecated::copy_when` + - `thrust::deprecated::absolute_value` + - `thrust::deprecated::copy_when` + - `thrust::deprecated::absolute_value` + - `thrust::gather` and `thrust::scatter` from host to device and vice versa + are no longer supported. + - Operations which modify the elements of a thrust::device_vector are no longer + available from source code compiled without nvcc when the device backend + is CUDA. + Instead, use the idiom from the cpp_interop example. + +### New Features + +- Algorithms: + - `thrust::copy_n` + - `thrust::merge` + - `thrust::set_difference` + - `thrust::set_symmetric_difference` + - `thrust::set_union` + +- Types + - `thrust::discard_iterator` + +- Device Support: + - Compute Capability 2.1 GPUs. + +### New Examples + +- run_length_decoding + +### Other Enhancements + +- Compilation warnings are substantially reduced in various contexts. +- The compilation time of thrust::sort, thrust::stable_sort, + thrust::sort_by_key, and thrust::stable_sort_by_key are substantially + reduced. +- A fast sort implementation is used when sorting primitive types with + thrust::greater. +- The performance of thrust::set_intersection is improved. +- The performance of thrust::fill is improved on SM 1.x devices. +- A code example is now provided in each algorithm's documentation. +- thrust::reverse now operates in-place + +### Bug Fixes + +- #212: `thrust::set_intersection` works correctly for large input sizes. +- #275: `thrust::counting_iterator` and `thrust::constant_iterator` work + correctly with OpenMP as the backend when compiling with optimization. 
+- #256: `min` and `max` correctly return their first argument as a tie-breaker +- #248: `NDEBUG` is interpreted incorrectly + +### Known Issues + +- NVCC may generate code containing warnings when compiling some Thrust + algorithms. +- When compiling with `-arch=sm_1x`, some Thrust algorithms may cause NVCC to + issue benign pointer advisories. +- When compiling with `-arch=sm_1x` and -G, some Thrust algorithms may fail to + execute correctly. +- `thrust::inclusive_scan`, `thrust::exclusive_scan`, + `thrust::inclusive_scan_by_key`, and `thrust::exclusive_scan_by_key` are + currently incompatible with `thrust::discard_iterator`. + +### Acknowledgments + +- Thanks to David Tarjan for improving the performance of set_intersection. +- Thanks to Duane Merrill for continued help with sort. +- Thanks to Nathan Whitehead for help with CUDA Toolkit integration. + +## Thrust 1.3.0 + +Thrust 1.3.0 provides support for CUDA Toolkit 3.2 in addition to many feature + and performance enhancements. +Performance of the sort and sort_by_key algorithms is improved by as much as 3x + in certain situations. +The performance of stream compaction algorithms, such as copy_if, is improved + by as much as 2x. +CUDA errors are now converted to runtime exceptions using the system_error + interface. +Combined with a debug mode, also new in 1.3, runtime errors can be located with + greater precision. +Lastly, a few header files have been consolidated or renamed for clarity. +See the deprecations section below for additional details. 
+ +### Breaking Changes + +- Promotions + - thrust::experimental::inclusive_segmented_scan has been renamed + thrust::inclusive_scan_by_key and exposes a different interface + - thrust::experimental::exclusive_segmented_scan has been renamed + thrust::exclusive_scan_by_key and exposes a different interface + - thrust::experimental::partition_copy has been renamed + thrust::partition_copy and exposes a different interface + - thrust::next::gather has been renamed thrust::gather + - thrust::next::gather_if has been renamed thrust::gather_if + - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy +- Deprecations + - thrust::copy_when has been renamed thrust::deprecated::copy_when + - thrust::absolute_value has been renamed thrust::deprecated::absolute_value + - The header thrust/set_intersection.h is now deprecated; use + thrust/set_operations.h instead + - The header thrust/utility.h is now deprecated; use thrust/swap.h instead + - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead +- Eliminations + - thrust::deprecated::gather + - thrust::deprecated::gather_if + - thrust/experimental/arch.h and the functions therein + - thrust/sorting/merge_sort.h + - thrust/sorting/radix_sort.h +- NVCC 2.3 is no longer supported + +### New Features + +- Algorithms: + - `thrust::exclusive_scan_by_key` + - `thrust::find` + - `thrust::find_if` + - `thrust::find_if_not` + - `thrust::inclusive_scan_by_key` + - `thrust::is_partitioned` + - `thrust::is_sorted_until` + - `thrust::mismatch` + - `thrust::partition_point` + - `thrust::reverse` + - `thrust::reverse_copy` + - `thrust::stable_partition_copy` + +- Types: + - `thrust::system_error` and related types. + - `thrust::experimental::cuda::ogl_interop_allocator`. + - `thrust::bit_and`, `thrust::bit_or`, and `thrust::bit_xor`. + +- Device Support: + - GF104-based GPUs. 
+ +### New Examples + +- opengl_interop.cu +- repeated_range.cu +- simple_moving_average.cu +- sparse_vector.cu +- strided_range.cu + +### Other Enhancements + +- Performance of thrust::sort and thrust::sort_by_key is substantially improved + for primitive key types +- Performance of thrust::copy_if is substantially improved +- Performance of thrust::reduce and related reductions is improved +- THRUST_DEBUG mode added +- Callers of Thrust functions may detect error conditions by catching + thrust::system_error, which derives from std::runtime_error +- The number of compiler warnings generated by Thrust has been substantially + reduced +- Comparison sort now works correctly for input sizes > 32M +- min & max usage no longer collides with `<windows.h>` definitions +- Compiling against the OpenMP backend no longer requires nvcc +- Performance of device_vector initialized in .cpp files is substantially + improved in common cases +- Performance of thrust::sort_by_key on the host is substantially improved + +### Bug Fixes + +- Debug device code now compiles correctly +- thrust::uninitialized_copy and thrust::uninitialized_fill now dispatch + constructors on the device rather than the host + +### Known Issues + +- #212 set_intersection is known to fail for large input sizes +- partition_point is known to fail for 64b types with nvcc 3.2 + +### Acknowledgments +- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation +- Thanks to Erich Elsen for contributing an implementation of find_if +- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP + backend to compile in the absence of nvcc +- Thanks to Andrew Corrigan, Cliff Woolley, David Coeurjolly, Janick Martinez + Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for + bug reports +- Thanks to Cliff Woolley for help with testing + +## Thrust 1.2.1 + +Thrust 1.2.1 is a small bug fix release that is compatible with the CUDA + Toolkit 3.1 release. 
+ +### Known Issues + +- `thrust::inclusive_scan` and `thrust::exclusive_scan` may fail with very + large types. +- MSVC may fail to compile code using both sort and binary search algorithms. +- `thrust::uninitialized_fill` and `thrust::uninitialized_copy` dispatch + constructors on the host rather than the device. +- #109: Some algorithms may exhibit poor performance with the OpenMP backend + with large numbers (>= 6) of CPU threads. +- `thrust::default_random_engine::discard` is not accelerated with NVCC 2.3 +- NVCC 3.1 may fail to compile code using types derived from + `thrust::subtract_with_carry_engine`, such as `thrust::ranlux24` and + `thrust::ranlux48`. + +## Thrust 1.2.0 + +Thrust 1.2.0 introduces support for compilation to multicore CPUs and the Ocelot + virtual machine, and several new facilities for pseudo-random number + generation. +New algorithms such as set intersection and segmented reduction have also been + added. +Lastly, improvements to the robustness of the CUDA backend ensure correctness + across a broad set of (uncommon) use cases. + +### Breaking Changes + +- `thrust::gather`'s interface was incorrect and has been removed. + The old interface is deprecated but will be preserved for Thrust version 1.2 + at `thrust::deprecated::gather` and `thrust::deprecated::gather_if`. + The new interface is provided at `thrust::next::gather` and + `thrust::next::gather_if`. + The new interface will be promoted to `thrust::` in Thrust version 1.3. + For more details, please refer to [this thread](http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd). +- The `thrust::sorting` namespace has been deprecated in favor of the top-level + sorting functions, such as `thrust::sort` and `thrust::sort_by_key`. +- Removed support for `thrust::equal` between host & device sequences. +- Removed support for `thrust::scatter` between host & device sequences. 
+ +### New Features + +- Algorithms: + - `thrust::reduce_by_key` + - `thrust::set_intersection` + - `thrust::unique_copy` + - `thrust::unique_by_key` + - `thrust::unique_copy_by_key` +- Types +- Random Number Generation: + - `thrust::discard_block_engine` + - `thrust::default_random_engine` + - `thrust::linear_congruential_engine` + - `thrust::linear_feedback_shift_engine` + - `thrust::subtract_with_carry_engine` + - `thrust::xor_combine_engine` + - `thrust::minstd_rand` + - `thrust::minstd_rand0` + - `thrust::ranlux24` + - `thrust::ranlux48` + - `thrust::ranlux24_base` + - `thrust::ranlux48_base` + - `thrust::taus88` + - `thrust::uniform_int_distribution` + - `thrust::uniform_real_distribution` + - `thrust::normal_distribution` (experimental) +- Function Objects: + - `thrust::project1st` + - `thrust::project2nd` +- `thrust::tie` +- Fancy Iterators: + - `thrust::permutation_iterator` + - `thrust::reverse_iterator` +- Vector Functions: + - `operator!=` + - `rbegin` + - `crbegin` + - `rend` + - `crend` + - `data` + - `shrink_to_fit` +- Device Support: + - Multicore CPUs via OpenMP. + - Fermi-class GPUs. + - Ocelot virtual machines. +- Support for NVCC 3.0. + +### New Examples + +- `cpp_integration` +- `histogram` +- `mode` +- `monte_carlo` +- `monte_carlo_disjoint_sequences` +- `padded_grid_reduction` +- `permutation_iterator` +- `row_sum` +- `run_length_encoding` +- `segmented_scan` +- `stream_compaction` +- `summary_statistics` +- `transform_iterator` +- `word_count` + +### Other Enhancements + +- Integer sorting performance is improved when max is large but (max - min) is + small and when min is negative +- Performance of `thrust::inclusive_scan` and `thrust::exclusive_scan` is + improved by 20-25% for primitive types. 
+ +### Bug Fixes + +- #8 cause a compiler error if the required compiler is not found rather than a + mysterious error at link time +- #42 device_ptr & device_reference are classes rather than structs, + eliminating warnings on certain platforms +- #46 gather & scatter handle any space iterators correctly +- #51 thrust::experimental::arch functions gracefully handle unrecognized GPUs +- #52 avoid collisions with common user macros such as BLOCK_SIZE +- #62 provide better documentation for device_reference +- #68 allow built-in CUDA vector types to work with device_vector in pure C++ + mode +- #102 eliminated a race condition in device_vector::erase +- various compilation warnings eliminated + +### Known Issues + +- inclusive_scan & exclusive_scan may fail with very large types +- MSVC may fail to compile code using both sort and binary search algorithms +- uninitialized_fill & uninitialized_copy dispatch constructors on the host + rather than the device +- #109 some algorithms may exhibit poor performance with the OpenMP backend + with large numbers (>= 6) of CPU threads +- default_random_engine::discard is not accelerated with nvcc 2.3 + +### Acknowledgments + +- Thanks to Gregory Diamos for contributing a CUDA implementation of + set_intersection +- Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit + tests and examples against Ocelot +- Thanks to Tom Bradley for contributing an implementation of normal_distribution +- Thanks to Joseph Rhoads for contributing the example summary_statistics + +## Thrust 1.1.1 + +Thrust 1.1.1 is a small bug fix release that is compatible with the CUDA + Toolkit 2.3a release and Mac OSX Snow Leopard. + +## Thrust 1.1.0 + +Thrust 1.1.0 introduces fancy iterators, binary search functions, and several + specialized reduction functions. +Experimental support for segmented scans has also been added. 
+ +### Breaking Changes + +- `thrust::counting_iterator` has been moved into the `thrust` namespace + (previously `thrust::experimental`). + +### New Features + +- Algorithms: + - `thrust::copy_if` + - `thrust::lower_bound` + - `thrust::upper_bound` + - `thrust::vectorized lower_bound` + - `thrust::vectorized upper_bound` + - `thrust::equal_range` + - `thrust::binary_search` + - `thrust::vectorized binary_search` + - `thrust::all_of` + - `thrust::any_of` + - `thrust::none_of` + - `thrust::minmax_element` + - `thrust::advance` + - `thrust::inclusive_segmented_scan` (experimental) + - `thrust::exclusive_segmented_scan` (experimental) +- Types: + - `thrust::pair` + - `thrust::tuple` + - `thrust::device_malloc_allocator` +- Fancy Iterators: + - `thrust::constant_iterator` + - `thrust::counting_iterator` + - `thrust::transform_iterator` + - `thrust::zip_iterator` + +### New Examples + +- Computing the maximum absolute difference between vectors. +- Computing the bounding box of a two-dimensional point set. +- Sorting multiple arrays together (lexicographical sorting). +- Constructing a summed area table. +- Using `thrust::zip_iterator` to mimic an array of structs. +- Using `thrust::constant_iterator` to increment array values. + +### Other Enhancements + +- Added pinned memory allocator (experimental). +- Added more methods to host_vector & device_vector (issue #4). +- Added variant of remove_if with a stencil argument (issue #29). +- Scan and reduce use cudaFuncGetAttributes to determine grid size. +- Exceptions are reported when temporary device arrays cannot be allocated. 
+ +### Bug Fixes + +- #5: Make vector work for larger data types +- #9: stable_partition_copy doesn't respect OutputIterator concept semantics +- #10: scans should return OutputIterator +- #16: make algorithms work for larger data types +- #27: Dispatch radix_sort even when comp=less is explicitly provided + +### Known Issues + +- Using functors with Thrust entry points may not compile on Mac OSX with gcc + 4.0.1. +- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch + constructors on the host rather than the device. +- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`, + `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when + used with large types with the CUDA Toolkit 3.1. + +## Thrust 1.0.0 + +First production release of Thrust. + +### Breaking Changes + +- Rename top level namespace `komrade` to `thrust`. +- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into + `thrust::experimental` namespace until we can easily provide the standard + interface. +- Rename `thrust::range` to `thrust::sequence` to avoid collision with + Boost.Range. +- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences + with C++0x `std::copy_if`. + +### New Features + +- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and + `thrust::device_vector`. +- Add `thrust::transform_if` function. +- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`. +- Allow `counting_iterator` to work with `thrust::for_each`. +- Allow types with constructors in comparison `thrust::sort` and + `thrust::reduce`. + +### Other Enhancements + +- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster + when executed on the parallel device. + +### Bug Fixes + +- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to + crash. +- Komrade 7: Fix an issue where `const_iterator`s could not be passed to + `thrust::transform`. 
+ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a4eca47a..967ebf53a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,98 @@ -cmake_minimum_required(VERSION 3.8) - -project(Thrust CXX) - -set(THRUST_SOURCE ${CMAKE_SOURCE_DIR}) -include(cmake/common_variables.cmake) - +# 3.15 is the minimum for including the project with add_subdirectory. +# 3.17 for building the project's standalone tests/examples/etc. +# 3.18.3 for C++17 + CUDA +cmake_minimum_required(VERSION 3.15) + +# Remove this when we use the new CUDA_ARCHITECTURES properties with both +# nvcc and nvc++. +if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + cmake_policy(SET CMP0104 OLD) +endif() + +project(Thrust NONE) + +# Determine whether Thrust is the top-level project or included into +# another project via add_subdirectory() +if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") + set(THRUST_TOPLEVEL_PROJECT ON) +else() + set(THRUST_TOPLEVEL_PROJECT OFF) +endif() + +## thrust_fix_clang_nvcc_build_for +# +# Modifies the given target to include a fix for the clang host compiler case. +# The fix consists of force-including a header into each compilation unit. +# +function(thrust_fix_clang_nvcc_build_for target) + if (UNIX) + # Path to the header containing the fix for clang + nvcc < 11.6. For more info, + # check the content of this header. + set(clang_fix_header_path "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/testing/fix_clang_nvcc_11.5.h") + + # Only affects host compiler + target_compile_options(${target} PRIVATE + "$<$:-include${clang_fix_header_path}>") + endif() +endfunction() + +# This must be done before any languages are enabled: +if (THRUST_TOPLEVEL_PROJECT) + include(cmake/ThrustCompilerHacks.cmake) +endif() + +# This must appear after our Compiler Hacks or else CMake will delete the cache +# and reconfigure from scratch. +# This must also appear before the installation rules, as it is required by the +# GNUInstallDirs CMake module. 
+enable_language(CXX) + +# Optionally include installation rules for non-top-level builds: +option(THRUST_ENABLE_INSTALL_RULES "Enable installation of Thrust" ${THRUST_TOPLEVEL_PROJECT}) +if (THRUST_ENABLE_INSTALL_RULES) + include(cmake/ThrustInstallRules.cmake) +endif() + +# Support adding Thrust to a parent project via add_subdirectory. +# See examples/cmake/add_subdir/CMakeLists.txt for details. +if (NOT THRUST_TOPLEVEL_PROJECT) + include(cmake/ThrustAddSubdir.cmake) + return() +endif() + +# We use 3.17 features when building our tests, etc. +cmake_minimum_required(VERSION 3.17) + +option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON") +option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON") +option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON") +option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "OFF") +option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF") + +# Mark this option as advanced for now. We'll revisit this later once the new +# benchmarks are ready. For now, we just need to expose a way to compile +# bench.cu from CMake for NVIDIA's internal builds. +mark_as_advanced(THRUST_ENABLE_BENCHMARKS) + +# Check if we're actually building anything before continuing. If not, no need +# to search for deps, etc. This is a common approach for packagers that just +# need the install rules. See GH issue NVIDIA/thrust#1211. +if (NOT (THRUST_ENABLE_HEADER_TESTING OR + THRUST_ENABLE_TESTING OR + THRUST_ENABLE_EXAMPLES OR + THRUST_ENABLE_BENCHMARKS OR + THRUST_INCLUDE_CUB_CMAKE)) + return() +endif() + +include(cmake/AppendOptionIfAvailable.cmake) +include(cmake/ThrustBuildCompilerTargets.cmake) +include(cmake/ThrustBuildTargetList.cmake) +include(cmake/ThrustFindThrust.cmake) +include(cmake/ThrustMultiConfig.cmake) +include(cmake/ThrustUtilities.cmake) + +# Add cache string options for CMAKE_BUILD_TYPE and default to RelWithDebInfo. 
if ("" STREQUAL "${CMAKE_BUILD_TYPE}") set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE) @@ -14,644 +102,54 @@ if ("" STREQUAL "${CMAKE_BUILD_TYPE}") ) endif () -if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) - set(CMAKE_CONFIGURE_DEPENDS CONFIGURE_DEPENDS) -endif () - -list(INSERT CMAKE_MODULE_PATH 0 ${PROJECT_SOURCE_DIR}/cmake) -include(AppendOptionIfAvailable) - -file(READ "thrust/version.h" THRUST_VERSION_HEADER) -string(REGEX MATCH "THRUST_VERSION ([0-9]+)" DUMMY ${THRUST_VERSION_HEADER}) -set(THRUST_VERSION ${CMAKE_MATCH_1}) -math(EXPR THRUST_VERSION_MAJOR "(${THRUST_VERSION} / 100000)") -math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION} / 100) % 1000") -math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION} % 100") -set( - THRUST_VERSION_STR - "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}" -) -message(STATUS "Thrust Version: ${THRUST_VERSION_STR}") - -set(THRUST_HOST_SYSTEM_OPTIONS CPP OMP TBB) -set(THRUST_HOST_SYSTEM CPP CACHE STRING "The device backend to target.") -set_property( - CACHE THRUST_HOST_SYSTEM - PROPERTY STRINGS ${THRUST_HOST_SYSTEM_OPTIONS} -) -if (NOT THRUST_HOST_SYSTEM IN_LIST THRUST_HOST_SYSTEM_OPTIONS) - message( - FATAL_ERROR - "THRUST_HOST_SYSTEM must be one of ${THRUST_HOST_SYSTEM_OPTIONS}" - ) -endif () - -add_definitions(-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_${THRUST_HOST_SYSTEM}) - -set(THRUST_DEVICE_SYSTEM_OPTIONS CUDA CPP OMP TBB) -set(THRUST_DEVICE_SYSTEM CUDA CACHE STRING "The device backend to target.") -set_property( - CACHE THRUST_DEVICE_SYSTEM - PROPERTY STRINGS ${THRUST_DEVICE_SYSTEM_OPTIONS} -) -if (NOT THRUST_DEVICE_SYSTEM IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS) - message( - FATAL_ERROR - "THRUST_DEVICE_SYSTEM must be one of ${THRUST_DEVICE_SYSTEM_OPTIONS}" - ) -endif () - -add_definitions(-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${THRUST_DEVICE_SYSTEM}) - -# Please note this also sets the default for the CUDA C++ version; see the comment below. 
-set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.") +# Disable compiler extensions: set(CMAKE_CXX_EXTENSIONS OFF) -message("-- C++ Standard version: ${CMAKE_CXX_STANDARD}") - -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "") - unset(CMAKE_CUDA_HOST_COMPILER CACHE) - message(FATAL_ERROR "Thrust tests and examples require the C++ compiler" - " and the CUDA host compiler to be the same; to set this compiler, please" - " use the CMAKE_CXX_COMPILER variable, not the CMAKE_CUDA_HOST_COMPILER" - " variable.") - endif () - set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - - enable_language(CUDA) - - # Force CUDA C++ standard to be the same as the C++ standard used. - # - # Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597 - # which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11 - # versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03). - # In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly. - if (DEFINED CMAKE_CUDA_STANDARD) - message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD" - " is used as the C++ standard version for both C++ and CUDA.") - endif() - unset(CMAKE_CUDA_STANDARD CACHE) - set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) - - set(THRUST_HIGHEST_COMPUTE_ARCH 75) - set(THRUST_KNOWN_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72 75) - - option(THRUST_DISABLE_ARCH_BY_DEFAULT "If ON, then all CUDA architectures are disabled on the initial CMake run." 
OFF) - set(OPTION_INIT ON) - if (THRUST_DISABLE_ARCH_BY_DEFAULT) - set(OPTION_INIT OFF) - endif () - - if (NOT ${THRUST_HIGHEST_COMPUTE_ARCH} IN_LIST THRUST_KNOWN_COMPUTE_ARCHS) - message(FATAL_ERROR "When changing the highest compute version, don't forget to add it to the list!") - endif () - - foreach (COMPUTE_ARCH IN LISTS THRUST_KNOWN_COMPUTE_ARCHS) - option(THRUST_ENABLE_COMPUTE_${COMPUTE_ARCH} "Enable code generation for tests for sm_${COMPUTE_ARCH}" ${OPTION_INIT}) - if (THRUST_ENABLE_COMPUTE_${COMPUTE_ARCH}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}") - set(COMPUTE_MESSAGE "${COMPUTE_MESSAGE} sm_${COMPUTE_ARCH}") - endif () - endforeach () - - option(THRUST_ENABLE_COMPUTE_FUTURE "Enable code generation for tests for compute_${THRUST_HIGHEST_COMPUTE_ARCH}" ${OPTION_INIT}) - if (THRUST_ENABLE_COMPUTE_FUTURE) - set(CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${THRUST_HIGHEST_COMPUTE_ARCH},code=compute_${THRUST_HIGHEST_COMPUTE_ARCH}") - set(COMPUTE_MESSAGE "${COMPUTE_MESSAGE} compute_${THRUST_HIGHEST_COMPUTE_ARCH}") - endif () - - message("-- Enabled CUDA architectures:${COMPUTE_MESSAGE}") -endif () - -if ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}") - find_package(OpenMP REQUIRED) - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - endif() -endif () - -if ("TBB" STREQUAL "${THRUST_DEVICE_SYSTEM}") - find_package(PkgConfig REQUIRED) - pkg_check_modules(TBB tbb REQUIRED) - if (TBB_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TDD_CFLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TDD_CFLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${TBB_LD_FLAGS}") - set (THRUST_ADDITIONAL_LIBRARIES "${TBB_LIBRARIES}") - endif () - - # There's a ton of these in the TBB backend, even though the code 
is correct. - # TODO: silence these warnings in code instead - append_option_if_available("-Wno-unused-parameter" THRUST_CXX_WARNINGS) -endif () - -if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1900) - message(FATAL_ERROR "This version of MSVC no longer supported.") - endif () -endif () - -if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4) - message(FATAL_ERROR "This version of GCC no longer supported.") - endif () -endif () - -if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - # TODO Enable /Wall - append_option_if_available("/WX" THRUST_CXX_WARNINGS) - - # Disabled loss-of-data conversion warnings. - # TODO Re-enable. - append_option_if_available("/wd4244" THRUST_CXX_WARNINGS) - append_option_if_available("/wd4267" THRUST_CXX_WARNINGS) - - # Suppress numeric conversion-to-bool warnings. - # TODO Re-enable. - append_option_if_available("/wd4800" THRUST_CXX_WARNINGS) - - # Disable warning about applying unary operator- to unsigned type. 
- append_option_if_available("/wd4146" THRUST_CXX_WARNINGS) - - set(THRUST_TREAT_FILE_AS_CXX "/TP") -else () - append_option_if_available("-Werror" THRUST_CXX_WARNINGS) - append_option_if_available("-Wall" THRUST_CXX_WARNINGS) - append_option_if_available("-Wextra" THRUST_CXX_WARNINGS) - append_option_if_available("-Winit-self" THRUST_CXX_WARNINGS) - append_option_if_available("-Woverloaded-virtual" THRUST_CXX_WARNINGS) - append_option_if_available("-Wcast-qual" THRUST_CXX_WARNINGS) - append_option_if_available("-Wno-cast-align" THRUST_CXX_WARNINGS) - append_option_if_available("-Wno-long-long" THRUST_CXX_WARNINGS) - append_option_if_available("-Wno-variadic-macros" THRUST_CXX_WARNINGS) - append_option_if_available("-Wno-unused-function" THRUST_CXX_WARNINGS) - append_option_if_available("-Wno-unused-variable" THRUST_CXX_WARNINGS) - - set(THRUST_TREAT_FILE_AS_CXX "-x c++") -endif () - -if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.5) - # In GCC 4.4, the CUDA backend's kernel launch templates cause - # impossible-to-decipher "'' is used uninitialized in this - # function" warnings, so we disable uninitialized variable warnings. - append_option_if_available("-Wno-uninitialized" THRUST_CXX_WARNINGS) - endif () - - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.5) - # This isn't available until GCC 4.3, and misfires on TMP code until - # GCC 4.5. - append_option_if_available("-Wlogical-op" THRUST_CXX_WARNINGS) - endif () - - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3) - # GCC 7.3 complains about name mangling changes due to `noexcept` - # becoming part of the type system; we don't care. - append_option_if_available("-Wno-noexcept-type" THRUST_CXX_WARNINGS) - endif () - - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1 AND CMAKE_CXX_STANDARD EQUAL 98) - # thrust::complex can't really be made trivially copyable in pre-11. 
- # Disable a warning about a non-trivially-copyable type being memmoved that was added to GCC 8. - append_option_if_available("-Wno-class-memaccess" THRUST_CXX_WARNINGS) - endif () -endif () - -if (("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR - ("XL" STREQUAL "${CMAKE_CXX_COMPILER_ID}")) - # xlC and Clang warn about unused parameters in uninstantiated templates. - # This causes xlC to choke on the OMP backend, which is mostly #ifdef'd out - # (and thus has unused parameters) when you aren't using it. - append_option_if_available("-Wno-unused-parameters" THRUST_CXX_WARNINGS) -endif () - -if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - # -Wunneeded-internal-declaration misfires in the unit test framework - # on older versions of Clang. - append_option_if_available("-Wno-unneeded-internal-declaration" THRUST_CXX_WARNINGS) -endif () - -foreach (CXX_OPTION IN LISTS THRUST_CXX_WARNINGS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_OPTION}") -endforeach () - -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - foreach (CXX_OPTION IN LISTS THRUST_CXX_WARNINGS) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${CXX_OPTION}") - endforeach () -endif () - -# For every public header, build a translation unit containing `#include
` -# to let the compiler try to figure out warnings in that header if it is not otherwise -# included in tests, and also to verify if the headers are modular enough. -# .inl files are not globbed for, because they are not supposed to be used as public -# entrypoints. -list(APPEND THRUST_HEADER_GLOBS thrust/*.h) -list(APPEND THRUST_HEADER_EXCLUDE_SYSTEMS_GLOBS thrust/system/*/*) - -string(TOLOWER ${THRUST_HOST_SYSTEM} THRUST_HOST_SYSTEM_LOWERCASE) -list(APPEND THRUST_HEADER_SYSTEMS_GLOBS thrust/system/${THRUST_HOST_SYSTEM_LOWERCASE}/*) - -string(TOLOWER ${THRUST_DEVICE_SYSTEM} THRUST_DEVICE_SYSTEM_LOWERCASE) -list(APPEND THRUST_HEADER_SYSTEMS_GLOBS thrust/system/${THRUST_DEVICE_SYSTEM_LOWERCASE}/*) - -list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/detail/*) -list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/*/detail/*) -list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/*/*/detail/*) - -# Get all .h files... -file( - GLOB_RECURSE THRUST_HEADERS - RELATIVE ${PROJECT_SOURCE_DIR}/thrust - ${CMAKE_CONFIGURE_DEPENDS} - ${THRUST_HEADER_GLOBS} -) - -# ...then remove all system specific headers... -file( - GLOB_RECURSE THRUST_HEADER_EXCLUDE_SYSTEMS - RELATIVE ${PROJECT_SOURCE_DIR}/thrust - ${CMAKE_CONFIGURE_DEPENDS} - ${THRUST_HEADER_EXCLUDE_SYSTEMS_GLOBS} -) -list(REMOVE_ITEM THRUST_HEADERS ${THRUST_HEADER_EXCLUDE_SYSTEMS}) - -# ...then add all headers specific to the selected host and device systems back again... -file( - GLOB_RECURSE THRUST_SYSTEMS_HEADERS - RELATIVE ${PROJECT_SOURCE_DIR}/thrust - ${CMAKE_CONFIGURE_DEPENDS} - ${THRUST_HEADER_SYSTEMS_GLOBS} -) -list(APPEND THRUST_HEADERS ${THRUST_SYSTEMS_HEADERS}) - -# ...and remove all the detail headers (also removing the detail headers from the selected systems). 
-file( - GLOB_RECURSE THRUST_HEADER_EXCLUDE_DETAILS - RELATIVE ${PROJECT_SOURCE_DIR}/thrust - ${CMAKE_CONFIGURE_DEPENDS} - ${THRUST_HEADER_EXCLUDE_DETAILS_GLOBS} -) -list(REMOVE_ITEM THRUST_HEADERS ${THRUST_HEADER_EXCLUDE_DETAILS}) - -# List of headers that aren't implemented for all backends, but are implemented for CUDA. -set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CUDA - async/copy.h - async/for_each.h - async/reduce.h - async/sort.h - async/transform.h - event.h - future.h -) - -# List of headers that aren't implemented for all backends, but are implemented for CPP. -set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CPP -) - -# List of headers that aren't implemented for all backends, but are implemented for TBB. -set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_TBB -) - -# List of headers that aren't implemented for all backends, but are implemented for OMP. -set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_OMP -) - -# List of all partially implemented headers. -set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS - emptylistguard - ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CUDA} - ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CPP} - ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_TBB} - ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_OMP} -) - -list(REMOVE_DUPLICATES THRUST_PARTIALLY_IMPLEMENTED_HEADERS) - -foreach (THRUST_HEADER IN LISTS THRUST_HEADERS) - if ("${THRUST_HEADER}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_HEADERS) - # This header is partially implemented on _some_ backends... - if (NOT "${THRUST_HEADER}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_HEADERS_${THRUST_DEVICE_SYSTEM}) - # ...but not on the selected one. 
- continue() - endif () - endif () - - set(THRUST_HEADER_TEST_EXT .cpp) - if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - set(THRUST_HEADER_TEST_EXT .cu) - endif () - - set(SOURCE_NAME headers/${THRUST_HEADER}${THRUST_HEADER_TEST_EXT}) - configure_file(cmake/header_test.in ${SOURCE_NAME}) - - list(APPEND THRUST_HEADER_TEST_SOURCES ${SOURCE_NAME}) -endforeach () - -add_library(header-test OBJECT ${THRUST_HEADER_TEST_SOURCES}) -target_include_directories( - header-test - PUBLIC ${PROJECT_SOURCE_DIR} -) - -include(CTest) -enable_testing() - -# Handle tests. - -option(THRUST_ENABLE_TESTS_WITH_RDC "Also build all tests with RDC." OFF) - -set(THRUST_TEST_RUN_ARGUMENTS - -DTHRUST_SOURCE=${CMAKE_SOURCE_DIR} - -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake") - -list(APPEND THRUST_TESTFRAMEWORK_FILES testing/unittest/testframework.cu) -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - list(APPEND THRUST_TESTFRAMEWORK_FILES testing/unittest/cuda/testframework.cu) -else () - # When CUDA is disabled, explain to CMake that testframework.cu is actually a C++ file. - set_source_files_properties(testing/unittest/testframework.cu - PROPERTIES - LANGUAGE CXX - COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}") -endif () - -add_library(thrust_testframework STATIC ${THRUST_TESTFRAMEWORK_FILES}) -target_include_directories( - thrust_testframework - PUBLIC ${PROJECT_SOURCE_DIR} - PRIVATE ${PROJECT_SOURCE_DIR}/testing -) - -list(APPEND THRUST_TEST_GLOBS testing/*.cu) -list(APPEND THRUST_TEST_GLOBS testing/*.cpp) - -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - list(APPEND THRUST_TEST_GLOBS testing/cuda/*.cu) -elseif ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}") - list(APPEND THRUST_TEST_GLOBS testing/omp/*.cu) - list(APPEND THRUST_TEST_GLOBS testing/omp/*.cpp) -endif () - -file( - GLOB THRUST_TESTS - RELATIVE ${PROJECT_SOURCE_DIR}/testing - ${CMAKE_CONFIGURE_DEPENDS} - ${THRUST_TEST_GLOBS} -) - -# List of tests that aren't implemented for all backends, but are implemented for CUDA. 
-set(THRUST_PARTIALLY_IMPLEMENTED_CUDA - async_copy - async_for_each - async_reduce - async_reduce_into - async_sort - async_transform - event - future -) - -# List of tests that aren't implemented for all backends, but are implemented for CPP. -set(THRUST_PARTIALLY_IMPLEMENTED_CPP -) - -# List of tests that aren't implemented for all backends, but are implemented for TBB. -set(THRUST_PARTIALLY_IMPLEMENTED_TBB -) - -# List of tests that aren't implemented for all backends, but are implemented for OMP. -set(THRUST_PARTIALLY_IMPLEMENTED_OMP -) - -# List of all partially implemented tests. -set(THRUST_PARTIALLY_IMPLEMENTED - ${THRUST_PARTIALLY_IMPLEMENTED_CUDA} - ${THRUST_PARTIALLY_IMPLEMENTED_CPP} - ${THRUST_PARTIALLY_IMPLEMENTED_TBB} - ${THRUST_PARTIALLY_IMPLEMENTED_OMP} -) - -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - if (14 EQUAL ${CMAKE_CXX_STANDARD}) - # Temporarily disable until NVBug 2492786 is fixed. - list(APPEND THRUST_PARTIALLY_IMPLEMENTED tuple_algorithms) - endif() -endif () - -list(REMOVE_DUPLICATES THRUST_PARTIALLY_IMPLEMENTED) - -foreach (THRUST_TEST_SOURCE IN LISTS THRUST_TESTS) - # TODO: Per-test flags. - - set(THRUST_TEST_CREATION_ADDITIONAL) - set(THRUST_TEST_ADD_TO_CTEST ON) - - get_filename_component(THRUST_TEST_CATEGORY ${THRUST_TEST_SOURCE} DIRECTORY) - if (NOT ("" STREQUAL "${THRUST_TEST_CATEGORY}")) - set(THRUST_TEST_CATEGORY "${THRUST_TEST_CATEGORY}.") - endif () - - get_filename_component(THRUST_TEST_NAME ${THRUST_TEST_SOURCE} NAME_WE) - - if ("${THRUST_TEST_NAME}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED) - # This test is partially implemented on _some_ backends... - if (NOT "${THRUST_TEST_NAME}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_${THRUST_DEVICE_SYSTEM}) - # ...but not on the selected one. 
- set(THRUST_TEST_CREATION_ADDITIONAL EXCLUDE_FROM_ALL) - set(THRUST_TEST_ADD_TO_CTEST OFF) - endif () - endif () - - set(THRUST_TEST "thrust.test.${THRUST_TEST_CATEGORY}${THRUST_TEST_NAME}") - - if (NOT "CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - # Test files are generally .cu; if CUDA is not enabled, CMake doesn't know what to - # do with them. But since they are pretty much just C++, we can compile them with - # non-nvcc C++ compilers... but we need to tell CMake that they are, in fact, just C++. - set_source_files_properties(${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE} - PROPERTIES - LANGUAGE CXX - COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}") - endif () - - add_executable( - ${THRUST_TEST} - ${THRUST_TEST_CREATION_ADDITIONAL} - # THRUST_TEST_CREATION_ADDITIONAL is actually a CMake keyword (sometimes). - ${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE} - ) - - target_include_directories( - ${THRUST_TEST} - PUBLIC ${PROJECT_SOURCE_DIR} - PRIVATE ${PROJECT_SOURCE_DIR}/testing - ) - - target_link_libraries(${THRUST_TEST} - thrust_testframework - ${THRUST_ADDITIONAL_LIBRARIES}) - - if (THRUST_TEST_ADD_TO_CTEST) - add_test(NAME ${THRUST_TEST} - COMMAND ${CMAKE_COMMAND} - -DTHRUST_BINARY=$ - ${THRUST_TEST_RUN_ARGUMENTS}) - endif () - - if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}" AND THRUST_ENABLE_TESTS_WITH_RDC) - set(THRUST_TEST_RDC "thrust.test.${THRUST_TEST_CATEGORY}rdc.${THRUST_TEST_NAME}") - - add_executable( - ${THRUST_TEST_RDC} - ${THRUST_TEST_CREATION_ADDITIONAL} - # THRUST_TEST_CREATION_ADDITIONAL is actually a CMake keyword (sometimes). 
- ${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE} - ) - - target_include_directories( - ${THRUST_TEST_RDC} - PUBLIC ${PROJECT_SOURCE_DIR} - PRIVATE ${PROJECT_SOURCE_DIR}/testing - ) - - target_link_libraries(${THRUST_TEST_RDC} - thrust_testframework - ${THRUST_ADDITIONAL_LIBRARIES}) - - set_target_properties(${THRUST_TEST_RDC} - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - if (THRUST_TEST_ADD_TO_CTEST) - add_test(NAME ${THRUST_TEST_RDC} - COMMAND ${CMAKE_COMMAND} - -DTHRUST_BINARY=$ - ${THRUST_TEST_RUN_ARGUMENTS}) - endif () - endif () -endforeach () - -# Handle examples. - -option(THRUST_EXAMPLE_FILECHECK_PATH "Path to the LLVM FileCheck utility." "") -option(THRUST_ENABLE_EXAMPLES_WITH_RDC "Also build all examples with RDC." OFF) - -set(THRUST_EXAMPLE_FILECHECK_ENABLED OFF) -if (NOT "" STREQUAL "${THRUST_EXAMPLE_FILECHECK_PATH}") - execute_process( - COMMAND "${THRUST_EXAMPLE_FILECHECK_PATH}" "${THRUST_FILECHECK_DATA_PATH}/thrust.sanity.filecheck" - INPUT_FILE "${CMAKE_SOURCE_DIR}/cmake/sanity" - RESULT_VARIABLE THRUST_FILECHECK_RESULT - ) - - if ("0" STREQUAL "${THRUST_FILECHECK_RESULT}") - set(THRUST_EXAMPLE_FILECHECK_ENABLED ON) - message("-- FileCheck enabled: ${THRUST_EXAMPLE_FILECHECK_PATH}") - endif () -endif () - -list(APPEND THRUST_EXAMPLE_GLOBS examples/*.cu) -list(APPEND THRUST_EXAMPLE_GLOBS examples/*.cpp) - -if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - list(APPEND THRUST_EXAMPLE_GLOBS examples/cuda/*.cu) -elseif ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}") - list(APPEND THRUST_EXAMPLE_GLOBS examples/omp/*.cu) - list(APPEND THRUST_EXAMPLE_GLOBS examples/omp/*.cpp) -endif () - -if (CMAKE_VERSION VERSION_LESS 3.12) - file( - GLOB THRUST_EXAMPLES - RELATIVE ${PROJECT_SOURCE_DIR}/examples - ${THRUST_EXAMPLE_GLOBS} - CONFIGURE_DEPENDS - ) -else () - file( - GLOB THRUST_EXAMPLES - RELATIVE ${PROJECT_SOURCE_DIR}/examples - ${THRUST_EXAMPLE_GLOBS} - ) -endif () - -set(THRUST_EXAMPLE_RUN_ARGUMENTS - -DTHRUST_SOURCE=${CMAKE_SOURCE_DIR} - 
-DTHRUST_FILECHECK_ENABLED=${THRUST_EXAMPLE_FILECHECK_ENABLED} - -DTHRUST_FILECHECK=${THRUST_EXAMPLE_FILECHECK_PATH} - -P "${CMAKE_SOURCE_DIR}/cmake/run_example.cmake") - -foreach (THRUST_EXAMPLE_SOURCE IN LISTS THRUST_EXAMPLES) - # TODO: Per-example flags. - - get_filename_component(THRUST_EXAMPLE_CATEGORY ${THRUST_EXAMPLE_SOURCE} DIRECTORY) - if (NOT ("" STREQUAL "${THRUST_EXAMPLE_CATEGORY}")) - set(THRUST_EXAMPLE_CATEGORY "${THRUST_EXAMPLE_CATEGORY}.") - endif () - - get_filename_component(THRUST_EXAMPLE_NAME ${THRUST_EXAMPLE_SOURCE} NAME_WE) - - set(THRUST_EXAMPLE "thrust.example.${THRUST_EXAMPLE_CATEGORY}${THRUST_EXAMPLE_NAME}") - - if (NOT "CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}") - # Example files are generally .cu; if CUDA is not enabled, CMake doesn't know what to - # do with them. But since they are pretty much just C++, we can compile them with - # non-nvcc C++ compilers... but we need to tell CMake that they are, in fact, just C++. - set_source_files_properties(${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE} - PROPERTIES - LANGUAGE CXX - COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}") - endif () - - add_executable( - ${THRUST_EXAMPLE} - ${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE} - ) - - target_include_directories( - ${THRUST_EXAMPLE} - PUBLIC ${PROJECT_SOURCE_DIR} - PRIVATE ${PROJECT_SOURCE_DIR}/examples - ) - - target_link_libraries(${THRUST_EXAMPLE} - ${THRUST_ADDITIONAL_LIBRARIES}) - - add_test(NAME ${THRUST_EXAMPLE} - COMMAND ${CMAKE_COMMAND} - -DTHRUST_EXAMPLE=${THRUST_EXAMPLE} - -DTHRUST_BINARY=$ - ${THRUST_EXAMPLE_RUN_ARGUMENTS}) - - if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}" AND THRUST_ENABLE_EXAMPLES_WITH_RDC) - set(THRUST_EXAMPLE_RDC "thrust.example.${THRUST_EXAMPLE_CATEGORY}rdc.${THRUST_EXAMPLE_NAME}") - - add_executable( - ${THRUST_EXAMPLE_RDC} - ${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE} - ) - - target_include_directories( - ${THRUST_EXAMPLE_RDC} - PUBLIC ${PROJECT_SOURCE_DIR} - PRIVATE 
${PROJECT_SOURCE_DIR}/examples - ) - - target_link_libraries(${THRUST_EXAMPLE_RDC} - ${THRUST_ADDITIONAL_LIBRARIES}) - - set_target_properties(${THRUST_EXAMPLE_RDC} - PROPERTIES CUDA_SEPERABLE_COMPILATION ON) - - add_test(NAME ${THRUST_EXAMPLE_RDC} - COMMAND ${CMAKE_COMMAND} - -DTHRUST_EXAMPLE=${THRUST_EXAMPLE} - -DTHRUST_BINARY=$ - ${THRUST_EXAMPLE_RUN_ARGUMENTS}) - endif () -endforeach () - +# Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up in the +# top-level project's dir when building Thrust via add_subdirectory. +set(THRUST_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib") +set(THRUST_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin") + +thrust_configure_multiconfig() +thrust_find_thrust() +thrust_build_compiler_targets() +thrust_update_system_found_flags() +if (THRUST_CUDA_FOUND) + include(cmake/ThrustCudaConfig.cmake) +endif() +thrust_build_target_list() + +message(STATUS "CPP system found? ${THRUST_CPP_FOUND}") +message(STATUS "CUDA system found? ${THRUST_CUDA_FOUND}") +message(STATUS "TBB system found? ${THRUST_TBB_FOUND}") +message(STATUS "OMP system found? ${THRUST_OMP_FOUND}") + +if (THRUST_ENABLE_HEADER_TESTING) + include(cmake/ThrustHeaderTesting.cmake) +endif() + +# Both testing and examples use ctest +if (THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES) + include(CTest) + enable_testing() +endif() + +if (THRUST_ENABLE_TESTING) + add_subdirectory(testing) +endif() + +if (THRUST_ENABLE_EXAMPLES) + add_subdirectory(examples) +endif() + +if (THRUST_ENABLE_BENCHMARKS) + add_subdirectory(internal/benchmark) +endif() + +if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND) + set(CUB_IN_THRUST ON) + # CUB's path is specified generically to support both GitHub and Perforce + # source tree layouts. The include directory used by cub-config.cmake + # for source layouts is the same as the project root. 
+ add_subdirectory("${_CUB_INCLUDE_DIR}" dependencies/cub) +endif() diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..8c56af363 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,96 @@ +# Code of Conduct + +## Overview + +This document defines the Code of Conduct followed and enforced for NVIDIA C++ + Core Compute Libraries. + +### Intended Audience + +* Community +* Developers +* Project Leads + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as + contributors and maintainers pledge to making participation in our project and + our community a harassment-free experience for everyone, regardless of age, + body size, disability, ethnicity, sex characteristics, gender identity and + expression, level of experience, education, socio-economic status, nationality, + personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +- Using welcoming and inclusive language. +- Being respectful of differing viewpoints and experiences. +- Gracefully accepting constructive criticism. +- Focusing on what is best for the community. +- Showing empathy towards other community members. + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual attention or + advances. +- Trolling, insulting/derogatory comments, and personal or political attacks. +- Public or private harassment. +- Publishing others’ private information, such as a physical or electronic + address, without explicit permission. +- Other conduct which could reasonably be considered inappropriate. + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable + behavior and are expected to take appropriate and fair corrective action in + response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or + reject comments, commits, code, wiki edits, issues, and other contributions + that are not aligned to this Code of Conduct, or to ban temporarily or + permanently any contributor for other behaviors that they deem inappropriate, + threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces + when an individual is representing the project or its community. +Examples of representing a project or community include using an official + project email address, posting via an official social media account, or acting + as an appointed representative at an online or offline event. +Representation of a project may be further defined and clarified by project + maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be + reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com). +All complaints will be reviewed and investigated and will result in a response + that is deemed necessary and appropriate to the circumstances. +The project team is obligated to maintain confidentiality with regard to the + reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good + faith may face temporary or permanent repercussions as determined by other + members of the project’s leadership. + +## Attribution + +This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was + adapted from the [Contributor Covenant version 1.4]. + +Please see this [FAQ] for answers to common questions about this Code of Conduct. + +## Contact + +Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters. 
+ + +[cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com + +[FAQ]: https://www.contributor-covenant.org/faq + +[NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ +[Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html diff --git a/LICENSE b/LICENSE index e454a5258..c22c22563 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,7 @@ +Unless otherwise noted, Thrust's source code is released under the Apache +License, Version 2.0: + +================================================================================ Apache License Version 2.0, January 2004 @@ -174,5 +178,72 @@ incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS - +================================================================================ + +Some portions of Thrust may be licensed under other compatible open-source +licenses. Any divergence from the Apache 2 license will be noted in the source +code where applicable. 
+ +Portions under other terms include, but are not limited to: + +================================================================================ + +Various C++ utility classes in Thrust are based on the Boost Iterator, Tuple, +System, and Random Number libraries, which are provided under the Boost Software +License: + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. 
+ +================================================================================ + +Portions of the thrust::complex implementation are derived from FreeBSD with the +following terms: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice[1] unmodified, this list of conditions, and the following + disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[1] Individual copyright notices from the original authors are included in + the relevant source files. + +================================================================================ diff --git a/Makefile b/Makefile index 12f9d964c..4b5a4a423 100644 --- a/Makefile +++ b/Makefile @@ -1,39 +1,25 @@ -# Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +# Copyright 2010-2020 NVIDIA Corporation. # -# NOTICE TO USER: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at # -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. +# http://www.apache.org/licenses/LICENSE-2.0 # -# This software and the information contained herein is being provided -# under the terms and conditions of a Source Code License Agreement. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # Makefile for building Thrust unit test driver # Force C++11 mode. 
NVCC will ignore it if the host compiler doesn't support it. -#export CXX_STD = c++11 +export CXX_STD := c++11 -export VERBOSE = 1 +export CCCL_ENABLE_DEPRECATIONS := 1 + +export VERBOSE := 1 ifndef PROFILE ifdef VULCAN_TOOLKIT_BASE @@ -53,10 +39,6 @@ else include ../build/config/DetectOS.mk endif -ifeq ($(OS),win32) - export I_AM_SLOPPY := 1 -endif - TMP_DIR := built TMP_PREFIX := $(ROOTDIR) TMP_ARCH := $(ARCH)_$(PROFILE)_agnostic @@ -129,50 +111,20 @@ else include ../build/common.mk endif -# Print host compiler version. - -VERSION_FLAG := -ifeq ($(OS),$(filter $(OS),Linux Darwin)) - ifdef USEPGCXX # PGI - VERSION_FLAG := -V - else - ifdef USEXLC # XLC - VERSION_FLAG := -qversion - else # GCC, ICC or Clang AKA the sane ones. - VERSION_FLAG := --version - endif - endif -else ifeq ($(OS),win32) # MSVC - # cl.exe run without any options will print its version info and exit. - VERSION_FLAG := -endif - -CCBIN_ENVIRONMENT := -ifeq ($(OS), QNX) - # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the - # environment. 
- CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET) -endif - -$(info #### CCBIN : $(CCBIN)) -$(info #### CCBIN VERSION : $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) $(VERSION_FLAG))) -$(info #### CXX_STD : $(CXX_STD)) - ifeq ($(OS), win32) - CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES) - APPEND_HEADERS_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h + CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES) + APPEND_H_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h APPEND_INL_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.inl APPEND_CUH_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.cuh - MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_HEADERS_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE) + MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_H_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE) else - CREATE_DVS_PACKAGE = tar -cv -f built/CUDA-thrust-package.tar bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES) - APPEND_HEADERS_DVS_PACKAGE = find thrust -name "*.h" | xargs tar rvf built/CUDA-thrust-package.tar - APPEND_INL_DVS_PACKAGE = find thrust -name "*.inl" | xargs tar rvf built/CUDA-thrust-package.tar - APPEND_CUH_DVS_PACKAGE = find thrust -name "*.cuh" | xargs tar rvf built/CUDA-thrust-package.tar - COMPRESS_DVS_PACKAGE = bzip2 built/CUDA-thrust-package.tar - MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_HEADERS_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE) && $(COMPRESS_DVS_PACKAGE) + TAR_FILES = bin thrust/internal/test thrust/internal/scripts 
thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES) + TAR_FILES += `find -L thrust \( -name "*.cuh" -o -name "*.h" -o -name "*.inl" \)` + MAKE_DVS_PACKAGE = tar -I bzip2 -chvf built/CUDA-thrust-package.tar.bz2 $(TAR_FILES) endif +COPY_CUB_FOR_PACKAGING = rm -rf cub && cp -rp ../cub/cub cub + DVS_OPTIONS := ifneq ($(TARGET_ARCH),$(HOST_ARCH)) @@ -185,16 +137,20 @@ endif THRUST_DVS_BUILD = release pack: + $(COPY_CUB_FOR_PACKAGING) cd .. && $(MAKE_DVS_PACKAGE) dvs: + $(COPY_CUB_FOR_PACKAGING) +# Build the CUDA Runtime in GVS, because GVS has no CUDA Runtime component. +# This is a temporary workaround until the Tegra team adds a CUDA Runtime +# component, which they have promised to do. +ifdef GVS $(MAKE) $(DVS_OPTIONS) -s -C ../cuda $(THRUST_DVS_BUILD) +endif $(MAKE) $(DVS_OPTIONS) $(THRUST_DVS_BUILD) THRUST_DVS=1 cd .. && $(MAKE_DVS_PACKAGE) -# XXX Deprecated, remove. -dvs_nightly: dvs - dvs_release: $(MAKE) dvs THRUST_DVS_BUILD=release diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 1ce1dcc29..000000000 --- a/NOTICE +++ /dev/null @@ -1,26 +0,0 @@ -Thrust includes source code from the Boost Iterator, Tuple, System, and Random Number libraries. 
- - Boost Software License - Version 1.0 - August 17th, 2003 - - Permission is hereby granted, free of charge, to any person or organization - obtaining a copy of the software and accompanying documentation covered by - this license (the "Software") to use, reproduce, display, distribute, - execute, and transmit the Software, and to prepare derivative works of the - Software, and to permit third-parties to whom the Software is furnished to - do so, all subject to the following: - - The copyright notices in the Software and this entire statement, including - the above license grant, this restriction and the following disclaimer, - must be included in all copies of the Software, in whole or in part, and - all derivative works of the Software, unless such copies or derivative - works are solely in the form of machine-executable object code generated by - a source language processor. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT - SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE - FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, - ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS IN THE SOFTWARE. - diff --git a/README.md b/README.md index 37c26ba90..b885389d4 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,253 @@ -Thrust: Code at the speed of light -================================== +:warning: **The Thrust repository has been archived and is now part of the unified [nvidia/cccl repository](https://github.com/nvidia/cccl). See the [announcement here](https://github.com/NVIDIA/cccl/discussions/520) for more information. Please visit the new repository for the latest updates.** :warning: -Thrust is a C++ parallel programming library which resembles the C++ Standard -Library. 
Thrust's **high-level** interface greatly enhances -programmer **productivity** while enabling performance portability between -GPUs and multicore CPUs. **Interoperability** with established technologies -(such as CUDA, TBB, and OpenMP) facilitates integration with existing -software. Develop **high-performance** applications rapidly with Thrust! +# Thrust: The C++ Parallel Algorithms Library -Thrust is distributed with the CUDA Toolkit in addition to GitHub. + + + + +
ExamplesGodboltDocumentation
-Examples --------- +Thrust is the C++ parallel algorithms library which inspired the introduction + of parallel algorithms to the C++ Standard Library. +Thrust's **high-level** interface greatly enhances programmer **productivity** + while enabling performance portability between GPUs and multicore CPUs. +It builds on top of established parallel programming frameworks (such as CUDA, + TBB, and OpenMP). +It also provides a number of general-purpose facilities similar to those found + in the C++ Standard Library. -Thrust is best explained through examples. The following source code -generates random numbers serially and then transfers them to a parallel -device where they are sorted. +Thrust is an open source project; it is available on + [GitHub] and included in the NVIDIA HPC SDK and CUDA Toolkit. +If you have one of those SDKs installed, no additional installation or compiler + flags are needed to use Thrust. -```c++ +## Examples + +Thrust is best learned through examples. + +The following example generates random numbers serially and then transfers them + to a parallel device where they are sorted. + +```cuda #include #include #include #include #include -#include -#include +#include -int main(void) -{ - // generate 32M random numbers serially +int main() { + // Generate 32M random numbers serially. + thrust::default_random_engine rng(1337); + thrust::uniform_int_distribution dist; thrust::host_vector h_vec(32 << 20); - std::generate(h_vec.begin(), h_vec.end(), rand); + thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); }); - // transfer data to the device + // Transfer data to the device. thrust::device_vector d_vec = h_vec; - // sort data on the device (846M keys per second on GeForce GTX 480) + // Sort data on the device. thrust::sort(d_vec.begin(), d_vec.end()); - // transfer data back to host + // Transfer data back to host. 
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); - - return 0; } ``` - -This code sample computes the sum of 100 random numbers in parallel: -```c++ +[See it on Godbolt](https://godbolt.org/z/GeWEd8Er9) + +This example demonstrates computing the sum of some random numbers in parallel: + +```cuda #include #include #include #include #include -#include -#include +#include + +int main() { + // Generate random data serially. + thrust::default_random_engine rng(1337); + thrust::uniform_real_distribution dist(-50.0, 50.0); + thrust::host_vector h_vec(32 << 20); + thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); }); + + // Transfer to device and compute the sum. + thrust::device_vector d_vec = h_vec; + double x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus()); +} +``` -int main(void) -{ - // generate random data serially - thrust::host_vector h_vec(100); - std::generate(h_vec.begin(), h_vec.end(), rand); +[See it on Godbolt](https://godbolt.org/z/cnsbWWME7) - // transfer to device and compute sum - thrust::device_vector d_vec = h_vec; - int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus()); - return 0; +This example shows how to perform such a reduction asynchronously: + +```cuda +#include +#include +#include +#include +#include +#include +#include +#include + +int main() { + // Generate 32M random numbers serially. + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution dist(-50.0, 50.0); + thrust::host_vector h_vec(32 << 20); + thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); }); + + // Asynchronously transfer to the device. + thrust::device_vector d_vec(h_vec.size()); + thrust::device_event e = thrust::async::copy(h_vec.begin(), h_vec.end(), + d_vec.begin()); + + // After the transfer completes, asynchronously compute the sum on the device. 
+ thrust::device_future f0 = thrust::async::reduce(thrust::device.after(e), + d_vec.begin(), d_vec.end(), + 0.0, thrust::plus()); + + // While the sum is being computed on the device, compute the sum serially on + // the host. + double f1 = std::accumulate(h_vec.begin(), h_vec.end(), 0.0, thrust::plus()); } ``` -Refer to the [Quick Start Guide](http://github.com/thrust/thrust/wiki/Quick-Start-Guide) page for further information and examples. +[See it on Godbolt](https://godbolt.org/z/be54efaKj) + +## Getting The Thrust Source Code + +Thrust is a header-only library; there is no need to build or install the project +unless you want to run the Thrust unit tests. + +The CUDA Toolkit provides a recent release of the Thrust source code in +`include/thrust`. This will be suitable for most users. + +Users that wish to contribute to Thrust or try out newer features should +recursively clone the Thrust Github repository: + +``` +git clone --recursive https://github.com/NVIDIA/thrust.git +``` + +## Using Thrust From Your Project + +For CMake-based projects, we provide a CMake package for use with +`find_package`. See the [CMake README](thrust/cmake/README.md) for more +information. Thrust can also be added via `add_subdirectory` or tools like +the [CMake Package Manager](https://github.com/cpm-cmake/CPM.cmake). + +For non-CMake projects, compile with: +- The Thrust include path (`-I`) +- The libcu++ include path (`-I/dependencies/libcudacxx/`) +- The CUB include path, if using the CUDA device system (`-I/dependencies/cub/`) +- By default, the CPP host system and CUDA device system are used. + These can be changed using compiler definitions: + - `-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_XXX`, + where `XXX` is `CPP` (serial, default), `OMP` (OpenMP), or `TBB` (Intel TBB) + - `-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_XXX`, where `XXX` is + `CPP`, `OMP`, `TBB`, or `CUDA` (default). 
+ +## Developing Thrust + +Thrust uses the [CMake build system] to build unit tests, examples, and header + tests. +To build Thrust as a developer, it is recommended that you use our + containerized development system: + +```bash +# Clone Thrust and CUB repos recursively: +git clone --recursive https://github.com/NVIDIA/thrust.git +cd thrust + +# Build and run tests and examples: +ci/local/build.bash +``` + +That does the equivalent of the following, but in a clean containerized + environment which has all dependencies installed: + +```bash +# Clone Thrust and CUB repos recursively: +git clone --recursive https://github.com/NVIDIA/thrust.git +cd thrust + +# Create build directory: +mkdir build +cd build + +# Configure -- use one of the following: +cmake .. # Command line interface. +ccmake .. # ncurses GUI (Linux only). +cmake-gui # Graphical UI, set source/build directories in the app. + +# Build: +cmake --build . -j ${NUM_JOBS} # Invokes make (or ninja, etc). + +# Run tests and examples: +ctest +``` + +By default, a serial `CPP` host system, `CUDA` accelerated device system, and + C++14 standard are used. +This can be changed in CMake and via flags to `ci/local/build.bash` + +More information on configuring your Thrust build and creating a pull request + can be found in the [contributing section]. + +## Licensing + +Thrust is an open source project developed on [GitHub]. +Thrust is distributed under the [Apache License v2.0 with LLVM Exceptions]; + some parts are distributed under the [Apache License v2.0] and the + [Boost License v1.0]. 
+ +## CI Status + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[GitHub]: https://github.com/nvidia/thrust + +[CMake section]: https://nvidia.github.io/thrust/setup/cmake_options.html +[contributing section]: https://nvidia.github.io/thrust/contributing.html -Development process -------------------- +[CMake build system]: https://cmake.org -For information on development process and branching, see [this document](doc/branching.md). +[Apache License v2.0 with LLVM Exceptions]: https://llvm.org/LICENSE.txt +[Apache License v2.0]: https://www.apache.org/licenses/LICENSE-2.0.txt +[Boost License v1.0]: https://www.boost.org/LICENSE_1_0.txt diff --git a/ci/axis/cpu.yml b/ci/axis/cpu.yml new file mode 100644 index 000000000..cc393169d --- /dev/null +++ b/ci/axis/cpu.yml @@ -0,0 +1,61 @@ +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +SDK_TYPE: + - cuda + +SDK_VER: + - 11.7.0-devel + +OS_TYPE: + - ubuntu + +OS_VER: + - 20.04 + +CXX_TYPE: + - clang + - gcc + - icc + +CXX_VER: + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - latest + +exclude: + # Excludes by `CXX_VER`. 
+ - CXX_TYPE: gcc + CXX_VER: 12 + - CXX_TYPE: gcc + CXX_VER: latest + - CXX_TYPE: clang + CXX_VER: 5 + - CXX_TYPE: clang + CXX_VER: 6 + - CXX_TYPE: clang + CXX_VER: latest + - CXX_TYPE: icc + CXX_VER: 5 + - CXX_TYPE: icc + CXX_VER: 6 + - CXX_TYPE: icc + CXX_VER: 7 + - CXX_TYPE: icc + CXX_VER: 8 + - CXX_TYPE: icc + CXX_VER: 9 + - CXX_TYPE: icc + CXX_VER: 10 + - CXX_TYPE: icc + CXX_VER: 11 + - CXX_TYPE: icc + CXX_VER: 12 diff --git a/ci/axis/gpu.yml b/ci/axis/gpu.yml new file mode 100644 index 000000000..550083aab --- /dev/null +++ b/ci/axis/gpu.yml @@ -0,0 +1,22 @@ +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +SDK_TYPE: + - cuda + +SDK_VER: + - 11.7.0-devel + +OS_TYPE: + - ubuntu + +OS_VER: + - 20.04 + +CXX_TYPE: + - gcc + +CXX_VER: + - 9 diff --git a/ci/common/build.bash b/ci/common/build.bash new file mode 100755 index 000000000..37aafaf8b --- /dev/null +++ b/ci/common/build.bash @@ -0,0 +1,439 @@ +#! /usr/bin/env bash + +# Copyright (c) 2018-2022 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +################################################################################ +# Thrust and CUB build script for gpuCI +################################################################################ + +set -e # Stop on errors. + +# append variable value +# Appends ${value} to ${variable}, adding a space before ${value} if +# ${variable} is not empty. +function append { + tmp="${!1:+${!1} }${2}" + eval "${1}=\${tmp}" +} + +# log args... +# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after. +function log() { + printf "\n>>>> %s\n\n" "${*}" +} + +# print_with_trailing_blank_line args... 
+# Prints ${args[*]} with one blank line following, preserving newlines within +# ${args[*]} but stripping any preceding ${args[*]}. +function print_with_trailing_blank_line { + printf "%s\n\n" "${*}" +} + +# echo_and_run name args... +# Echo ${args[@]}, then execute ${args[@]} +function echo_and_run { + echo "${1}: ${@:2}" + ${@:2} +} + +# echo_and_run_timed name args... +# Echo ${args[@]}, then execute ${args[@]} and report how long it took, +# including ${name} in the output of the time. +function echo_and_run_timed { + echo "${@:2}" + TIMEFORMAT=$'\n'"${1} Time: %lR" + time ${@:2} +} + +# join_delimit [value [value [...]]] +# Combine all values into a single string, separating each by a single character +# delimiter. Eg: +# foo=(bar baz kramble) +# joined_foo=$(join_delimit "|" "${foo[@]}") +# echo joined_foo # "bar|baz|kramble" +function join_delimit { + local IFS="${1}" + shift + echo "${*}" +} + +################################################################################ +# VARIABLES - Set up bash and environmental variables. +################################################################################ + +# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc. +set +e # Don't stop on errors from /etc/cccl.bashrc. +source /etc/cccl.bashrc +set -e # Stop on errors. + +# Configure sccache. +if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + log "Disabling sccache (nvcxx not supported)" + unset ENABLE_SCCACHE +elif [[ "${BUILD_MODE}" == "pull-request" || "${BUILD_MODE}" == "branch" ]]; then + # gpuCI builds cache in S3. 
+ export ENABLE_SCCACHE="gpuCI" + # Change to 'thrust-aarch64' if we add aarch64 builds to gpuCI: + export SCCACHE_S3_KEY_PREFIX=thrust-linux64 # [linux64] + export SCCACHE_BUCKET=rapids-sccache-east + export SCCACHE_REGION=us-east-2 + export SCCACHE_IDLE_TIMEOUT=32768 +else + export ENABLE_SCCACHE="local" + # local builds cache locally + export SCCACHE_DIR="${WORKSPACE}/build-sccache" +fi + +# Set sccache compiler flags +if [[ -n "${ENABLE_SCCACHE}" ]]; then + export CMAKE_CUDA_COMPILER_LAUNCHER="sccache" + export CMAKE_CXX_COMPILER_LAUNCHER="sccache" + export CMAKE_C_COMPILER_LAUNCHER="sccache" +fi + +# Set path. +export PATH=/usr/local/cuda/bin:${PATH} + +# Set home to the job's workspace. +export HOME=${WORKSPACE} + +# Per-process memory util logs: +MEMMON_LOG=${WORKSPACE}/build/memmon_log + +# Switch to the build directory. +cd ${WORKSPACE} +mkdir -p build +cd build + +# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate: +rm -f .ninja_log + +if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then + CMAKE_BUILD_TYPE="Release" +fi + +CMAKE_BUILD_FLAGS="--" + +# The Docker image sets up `${CXX}` and `${CUDACXX}`. +append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" +append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'" + +if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + # NVC++ isn't properly detected by CMake, so we have to tell CMake to ignore + # detection and explicitly provide the compiler ID. Ninja currently isn't + # supported, so we just use makefiles. + append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_FORCED=ON" + append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_ID=NVCXX" + # We use the NVC++ "slim" image, which only contains a single CUDA toolkit version. + # When using NVC++ in an environment without GPUs (like our CPU-only + # builders) it unfortunately defaults to the oldest CUDA toolkit version it + # supports, even if that version is not in the image. So, we have to + # explicitly tell NVC++ which CUDA toolkit version to use. 
+ CUDA_VER=$(echo ${SDK_VER} | sed 's/.*\(cuda[0-9]\+\.[0-9]\+\)/\1/') + append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-gpu=${CUDA_VER}" + # Don't stop on build failures. + append CMAKE_BUILD_FLAGS "-k" +else + if [[ "${CXX_TYPE}" == "icc" ]]; then + # Only the latest version of the Intel C++ compiler, which NVCC doesn't + # officially support yet, is freely available. + append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler" + fi + # We're using NVCC so we need to set the host compiler. + append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'" + append CMAKE_FLAGS "-G Ninja" + # Don't stop on build failures. + append CMAKE_BUILD_FLAGS "-k0" +fi + +DETERMINE_PARALLELISM_FLAGS="" + +# Used to limit the number of default build threads. Any build/link +# steps that exceed this limit will cause this script to report a +# failure. Tune this using the memmon logs printed after each run. +# +# Build steps that take more memory than this limit should +# be split into multiple steps/translation units. Any temporary +# increases to this threshold should be reverted ASAP. The goal +# is to decrease this as much as possible and not increase it. +if [[ -z "${MIN_MEMORY_PER_THREAD}" ]]; then + if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + MIN_MEMORY_PER_THREAD=3.0 # GiB + elif [[ "${CXX_TYPE}" == "icc" ]]; then + MIN_MEMORY_PER_THREAD=2.5 # GiB + else + MIN_MEMORY_PER_THREAD=2.0 # GiB + fi +fi +append DETERMINE_PARALLELISM_FLAGS "--min-memory-per-thread ${MIN_MEMORY_PER_THREAD}" + +if [[ -n "${PARALLEL_LEVEL}" ]]; then + append DETERMINE_PARALLELISM_FLAGS "-j ${PARALLEL_LEVEL}" +fi + +# COVERAGE_PLAN options: +# * Exhaustive +# * Thorough +# * Minimal +if [[ -z "${COVERAGE_PLAN}" ]]; then + # `ci/local/build.bash` always sets a coverage plan, so we can assume we're + # in gpuCI if one was not set. + if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + # Today, NVC++ builds take too long to do anything more than Minimal. 
+ COVERAGE_PLAN="Minimal" + elif [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${BUILD_MODE}" == "branch" ]]; then + # Post-commit CPU CI builds. + COVERAGE_PLAN="Exhaustive" + elif [[ "${BUILD_TYPE}" == "cpu" ]]; then + # Pre-commit CPU CI builds. + COVERAGE_PLAN="Thorough" + elif [[ "${BUILD_TYPE}" == "gpu" ]]; then + # Pre- and post-commit GPU CI builds. + COVERAGE_PLAN="Minimal" + fi +fi + +case "${COVERAGE_PLAN}" in + Exhaustive) + append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON" + append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON" + append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=LARGE" + ;; + Thorough) + # Build the legacy bench.cu. We'll probably want to remove this when we + # switch to the new, heavier thrust_benchmarks project. + append CMAKE_FLAGS "-DTHRUST_ENABLE_BENCHMARKS=ON" + append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON" + append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL" + append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON" + append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON" + if [[ "${CXX_TYPE}" != "nvcxx" ]]; then + # NVC++ can currently only target one compute architecture at a time. 
+ append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_50=ON" + append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_60=ON" + append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_70=ON" + fi + append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON" + ;; + Minimal) + append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_LATEST=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=OFF" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=OFF" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON" + append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL" + append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON" + append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON" + if [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${CXX_TYPE}" == "nvcxx" ]]; then + # If no GPU is automatically detected, NVC++ insists that you explicitly + # provide an architecture. + # TODO: This logic should really be moved into CMake, but it will be + # tricky to do that until CMake officially supports NVC++. + append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON" + fi + ;; +esac + +if [[ -n "${@}" ]]; then + append CMAKE_BUILD_FLAGS "${@}" +fi + +append CTEST_FLAGS "--output-on-failure" + +CTEST_EXCLUSION_REGEXES=() + +if [[ "${BUILD_TYPE}" == "cpu" ]]; then + CTEST_EXCLUSION_REGEXES+=("^cub" "^thrust.*cuda") +fi + +if [[ -n "${CTEST_EXCLUSION_REGEXES[@]}" ]]; then + CTEST_EXCLUSION_REGEX=$(join_delimit "|" "${CTEST_EXCLUSION_REGEXES[@]}") + append CTEST_FLAGS "-E ${CTEST_EXCLUSION_REGEX}" +fi + +if [[ -n "${@}" ]]; then + CTEST_INCLUSION_REGEX=$(join_delimit "|" "${@}") + append CTEST_FLAGS "-R ^${CTEST_INCLUSION_REGEX[@]}$" +fi + +# Export variables so they'll show up in the logs when we report the environment. 
+export COVERAGE_PLAN +export CMAKE_FLAGS +export CMAKE_BUILD_FLAGS +export CTEST_FLAGS + +################################################################################ +# ENVIRONMENT - Configure and print out information about the environment. +################################################################################ + +log "Determine system topology..." + +# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the +# system topology. +source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS} + +log "Get environment..." + +env | sort + +log "Check versions..." + +# We use sed and echo below to ensure there is always one and only trailing +# line following the output from each tool. + +${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/' + +echo + +${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/' + +echo + +cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/' + +if [[ "${BUILD_TYPE}" == "gpu" ]]; then + echo + nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/' +fi + +if [[ -n "${ENABLE_SCCACHE}" ]]; then + echo + # Set sccache statistics to zero to capture clean run. + sccache --version + sccache --zero-stats | grep location +fi + +################################################################################ +# BUILD - Build Thrust and CUB examples and tests. +################################################################################ + +log "Configure Thrust and CUB..." + +echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS} +configure_status=$? + +log "Build Thrust and CUB..." + +# ${PARALLEL_LEVEL} needs to be passed after we run +# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}. +set +e # Don't stop on build failures. + +# Monitor memory usage. Thresholds in GiB: +python3 ${WORKSPACE}/ci/common/memmon.py \ + --log-threshold 0.0 \ + --fail-threshold ${MIN_MEMORY_PER_THREAD} \ + --log-file ${MEMMON_LOG} \ + & +memmon_pid=$! 
+ +echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL} +build_status=$? + +# Stop memmon: +kill -s SIGINT ${memmon_pid} + +# Re-enable exit on failure: +set -e + +################################################################################ +# TEST - Run Thrust and CUB examples and tests. +################################################################################ + +log "Test Thrust and CUB..." + +( + # Make sure test_status captures ctest, not tee: + # https://stackoverflow.com/a/999259/11130318 + set -o pipefail + echo_and_run_timed "Test" ctest ${CTEST_FLAGS} | tee ctest_log +) +test_status=$? + +################################################################################ +# COMPILATION STATS +################################################################################ + +if [[ -n "${ENABLE_SCCACHE}" ]]; then + # Get sccache stats after the compile is completed + COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') + CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }') + HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + log "sccache stats (${HIT_RATE}% hit):" + sccache -s +fi + +################################################################################ +# COMPILE TIME INFO: Print the 20 longest running build steps (ninja only) +################################################################################ + +if [[ -f ".ninja_log" ]]; then + log "Checking slowest build steps:" + echo_and_run "CompileTimeInfo" cmake -P ../cmake/PrintNinjaBuildTimes.cmake | head -n 23 +fi + +################################################################################ +# RUNTIME INFO: Print the 20 longest running test steps +################################################################################ + +if [[ -f "ctest_log" ]]; then + log "Checking slowest test steps:" + echo_and_run "TestTimeInfo" cmake 
-DLOGFILE=ctest_log -P ../cmake/PrintCTestRunTimes.cmake | head -n 20 +fi + +################################################################################ +# MEMORY_USAGE +################################################################################ + +memmon_status=0 +if [[ -f "${MEMMON_LOG}" ]]; then + log "Checking memmon logfile: ${MEMMON_LOG}" + + if [[ -n "$(grep -E "^FAIL" ${MEMMON_LOG})" ]]; then + log "error: Some build steps exceeded memory threshold (${MIN_MEMORY_PER_THREAD} GiB):" + grep -E "^FAIL" ${MEMMON_LOG} + memmon_status=1 + else + log "Top memory usage per build step (all less than limit of ${MIN_MEMORY_PER_THREAD} GiB):" + if [[ -s ${MEMMON_LOG} ]]; then + # Not empty: + head -n5 ${MEMMON_LOG} + else + echo "None detected above logging threshold." + fi + fi +fi + +################################################################################ +# SUMMARY - Print status of each step and exit with failure if needed. +################################################################################ + +log "Summary:" +echo "Warnings:" +# Not currently a failure; sccache makes these unreliable and intermittent: +echo "- Build Memory Check: ${memmon_status}" +echo "Failures:" +echo "- Configure Error Code: ${configure_status}" +echo "- Build Error Code: ${build_status}" +echo "- Test Error Code: ${test_status}" + +if [[ "${configure_status}" != "0" ]] || \ + [[ "${build_status}" != "0" ]] || \ + [[ "${test_status}" != "0" ]]; then + exit 1 +fi diff --git a/ci/common/determine_build_parallelism.bash b/ci/common/determine_build_parallelism.bash new file mode 100755 index 000000000..9813fcb2f --- /dev/null +++ b/ci/common/determine_build_parallelism.bash @@ -0,0 +1,119 @@ +#! /usr/bin/env bash + +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+ +function usage { + echo "Usage: ${0} [flags...]" + echo + echo "Examine the system topology to determine a reasonable amount of build" + echo "parallelism." + echo + echo "Exported variables:" + echo " \${LOGICAL_CPUS} : Logical processors (e.g. threads)." + echo " \${PHYSICAL_CPUS} : Physical processors (e.g. cores)." + echo " \${TOTAL_MEM} : Total system memory [GB]." + echo " \${MAX_THREADS_PER_CORE} : Maximum threads per core allowed." + echo " \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed." + echo " \${CPU_BOUND_THREADS} : # of build threads constrained by processors." + echo " \${MEM_BOUND_THREADS} : # of build threads constrained by memory [GB]." + echo " \${PARALLEL_LEVEL} : Determined # of build threads." + echo " \${MEM_PER_THREAD} : Memory [GB] per build thread." + echo + echo "-h, -help, --help" + echo " Print this message." + echo + echo "-q, --quiet" + echo " Print nothing and only export variables." + echo + echo "-j , --jobs " + echo " Explicitly set the number of build threads to use." + echo + echo "--max-threads-per-core " + echo " Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])." + echo + echo "--min-memory-per-thread " + echo " Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])." + + exit -3 +} + +QUIET=0 + +export MAX_THREADS_PER_CORE=2 +export MIN_MEMORY_PER_THREAD=4 # [GB] + +while test ${#} != 0 +do + case "${1}" in + -h) ;& + -help) ;& + --help) usage ;; + -q) ;& + --quiet) QUIET=1 ;; + -j) ;& + --jobs) + shift # The next argument is the number of threads. + PARALLEL_LEVEL="${1}" + ;; + --max-threads-per-core) + shift # The next argument is the number of threads per core. + MAX_THREADS_PER_CORE="${1}" + ;; + --min-memory-per-thread) + shift # The next argument is the amount of memory per thread. 
+ MIN_MEMORY_PER_THREAD="${1}" + ;; + esac + shift +done + +# Detect processor counts and total memory for the host. +# https://stackoverflow.com/a/23378780 +if [[ "$(uname)" == "Darwin" ]]; then + export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max) + export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max) + # macOS has no /proc/meminfo; hw.memsize reports physical memory in bytes. + TOTAL_MEM_KIB=$(( $(sysctl -n hw.memsize) / 1024 )) +else + # Count the non-comment rows printed by `lscpu -p` (grep -E replaces the obsolescent egrep). + export LOGICAL_CPUS=$(lscpu -p | grep -Ev '^#' | wc -l) + # Rows that share CSV fields 2-4 are counted once to get physical cores. + export PHYSICAL_CPUS=$(lscpu -p | grep -Ev '^#' | sort -u -t, -k 2,4 | wc -l) + # The MemTotal value is treated as KiB by the conversion below. + TOTAL_MEM_KIB=$(grep MemTotal /proc/meminfo | awk '{ print $2 }') +fi + +# Total memory in GB, printed with four significant digits. +export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", ${TOTAL_MEM_KIB} / (1024 * 1024) }") + +export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }") +export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }") + +if [[ -z "${PARALLEL_LEVEL}" ]]; then + # Pick the smaller of the two as the default. + if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then + export PARALLEL_LEVEL=${MEM_BOUND_THREADS} + else + export PARALLEL_LEVEL=${CPU_BOUND_THREADS} + fi + # A host with less memory than MIN_MEMORY_PER_THREAD yields 0 here, which + # would divide by zero when computing MEM_PER_THREAD below. + if [[ "${PARALLEL_LEVEL}" -lt 1 ]]; then + export PARALLEL_LEVEL=1 + fi +else + # Value came from -j/--jobs or the environment; export it because the usage + # text lists PARALLEL_LEVEL among the exported variables. + export PARALLEL_LEVEL + EXPLICIT_PARALLEL_LEVEL=1 +fi + +# This can be a floating point number. +export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }") + +# Print a summary of every derived value unless -q/--quiet was given. +if [[ "${QUIET}" == 0 ]]; then + echo "Logical CPUs: ${LOGICAL_CPUS} [threads]" + echo "Physical CPUs: ${PHYSICAL_CPUS} [cores]" + echo "Total Mem: ${TOTAL_MEM} [GBs]" + echo "Max Threads Per Core: ${MAX_THREADS_PER_CORE} [threads/core]" + echo "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]" + echo "CPU Bound Threads: ${CPU_BOUND_THREADS} [threads]" + echo "Mem Bound Threads: ${MEM_BOUND_THREADS} [threads]" + + echo -n "Parallel Level: ${PARALLEL_LEVEL} [threads]" + if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then + echo " (explicitly set)" + else + echo + fi + + echo "Mem Per Thread: ${MEM_PER_THREAD} [GBs/thread]" +fi + diff --git a/ci/common/memmon.py b/ci/common/memmon.py new file mode 100755 index 000000000..505503733 --- /dev/null +++ b/ci/common/memmon.py @@ -0,0 +1,110 @@ +#! 
/usr/bin/env python + +# Copyright (c) 2022 NVIDIA Corporation +# Reply-To: Allison Vacanti +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +help_text = """%(prog)s [reference.json compare.json | reference_dir/ compare_dir/] + +This script: + +1. Runs `top -bco RES`, continuously extracting the memory usage of each process. +2. If a process uses more than `log_threshold` GiB and exceeds any other recorded + entry for the process, it is stored in `entries`. +3. When this script receives SIGINT, it writes two files: + * `log_file` will contain all recorded max-memory-per-process entries + * `fail_file` will contain all entries that exceed `fail_threshold` +""" + +import argparse +import os +import re +import signal +import sys + +from subprocess import Popen, PIPE, STDOUT + +parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text) +parser.add_argument('--log-threshold', type=float, dest='log_threshold', + default=0.5, + help='Logging threshold in GiB.') +parser.add_argument('--fail-threshold', type=float, dest='fail_threshold', + default=2, + help='Failure threshold in GiB.') +parser.add_argument('--log-file', type=str, dest='log_file', default='memmon_log', + help='Output file for log entries.') +args, unused = parser.parse_known_args() + +entries = {} + + +def signal_handler(sig, frame): + # Sort by mem: + sortentries = sorted(entries.items(), key=lambda x: x[1], reverse=True) + + lf = open(args.log_file, "w") + + for com, mem in sortentries: + status = "PASS" + if mem >= args.fail_threshold: + status = "FAIL" + line = "%4s | %3.1f GiB | %s\n" % (status, mem, com) + lf.write(line) + + lf.close() + sys.exit(0) + + +signal.signal(signal.SIGINT, signal_handler) + +# Find the toprc config file and configure top's env. 
+# This config: +# - Hides all columns except for RES and COMMAND +# - Sorts by RES +# - Enables long command strings (-c) +script_dir = os.path.dirname(os.path.realpath(__file__)) +config_dir = os.path.join(script_dir, 'memmon_config') + +proc = Popen(["top", "-b", "-w", "512"], + stdin=PIPE, stdout=PIPE, stderr=STDOUT, + env={"XDG_CONFIG_HOME": config_dir}) + +regex = re.compile("^\\s*([0-9.]+[kmgtp]?)\\s+(.+)\\s*$") + + +# Convert a memory string from top into floating point GiB +def parse_mem(mem_str): + if mem_str[-1] == "k": + return float(mem_str[:-1]) / (1024 * 1024) + elif mem_str[-1] == "m": + return float(mem_str[:-1]) / (1024) + elif mem_str[-1] == "g": + return float(mem_str[:-1]) + elif mem_str[-1] == "t": + return float(mem_str[:-1]) * 1024 + elif mem_str[-1] == "p": # please no + return float(mem_str[:-1]) * 1024 * 1024 + # bytes: + return float(mem_str) / (1024 * 1024 * 1024) + + +for line in proc.stdout: + line = line.decode() + match = regex.match(line) + if match: + mem = parse_mem(match.group(1)) + if mem < args.log_threshold and mem < args.fail_threshold: + continue + com = match.group(2) + if com in entries and entries[com] > mem: + continue + if mem >= args.fail_threshold: + # Print a notice immediately -- this helps identify the failures + # as they happen, since `com` may not provide enough info. 
+ print("memmon.py failure: Build step exceed memory threshold:\n" + " - Threshold: %3.1f GiB\n" + " - Usage: %3.1f GiB\n" + " - Command: %s" % (args.fail_threshold, mem, com)) + entries[com] = mem diff --git a/ci/common/memmon_config/procps/toprc b/ci/common/memmon_config/procps/toprc new file mode 100644 index 000000000..883a482ce --- /dev/null +++ b/ci/common/memmon_config/procps/toprc @@ -0,0 +1,16 @@ +top's Config File (Linux processes with windows) +Id:i, Mode_altscr=0, Mode_irixps=1, Delay_time=3.0, Curwin=0 +Def fieldscur=%(34;@D7:9&')*+,-./012568<>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193972, sortindx=18, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=1, msgsclr=1, headclr=3, taskclr=1 +Job fieldscur=(Ļ@<)*+,-./012568>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=0, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=6, msgsclr=6, headclr=7, taskclr=6 +Mem fieldscur=?@ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=3, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=3, msgsclr=3, headclr=2, taskclr=3 +Fixed_widest=0, Summ_mscale=1, Task_mscale=0, Zero_suppress=0 + diff --git a/ci/cpu/build.bash b/ci/cpu/build.bash new file mode 100755 index 000000000..69b99bbec --- /dev/null +++ b/ci/cpu/build.bash @@ -0,0 +1,14 @@ +#! /usr/bin/env bash + +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+ +################################################################################ +# Thrust and CUB build script for gpuCI (CPU-only) +################################################################################ + +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} + +source ${WORKSPACE}/ci/common/build.bash diff --git a/ci/gpu/build.bash b/ci/gpu/build.bash new file mode 100755 index 000000000..f6cdf021c --- /dev/null +++ b/ci/gpu/build.bash @@ -0,0 +1,14 @@ +#! /usr/bin/env bash + +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +################################################################################ +# Thrust and CUB build script for gpuCI (heterogeneous) +################################################################################ + +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} + +source ${WORKSPACE}/ci/common/build.bash diff --git a/ci/local/build.bash b/ci/local/build.bash new file mode 100755 index 000000000..8b20ef063 --- /dev/null +++ b/ci/local/build.bash @@ -0,0 +1,224 @@ +#! /usr/bin/env bash + +# Copyright (c) 2018-2020 NVIDIA Corporation +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +################################################################################ +# Thrust and CUB local containerized build script +################################################################################ + +function usage { + echo "Usage: ${0} [flags...] [cmake-targets...]" + echo + echo "Build and test your local repository using a gpuCI Docker image." + echo "If CMake targets are specified, only those targets are built and tested." + echo "Otherwise, everything is built and tested." 
+ echo + echo "-h, -help, --help" + echo " Print this message." + echo + echo "-r , --repository " + echo " Path to the repository (default: ${REPOSITORY_PATH})." + echo + echo "-i , --image " + echo " Docker image to use (default: ${IMAGE})" + echo + echo "-l, --local-image" + echo " Use the local version of the image instead of pulling from Docker hub." + echo + echo "-s, --shell-only" + echo " Skip building and testing and launch an interactive shell instead." + echo + echo "-d, --disable-gpus" + echo " Don't start the container with the NVIDIA runtime and GPUs attached." + echo + echo "-c, --clean" + echo " If the build directory already exists, delete it." + echo + echo "-j , --jobs " + echo " Number of threads to use when building (default: inferred)." + echo + echo "-b , --cmake-build-type " + echo " CMake build type to use, either Release, RelWithDebInfo, or Debug" + echo " (default: ${CMAKE_BUILD_TYPE})." + echo + echo "-p , --coverage-plan " + echo " Coverage plan to use, either Exhaustive, Thorough, or Minimal" + echo " (default: ${COVERAGE_PLAN})." + echo + + exit -3 +} + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..) + +################################################################################ +# FLAGS - Process command line flags. +################################################################################ + +IMAGE="gpuci/cccl:cuda11.7.0-devel-ubuntu20.04-gcc9" + +LOCAL_IMAGE=0 + +SHELL_ONLY=0 + +BUILD_TYPE="gpu" + +CLEAN=0 + +PARALLEL_LEVEL="" + +CMAKE_BUILD_TYPE="Release" + +COVERAGE_PLAN="Minimal" + +TARGETS="" + +while test ${#} != 0 +do + case "${1}" in + -h) ;& + -help) ;& + --help) usage ;; + -r) ;& + --repository) + shift # The next argument is the path. + REPOSITORY_PATH="${1}" + ;; + -i) ;& + --image) + shift # The next argument is the image. 
+ IMAGE="${1}" + ;; + -l) ;& + --local-image) LOCAL_IMAGE=1 ;; + -s) ;& + --shell-only) SHELL_ONLY=1 ;; + -d) ;& + --disable-gpus) BUILD_TYPE="cpu" ;; + -c) ;& + --clean) CLEAN=1 ;; + -j) ;& + --jobs) + shift # The next argument is the number of threads. + PARALLEL_LEVEL="${1}" + ;; + -b) ;& + --cmake-build-type) + shift # The next argument is the build type. + CMAKE_BUILD_TYPE="${1}" + ;; + -p) ;& + --coverage-plan) + shift # The next argument is the coverage plan. + COVERAGE_PLAN="${1}" + ;; + *) + TARGETS="${TARGETS:+${TARGETS} }${1}" + ;; + esac + shift +done + +################################################################################ +# PATHS - Setup paths for the container. +################################################################################ + +# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being +# built and tested. It can be set with the --repository flag. +# +# ${BUILD_PATH} is the local filesystem path that will be used for the build. It +# is named after the image name, allowing multiple image builds to coexist on +# the local filesystem. +# +# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside +# the container. +# +# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the +# container. + +BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g') + +if [[ "${CLEAN}" != 0 ]]; then + rm -rf ${BUILD_PATH} +fi + +mkdir -p ${BUILD_PATH} + +BASE_PATH_IN_CONTAINER="/cccl" + +REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")" + +BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build" + +################################################################################ +# ENVIRONMENT - Setup the thunk build script that will be run by the container. 
+################################################################################ + +# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this +# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399 + +COMMAND="sudo ldconfig; sudo ldconfig" +if [[ "${SHELL_ONLY}" != 0 ]]; then + COMMAND="${COMMAND}; bash" +else + COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash" +fi + +################################################################################ +# GPU - Setup GPUs. +################################################################################ + +if [[ "${BUILD_TYPE}" == "gpu" ]]; then + # Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}. + if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then + VISIBLE_DEVICES="all" + else + VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}" + fi + + DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/') + GPU_OPTS="--gpus device=${VISIBLE_DEVICES}" + if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]] + then + GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'" + fi +fi + +################################################################################ +# LAUNCH - Pull and launch the container. +################################################################################ + +NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia) +if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then + echo "NVIDIA Docker not found, the build may fail." 
+ echo "Please install it if you encounter issues: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce" +fi + +if [[ "${LOCAL_IMAGE}" == 0 ]]; then + docker pull "${IMAGE}" +fi + +docker run --rm -it ${GPU_OPTS} \ + --cap-add=SYS_PTRACE \ + --user "$(id -u)":"$(id -g)" \ + -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \ + -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \ + -v /etc/passwd:/etc/passwd:ro \ + -v /etc/group:/etc/group:ro \ + -v /etc/subuid:/etc/subuid:ro \ + -v /etc/subgid:/etc/subgid:ro \ + -v /etc/shadow:/etc/shadow:ro \ + -v /etc/gshadow:/etc/gshadow:ro \ + -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \ + -e "BUILD_TYPE=${BUILD_TYPE}" \ + -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \ + -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \ + -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \ + -w "${BUILD_PATH_IN_CONTAINER}" \ + "${IMAGE}" bash -c "${COMMAND}" + diff --git a/cmake/AppendOptionIfAvailable.cmake b/cmake/AppendOptionIfAvailable.cmake index 8df9f4a33..52dc12216 100644 --- a/cmake/AppendOptionIfAvailable.cmake +++ b/cmake/AppendOptionIfAvailable.cmake @@ -3,7 +3,7 @@ include(CheckCXXCompilerFlag) macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST) -set(_VAR "CXX_FLAG_${_FLAG}") +string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR) check_cxx_compiler_flag(${_FLAG} ${_VAR}) if (${${_VAR}}) diff --git a/cmake/CheckCXXCompilerFlag.cmake b/cmake/CheckCXXCompilerFlag.cmake deleted file mode 100644 index 87df0be8e..000000000 --- a/cmake/CheckCXXCompilerFlag.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying -# file Copyright.txt or https://cmake.org/licensing for details. - -#[=======================================================================[.rst: -CheckCXXCompilerFlag ------------------------- - -Check whether the CXX compiler supports a given flag. - -.. 
command:: check_cxx_compiler_flag - - :: - - check_cxx_compiler_flag( ) - - Check that the ```` is accepted by the compiler without - a diagnostic. Stores the result in an internal cache entry - named ````. - -This command temporarily sets the ``CMAKE_REQUIRED_DEFINITIONS`` variable -and calls the ``check_cxx_source_compiles`` macro from the -:module:`CheckCXXSourceCompiles` module. See documentation of that -module for a listing of variables that can otherwise modify the build. - -A positive result from this check indicates only that the compiler did not -issue a diagnostic message when given the flag. Whether the flag has any -effect or even a specific one is beyond the scope of this module. - -.. note:: - Since the :command:`try_compile` command forwards flags from variables - like :variable:`CMAKE_CXX_FLAGS _FLAGS>`, unknown flags - in such variables may cause a false negative for this check. -#]=======================================================================] - -include_guard(GLOBAL) -include(CheckCXXSourceCompiles) -include(CMakeCheckCompilerFlagCommonPatterns) - -macro (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) - set(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") - set(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") - - # Normalize locale during test compilation. 
- set(_CheckCXXCompilerFlag_LOCALE_VARS LC_ALL LC_MESSAGES LANG) - foreach(v ${_CheckCXXCompilerFlag_LOCALE_VARS}) - set(_CheckCXXCompilerFlag_SAVED_${v} "$ENV{${v}}") - set(ENV{${v}} C) - endforeach() - CHECK_COMPILER_FLAG_COMMON_PATTERNS(_CheckCXXCompilerFlag_COMMON_PATTERNS) - CHECK_CXX_SOURCE_COMPILES("int main() { return 0; }" "${_RESULT}" "CXX flag ${_FLAG}" - # Some compilers do not fail with a bad flag - FAIL_REGEX "command line option .* is valid for .* but not for C\\\\+\\\\+" # GNU - ${_CheckCXXCompilerFlag_COMMON_PATTERNS} - ) - foreach(v ${_CheckCXXCompilerFlag_LOCALE_VARS}) - set(ENV{${v}} ${_CheckCXXCompilerFlag_SAVED_${v}}) - unset(_CheckCXXCompilerFlag_SAVED_${v}) - endforeach() - unset(_CheckCXXCompilerFlag_LOCALE_VARS) - unset(_CheckCXXCompilerFlag_COMMON_PATTERNS) - - set (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") -endmacro () - diff --git a/cmake/CheckCXXSourceCompiles.cmake b/cmake/CheckCXXSourceCompiles.cmake deleted file mode 100644 index 38e915c27..000000000 --- a/cmake/CheckCXXSourceCompiles.cmake +++ /dev/null @@ -1,135 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying -# file Copyright.txt or https://cmake.org/licensing for details. - -#[=======================================================================[.rst: -CheckCXXSourceCompiles ----------------------- - -Check if given C++ source compiles and links into an executable. - -.. command:: check_cxx_source_compiles - - :: - - check_cxx_source_compiles(code resultVar [FAIL_REGEX regex1 [regex2...]]) - - Check that the source supplied in ``code`` can be compiled as a C++ source - file and linked as an executable (so it must contain at least a ``main()`` - function). The result will be stored in the internal cache variable specified - by ``resultVar``, with a boolean true value for success and boolean false for - failure. 
If ``FAIL_REGEX`` is provided, then failure is determined by - checking if anything in the output matches any of the specified regular - expressions. - - The underlying check is performed by the :command:`try_compile` command. The - compile and link commands can be influenced by setting any of the following - variables prior to calling ``check_cxx_source_compiles()``: - - ``CMAKE_REQUIRED_FLAGS`` - Additional flags to pass to the compiler. Note that the contents of - :variable:`CMAKE_CXX_FLAGS _FLAGS>` and its associated - configuration-specific variable are automatically added to the compiler - command before the contents of ``CMAKE_REQUIRED_FLAGS``. - - ``CMAKE_REQUIRED_DEFINITIONS`` - A :ref:`;-list ` of compiler definitions of the form - ``-DFOO`` or ``-DFOO=bar``. A definition for the name specified by - ``resultVar`` will also be added automatically. - - ``CMAKE_REQUIRED_INCLUDES`` - A :ref:`;-list ` of header search paths to pass to - the compiler. These will be the only header search paths used by - ``try_compile()``, i.e. the contents of the :prop_dir:`INCLUDE_DIRECTORIES` - directory property will be ignored. - - ``CMAKE_REQUIRED_LIBRARIES`` - A :ref:`;-list ` of libraries to add to the link - command. These can be the name of system libraries or they can be - :ref:`Imported Targets ` (see :command:`try_compile` for - further details). - - ``CMAKE_REQUIRED_QUIET`` - If this variable evaluates to a boolean true value, all status messages - associated with the check will be suppressed. - - The check is only performed once, with the result cached in the variable - named by ``resultVar``. Every subsequent CMake run will re-use this cached - value rather than performing the check again, even if the ``code`` changes. - In order to force the check to be re-evaluated, the variable named by - ``resultVar`` must be manually removed from the cache. 
- -#]=======================================================================] - -include_guard(GLOBAL) - -macro(CHECK_CXX_SOURCE_COMPILES SOURCE VAR NAME) - if(NOT DEFINED "${VAR}") - set(_FAIL_REGEX) - set(_key) - foreach(arg ${ARGN}) - if("${arg}" MATCHES "^(FAIL_REGEX)$") - set(_key "${arg}") - elseif(_key) - list(APPEND _${_key} "${arg}") - else() - message(FATAL_ERROR "Unknown argument:\n ${arg}\n") - endif() - endforeach() - - set(MACRO_CHECK_FUNCTION_DEFINITIONS - "${CMAKE_REQUIRED_FLAGS}") - if(CMAKE_REQUIRED_LIBRARIES) - set(CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES - LINK_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES}) - else() - set(CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES) - endif() - if(CMAKE_REQUIRED_INCLUDES) - set(CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES - "-DINCLUDE_DIRECTORIES:STRING=${CMAKE_REQUIRED_INCLUDES}") - else() - set(CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES) - endif() - file(WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx" - "${SOURCE}\n") - - if(NOT CMAKE_REQUIRED_QUIET) - message(STATUS "Testing ${NAME}") - endif() - try_compile(${VAR} - ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx - COMPILE_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS} - ${CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES} - CMAKE_FLAGS -DCOMPILE_DEFINITIONS:STRING=${MACRO_CHECK_FUNCTION_DEFINITIONS} - "${CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES}" - OUTPUT_VARIABLE OUTPUT) - - foreach(_regex ${_FAIL_REGEX}) - if("${OUTPUT}" MATCHES "${_regex}") - set(${VAR} 0) - endif() - endforeach() - - if(${VAR}) - set(${VAR} 1 CACHE INTERNAL "Test ${NAME}") - if(NOT CMAKE_REQUIRED_QUIET) - message(STATUS "Testing ${NAME} - Success") - endif() - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log - "Performing C++ SOURCE FILE Test ${NAME} succeeded with the following output:\n" - "${OUTPUT}\n" - "Source file was:\n${SOURCE}\n") - else() - if(NOT CMAKE_REQUIRED_QUIET) - message(STATUS "Testing ${NAME} - Failed") - endif() - set(${VAR} "" 
CACHE INTERNAL "Test ${NAME}") - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log - "Performing C++ SOURCE FILE Test ${NAME} failed with the following output:\n" - "${OUTPUT}\n" - "Source file was:\n${SOURCE}\n") - endif() - endif() -endmacro() - diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake new file mode 100644 index 000000000..5dceefdab --- /dev/null +++ b/cmake/DetectSupportedStandards.cmake @@ -0,0 +1,47 @@ +# Detect the language standards supported by the current compilers. +# +# Usage: detect_supported_standards(var_prefix lang standards...) +# +# - var_prefix: Used to name result variables, +# e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for +# each XX in ${standards}. +# - lang: The language to test: C, CXX, or CUDA. +# - standards: List of any standard versions. +# +# Example: detect_supported_standards(PROJ CXX 11 14 17) +# - Sets the following variables in the parent scope to TRUE or FALSE: +# - PROJ_CXX_11_SUPPORTED +# - PROJ_CXX_14_SUPPORTED +# - PROJ_CXX_17_SUPPORTED +# +function(detect_supported_standards prefix lang) + string(TOLOWER "${lang}_std" feature_prefix) + foreach(standard IN LISTS ARGN) + set(var_name "${prefix}_${lang}_${standard}_SUPPORTED") + if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES) + set(${var_name} TRUE) + else() + set(${var_name} FALSE) + endif() + + + if (standard EQUAL 17 AND + (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND + ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR + (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND + CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8))) + # Special cases: + # gcc < 7 and clang < 8 don't fully support C++17. + # They accept the flag and have partial support, but nvcc will refuse + # to enable it and falls back to the default dialect for the current + # CXX compiler version. This breaks our CI. 
+ # CMake's COMPILE_FEATURES var reports that these compilers support C++17, + # but we can't rely on it, so manually disable the dialect in these cases. + set(${var_name} FALSE) + endif() + + message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}") + set(${var_name} ${${var_name}} PARENT_SCOPE) + endforeach() +endfunction() diff --git a/cmake/PrintCTestRunTimes.cmake b/cmake/PrintCTestRunTimes.cmake new file mode 100644 index 000000000..bf23b9bb6 --- /dev/null +++ b/cmake/PrintCTestRunTimes.cmake @@ -0,0 +1,109 @@ +## This CMake script parses the output of ctest and prints a formatted list +## of individual test runtimes, sorted longest first. +## +## ctest > ctest_log +## cmake -DLOGFILE=ctest_log \ +## -P PrintCTestRunTimes.cmake +## +################################################################################ + +cmake_minimum_required(VERSION 3.15) + +# Prepend the string with "0" until the string length equals the specified width +function(pad_string_with_zeros string_var width) + set(local_string "${${string_var}}") + string(LENGTH "${local_string}" size) + while(size LESS width) + string(PREPEND local_string "0") + string(LENGTH "${local_string}" size) + endwhile() + set(${string_var} "${local_string}" PARENT_SCOPE) +endfunction() + +################################################################################ + +if (NOT LOGFILE) + message(FATAL_ERROR "Missing -DLOGFILE= argument.") +endif() + +# Check if logfile exists +if (NOT EXISTS "${LOGFILE}") + message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") +endif() + +string(JOIN "" regex + "^[ ]*[0-9]+/[0-9]+[ ]+Test[ ]+#" + "([0-9]+)" # Test ID + ":[ ]+" + "(.+)" # Test Name + "[ ]+\\.+[ ]+" + "(.+[^ ])" # Result + "[ ]+" + "([0-9]+)" # Seconds + "\\.[0-9]+[ ]+sec[ ]*$" +) + +message(DEBUG "Regex: ${regex}") + +# Read the logfile and generate a map / keylist +set(keys) +file(STRINGS "${LOGFILE}" lines) +foreach(line ${lines}) + + # Parse each build time + string(REGEX MATCH 
"${regex}" _DUMMY "${line}") + + if (CMAKE_MATCH_COUNT EQUAL 4) + set(test_id "${CMAKE_MATCH_1}") + set(test_name "${CMAKE_MATCH_2}") + set(test_result "${CMAKE_MATCH_3}") + set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds) + + # Compute human readable time + math(EXPR days "${tmp} / (60 * 60 * 24)") + math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)") + math(EXPR hours "${tmp} / (60 * 60)") + math(EXPR tmp "${tmp} - (${hours} * 60 * 60)") + math(EXPR minutes "${tmp} / (60)") + math(EXPR tmp "${tmp} - (${minutes} * 60)") + math(EXPR seconds "${tmp}") + + # Format time components + pad_string_with_zeros(days 3) + pad_string_with_zeros(hours 2) + pad_string_with_zeros(minutes 2) + pad_string_with_zeros(seconds 2) + + # Construct table entry + # Later values in the file for the same command overwrite earlier entries + string(MAKE_C_IDENTIFIER "${test_id}" key) + string(JOIN " | " ENTRY_${key} + "${days}d ${hours}h ${minutes}m ${seconds}s" + "${test_result}" + "${test_id}: ${test_name}" + ) + + # Record the key: + list(APPEND keys "${key}") + endif() +endforeach() + +list(REMOVE_DUPLICATES keys) + +# Build the entry list: +set(entries) +foreach(key ${keys}) + list(APPEND entries "${ENTRY_${key}}") +endforeach() + +if (NOT entries) + message(FATAL_ERROR "LOGFILE contained no test times ('${LOGFILE}').") +endif() + +# Sort in descending order: +list(SORT entries ORDER DESCENDING) + +# Dump table: +foreach(entry ${entries}) + message(STATUS ${entry}) +endforeach() diff --git a/cmake/PrintNinjaBuildTimes.cmake b/cmake/PrintNinjaBuildTimes.cmake new file mode 100644 index 000000000..65d243d35 --- /dev/null +++ b/cmake/PrintNinjaBuildTimes.cmake @@ -0,0 +1,101 @@ +## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of +## build/link times, sorted longest first. +## +## cmake -DLOGFILE=<.ninja_log file> \ +## -P PrintNinjaBuildTimes.cmake +## +## If LOGFILE is omitted, the current directory's .ninja_log file is used. 
+################################################################################ + +cmake_minimum_required(VERSION 3.15) + +# Prepend the string with "0" until the string length equals the specified width +function(pad_string_with_zeros string_var width) + set(local_string "${${string_var}}") + string(LENGTH "${local_string}" size) + while(size LESS width) + string(PREPEND local_string "0") + string(LENGTH "${local_string}" size) + endwhile() + set(${string_var} "${local_string}" PARENT_SCOPE) +endfunction() + +################################################################################ + +if (NOT LOGFILE) + set(LOGFILE ".ninja_log") +endif() + +# Check if logfile exists +if (NOT EXISTS "${LOGFILE}") + message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") +endif() + +# Read the logfile and generate a map / keylist +set(keys) +file(STRINGS "${LOGFILE}" lines) +foreach(line ${lines}) + + # Parse each build time + string(REGEX MATCH + "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}") + + if (CMAKE_MATCH_COUNT EQUAL 3) + set(start_ms ${CMAKE_MATCH_1}) + set(end_ms ${CMAKE_MATCH_2}) + set(command "${CMAKE_MATCH_3}") + math(EXPR runtime_ms "${end_ms} - ${start_ms}") + + # Compute human readable time + math(EXPR days "${runtime_ms} / (1000 * 60 * 60 * 24)") + math(EXPR runtime_ms "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)") + math(EXPR hours "${runtime_ms} / (1000 * 60 * 60)") + math(EXPR runtime_ms "${runtime_ms} - (${hours} * 1000 * 60 * 60)") + math(EXPR minutes "${runtime_ms} / (1000 * 60)") + math(EXPR runtime_ms "${runtime_ms} - (${minutes} * 1000 * 60)") + math(EXPR seconds "${runtime_ms} / 1000") + math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)") + + # Format time components + pad_string_with_zeros(days 3) + pad_string_with_zeros(hours 2) + pad_string_with_zeros(minutes 2) + pad_string_with_zeros(seconds 2) + pad_string_with_zeros(milliseconds 3) + + # Construct table entry + # Later values in the file for 
the same command overwrite earlier entries + string(MAKE_C_IDENTIFIER "${command}" key) + set(ENTRY_${key} + "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}" + ) + + # Record the key: + list(APPEND keys "${key}") + endif() +endforeach() + +list(REMOVE_DUPLICATES keys) + +# Build the entry list: +set(entries) +foreach(key ${keys}) + list(APPEND entries "${ENTRY_${key}}") +endforeach() + +if (NOT entries) + message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').") +endif() + +# Sort in descending order: +list(SORT entries) +list(REVERSE entries) + +# Dump table: +message(STATUS "-----------------------+----------------------------") +message(STATUS "Time | Command ") +message(STATUS "-----------------------+----------------------------") + +foreach(entry ${entries}) + message(STATUS ${entry}) +endforeach() diff --git a/cmake/ThrustAddSubdir.cmake b/cmake/ThrustAddSubdir.cmake new file mode 100644 index 000000000..d48aa1415 --- /dev/null +++ b/cmake/ThrustAddSubdir.cmake @@ -0,0 +1,6 @@ +find_package(Thrust REQUIRED CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CMAKE_CURRENT_LIST_DIR}/.." + COMPONENTS ${THRUST_REQUIRED_SYSTEMS} + OPTIONAL_COMPONENTS ${THRUST_OPTIONAL_SYSTEMS} +) diff --git a/cmake/ThrustBuildCompilerTargets.cmake b/cmake/ThrustBuildCompilerTargets.cmake new file mode 100644 index 000000000..aed0ec170 --- /dev/null +++ b/cmake/ThrustBuildCompilerTargets.cmake @@ -0,0 +1,191 @@ +# +# This file defines the `thrust_build_compiler_targets()` function, which +# creates the following interface targets: +# +# thrust.compiler_interface +# - Interface target providing compiler-specific options needed to build +# Thrust's tests, examples, etc. +# +# thrust.compiler_interface_cppXX +# - Interface targets providing compiler-specific options that should only be +# applied to certain dialects of C++. May not be defined for all dialects. 
+#
+# thrust.promote_cudafe_warnings
+# - Interface target that adds warning promotion for NVCC cudafe invocations.
+# - Only exists to work around github issue #1174 on tbb.cuda configurations.
+# - May be combined with thrust.compiler_interface when #1174 is fully resolved.
+#
+# thrust.silence_unreachable_code_warnings
+# - Interface target that silences unreachable code warnings.
+# - Used to selectively disable such warnings in unit tests caused by
+#   unconditionally thrown exceptions.
+
+function(thrust_build_compiler_targets)
+  set(cxx_compile_definitions)
+  set(cxx_compile_options)
+
+  thrust_update_system_found_flags()
+
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    append_option_if_available("/W4" cxx_compile_options)
+
+    # Treat all warnings as errors. This is only supported on Release builds,
+    # as `nv_exec_check_disable` doesn't seem to work with MSVC debug iterators
+    # and spurious warnings are emitted.
+    # See NVIDIA/thrust#1273, NVBug 3129879.
+    if (CMAKE_BUILD_TYPE STREQUAL "Release")
+      append_option_if_available("/WX" cxx_compile_options)
+    endif()
+
+    # Suppress overly-pedantic/unavoidable warnings brought in with /W4:
+    # C4324: structure was padded due to alignment specifier
+    append_option_if_available("/wd4324" cxx_compile_options)
+    # C4505: unreferenced local function has been removed
+    # The CUDA `host_runtime.h` header emits this for
+    # `__cudaUnregisterBinaryUtil`.
+    append_option_if_available("/wd4505" cxx_compile_options)
+    # C4706: assignment within conditional expression
+    # MSVC doesn't provide an opt-out for this warning when the assignment is
+    # intentional. Clang will warn for these, but suppresses the warning when
+    # double-parentheses are used around the assignment. We'll let Clang catch
+    # unintentional assignments and suppress all such warnings on MSVC.
+    append_option_if_available("/wd4706" cxx_compile_options)
+
+    # Disabled loss-of-data conversion warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4244" cxx_compile_options)
+
+    # Disable warning about applying unary operator- to unsigned type.
+    # TODO Re-enable.
+    append_option_if_available("/wd4146" cxx_compile_options)
+
+    # MSVC STL assumes that `allocator_traits`'s allocator will use raw pointers,
+    # and the `__DECLSPEC_ALLOCATOR` macro causes issues with thrust's universal
+    # allocators:
+    #   warning C4494: 'std::allocator_traits<_Alloc>::allocate' :
+    #      Ignoring __declspec(allocator) because the function return type is not
+    #      a pointer or reference
+    # See https://github.com/microsoft/STL/issues/696
+    append_option_if_available("/wd4494" cxx_compile_options)
+
+    # Some of the async tests require /bigobj to fit all their sections into the
+    # object files:
+    append_option_if_available("/bigobj" cxx_compile_options)
+
+    # "Oh right, this is Visual Studio."
+    list(APPEND cxx_compile_definitions "NOMINMAX")
+  else()
+    append_option_if_available("-Werror" cxx_compile_options)
+    append_option_if_available("-Wall" cxx_compile_options)
+    append_option_if_available("-Wextra" cxx_compile_options)
+    append_option_if_available("-Winit-self" cxx_compile_options)
+    append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
+    append_option_if_available("-Wcast-qual" cxx_compile_options)
+    append_option_if_available("-Wpointer-arith" cxx_compile_options)
+    append_option_if_available("-Wunused-local-typedef" cxx_compile_options)
+    append_option_if_available("-Wvla" cxx_compile_options)
+
+    # Disable GNU extensions (flag is clang only)
+    append_option_if_available("-Wgnu" cxx_compile_options)
+    # Calling a variadic macro with zero args is a GNU extension until C++20,
+    # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this
+    # is a real problem worth fixing.
+    append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options)
+
+    # This complains about functions in CUDA system headers when used with nvcc.
+    append_option_if_available("-Wno-unused-function" cxx_compile_options)
+  endif()
+
+  if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
+      # GCC 7.3 complains about name mangling changes due to `noexcept`
+      # becoming part of the type system; we don't care.
+      append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
+    endif()
+  endif()
+
+  if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # Disable warning that inlining is inhibited by compiler thresholds.
+    append_option_if_available("-diag-disable=11074" cxx_compile_options)
+    append_option_if_available("-diag-disable=11076" cxx_compile_options)
+  endif()
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    # Today:
+    # * NVCC accepts CUDA C++ in .cu files but not .cpp files.
+    # * NVC++ accepts CUDA C++ in .cpp files but not .cu files.
+    # TODO: This won't be necessary in the future.
+    list(APPEND cxx_compile_options -cppsuffix=cu)
+  endif()
+
+  add_library(thrust.compiler_interface INTERFACE)
+
+  foreach (cxx_option IN LISTS cxx_compile_options)
+    target_compile_options(thrust.compiler_interface INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVCXX>>:${cxx_option}>
+      # Only use -Xcompiler with NVCC, not NVC++.
+      #
+      # CMake can't split genexs, so this can't be formatted better :(
+      # This is:
+      # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${cxx_option}>
+    )
+  endforeach()
+
+  foreach (cxx_definition IN LISTS cxx_compile_definitions)
+    # Add these for both CUDA and CXX targets:
+    target_compile_definitions(thrust.compiler_interface INTERFACE
+      ${cxx_definition}
+    )
+  endforeach()
+
+  # Display warning numbers from nvcc cudafe errors:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--display_error_number>
+  )
+
+  # Tell NVCC to be quiet about deprecated GPU targets:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Wno-deprecated-gpu-targets>
+  )
+
+  # This is kept separate for Github issue #1174.
+  add_library(thrust.promote_cudafe_warnings INTERFACE)
+  target_compile_options(thrust.promote_cudafe_warnings INTERFACE
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--promote_warnings>
+  )
+
+  # Some of our unit tests unconditionally throw exceptions, and compilers will
+  # detect that the following instructions are unreachable. This is intentional
+  # and unavoidable in these cases. This target can be used to silence
+  # unreachable code warnings.
+  add_library(thrust.silence_unreachable_code_warnings INTERFACE)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    target_compile_options(thrust.silence_unreachable_code_warnings INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4702>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4702>
+    )
+  endif()
+
+  # These targets are used for dialect-specific options:
+  add_library(thrust.compiler_interface_cpp11 INTERFACE)
+  add_library(thrust.compiler_interface_cpp14 INTERFACE)
+
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # C4127: conditional expression is constant
+    # Disable this MSVC warning for C++11/C++14. In C++17, we can use
+    # THRUST_IF_CONSTEXPR to address these warnings.
+    target_compile_options(thrust.compiler_interface_cpp11 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+    target_compile_options(thrust.compiler_interface_cpp14 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+  endif()
+
+endfunction()
diff --git a/cmake/ThrustBuildTargetList.cmake b/cmake/ThrustBuildTargetList.cmake
new file mode 100644
index 000000000..f4adaf546
--- /dev/null
+++ b/cmake/ThrustBuildTargetList.cmake
@@ -0,0 +1,339 @@
+# This file provides utilities for building and working with thrust
+# configuration targets.
+#
+# THRUST_TARGETS
+#  - Built by calling the `thrust_build_target_list()` function.
+#  - Each item is the name of a thrust interface target that is configured for a
+#    certain combination of host/device/dialect.
+#
+# thrust_build_target_list()
+# - Creates the THRUST_TARGETS list.
+#
+# The following functions can be used to test/set metadata on a thrust target:
+#
+# thrust_get_target_property(<prop_var> <target_name> <prop>)
+# - Checks the ${prop} target property on thrust target ${target_name}
+#   and sets the ${prop_var} variable in the caller's scope.
+# - <prop_var> is any valid cmake identifier.
+# - <target_name> is the name of a thrust target.
+# - <prop> is one of the following:
+#   - HOST: The host system. Valid values: CPP, OMP, TBB.
+#   - DEVICE: The device system. Valid values: CUDA, CPP, OMP, TBB.
+#   - DIALECT: The C++ dialect. Valid values: 11, 14, 17, 20.
+#   - PREFIX: A unique prefix that should be used to name all
+#     targets/tests/examples that use this configuration.
+#
+# thrust_get_target_properties(<target_name>)
+# - Defines ${target_name}_${prop} in the caller's scope, for `prop` in:
+#   HOST, DEVICE, DIALECT, PREFIX. See above for details.
+#
+# thrust_clone_target_properties(<dst_target> <src_target>)
+# - Set the HOST, DEVICE, DIALECT, PREFIX metadata on ${dst_target} to match
+#   ${src_target}. See above for details.
+# - This *MUST* be called on any targets that link to another thrust target
+#   to ensure that dialect information is updated correctly, e.g.
+#   `thrust_clone_target_properties(${my_thrust_test} ${some_thrust_target})`
+
+define_property(TARGET PROPERTY _THRUST_HOST
+  BRIEF_DOCS "A target's host system: CPP, TBB, or OMP."
+  FULL_DOCS "A target's host system: CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DEVICE
+  BRIEF_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+  FULL_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DIALECT
+  BRIEF_DOCS "A target's C++ dialect: 11, 14, 17, or 20."
+  FULL_DOCS "A target's C++ dialect: 11, 14, 17, or 20."
+)
+define_property(TARGET PROPERTY _THRUST_PREFIX
+  BRIEF_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+  FULL_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+)
+
+function(thrust_set_target_properties target_name host device dialect prefix)
+  set_target_properties(${target_name}
+    PROPERTIES
+      _THRUST_HOST ${host}
+      _THRUST_DEVICE ${device}
+      _THRUST_DIALECT ${dialect}
+      _THRUST_PREFIX ${prefix}
+  )
+
+  get_property(langs GLOBAL PROPERTY ENABLED_LANGUAGES)
+  set(standard_features)
+  if (CUDA IN_LIST langs)
+    list(APPEND standard_features cuda_std_${dialect})
+  endif()
+  if (CXX IN_LIST langs)
+    list(APPEND standard_features cxx_std_${dialect})
+  endif()
+
+  get_target_property(type ${target_name} TYPE)
+  if (${type} STREQUAL "INTERFACE_LIBRARY")
+    target_compile_features(${target_name} INTERFACE
+      ${standard_features}
+    )
+  else()
+    target_compile_features(${target_name} PUBLIC
+      ${standard_features}
+    )
+    set_target_properties(${target_name}
+      PROPERTIES
+        CXX_STANDARD ${dialect}
+        CUDA_STANDARD ${dialect}
+        # Must manually request that the standards above are actually respected
+        # or else CMake will silently fail to configure the targets correctly...
+        # Note that this doesn't actually work as of CMake 3.16:
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/20953
+        # We'll leave these properties enabled in hopes that they will someday
+        # work.
+        CXX_STANDARD_REQUIRED ON
+        CUDA_STANDARD_REQUIRED ON
+        ARCHIVE_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        RUNTIME_OUTPUT_DIRECTORY "${THRUST_EXECUTABLE_OUTPUT_DIR}"
+    )
+
+    # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104
+    # is set to OLD. This suppresses the errors for good.
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      set_target_properties(${target_name}
+        PROPERTIES
+          CUDA_ARCHITECTURES OFF
+      )
+    endif()
+
+    if ("CUDA" STREQUAL "${device}" AND
+        "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+      set_target_properties(${target_name} PROPERTIES
+        CUDA_RESOLVE_DEVICE_SYMBOLS OFF
+      )
+    endif()
+  endif()
+endfunction()
+
+# Get a thrust property from a target and store it in var_name
+# thrust_get_target_property(<var_name> <target_name> [HOST|DEVICE|DIALECT|PREFIX])
+macro(thrust_get_target_property prop_var target_name prop)
+  get_property(${prop_var} TARGET ${target_name} PROPERTY _THRUST_${prop})
+endmacro()
+
+# Defines the following string variables in the caller's scope:
+# - ${target_name}_HOST
+# - ${target_name}_DEVICE
+# - ${target_name}_DIALECT
+# - ${target_name}_PREFIX
+macro(thrust_get_target_properties target_name)
+  thrust_get_target_property(${target_name}_HOST ${target_name} HOST)
+  thrust_get_target_property(${target_name}_DEVICE ${target_name} DEVICE)
+  thrust_get_target_property(${target_name}_DIALECT ${target_name} DIALECT)
+  thrust_get_target_property(${target_name}_PREFIX ${target_name} PREFIX)
+endmacro()
+
+# Set one target's THRUST_* properties to match another target
+function(thrust_clone_target_properties dst_target src_target)
+  thrust_get_target_properties(${src_target})
+  thrust_set_target_properties(${dst_target}
+    ${${src_target}_HOST}
+    ${${src_target}_DEVICE}
+    ${${src_target}_DIALECT}
+    ${${src_target}_PREFIX}
+  )
+endfunction()
+
+# Set ${var_name} to TRUE or FALSE in the caller's scope
+function(_thrust_is_config_valid var_name host device dialect)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_${host} AND
+      THRUST_MULTICONFIG_ENABLE_SYSTEM_${device} AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} AND
+      "${host}_${device}" IN_LIST THRUST_MULTICONFIG_WORKLOAD_${THRUST_MULTICONFIG_WORKLOAD}_CONFIGS)
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_init_target_list)
+  set(THRUST_TARGETS "" CACHE INTERNAL "" FORCE)
+endfunction()
+
+function(_thrust_add_target_to_target_list target_name host device dialect prefix)
+  thrust_set_target_properties(${target_name} ${host} ${device} ${dialect} ${prefix})
+
+  target_link_libraries(${target_name} INTERFACE
+    thrust.compiler_interface
+  )
+
+  # dialect-specific interface:
+  if (TARGET thrust.compiler_interface_cpp${dialect})
+    target_link_libraries(${target_name} INTERFACE
+      thrust.compiler_interface_cpp${dialect}
+    )
+  endif()
+
+  # Workaround Github issue #1174. cudafe promote TBB header warnings to
+  # errors, even when they're -isystem includes.
+  if ((NOT host STREQUAL "TBB") OR (NOT device STREQUAL "CUDA"))
+    target_link_libraries(${target_name} INTERFACE
+      thrust.promote_cudafe_warnings
+    )
+  endif()
+
+  set(THRUST_TARGETS ${THRUST_TARGETS} ${target_name} CACHE INTERNAL "" FORCE)
+
+  set(label "${host}.${device}.cpp${dialect}")
+  string(TOLOWER "${label}" label)
+  message(STATUS "Enabling Thrust configuration: ${label}")
+endfunction()
+
+function(_thrust_build_target_list_multiconfig)
+  # Detect supported dialects if requested -- this must happen after CUDA is
+  # enabled, if it's going to be enabled.
+  if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL OR
+      THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+    message(STATUS "Testing for supported language standards...")
+    include("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/DetectSupportedStandards.cmake")
+    detect_supported_standards(THRUST CXX ${THRUST_CPP_DIALECT_OPTIONS})
+    if (THRUST_CUDA_FOUND)
+      detect_supported_standards(THRUST CUDA ${THRUST_CPP_DIALECT_OPTIONS})
+    endif()
+
+    # Take the intersection of supported standards in CXX and (if found) CUDA:
+    set(supported_dialects)
+    set(latest_dialect 11)
+    foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      if ((THRUST_CXX_${standard}_SUPPORTED) AND
+          ((NOT THRUST_CUDA_FOUND) OR THRUST_CUDA_${standard}_SUPPORTED))
+
+        # MSVC silently promotes C++11 to C++14 -- skip it:
+        if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") AND (standard EQUAL 11))
+          continue()
+        endif()
+
+        list(APPEND supported_dialects ${standard})
+        if (latest_dialect LESS standard)
+          set(latest_dialect ${standard})
+        endif()
+      endif()
+    endforeach()
+
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard IN_LIST supported_dialects)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+            "Generate C++${standard} build configurations." FORCE
+          )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${standard} build configurations." FORCE
+          )
+        endif()
+      endforeach()
+    elseif(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard EQUAL latest_dialect)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+            "Generate C++${standard} build configurations." FORCE
+          )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${standard} build configurations." FORCE
+          )
+        endif()
+      endforeach()
+    endif()
+  endif()
+
+  # Supported versions of MSVC do not distinguish between C++11 and C++14.
+  # Warn the user that they may be generating a ton of redundant targets if
+  # they explicitly requested this configuration.
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11)
+    message(WARNING
+      "Supported versions of MSVC (2017+) do not distinguish between C++11 "
+      "and C++14. The requested C++11 targets may be redundant."
+    )
+  endif()
+
+  # Build THRUST_TARGETS
+  foreach(host IN LISTS THRUST_HOST_SYSTEM_OPTIONS)
+    foreach(device IN LISTS THRUST_DEVICE_SYSTEM_OPTIONS)
+      foreach(dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        _thrust_is_config_valid(config_valid ${host} ${device} ${dialect})
+        if (config_valid)
+          set(prefix "thrust.${host}.${device}.cpp${dialect}")
+          string(TOLOWER "${prefix}" prefix)
+
+          # Configure a thrust interface target for this host/device
+          set(target_name "${prefix}")
+          thrust_create_target(${target_name}
+            HOST ${host}
+            DEVICE ${device}
+            ${THRUST_TARGET_FLAGS}
+          )
+
+          # Set configuration metadata for this thrust interface target:
+          _thrust_add_target_to_target_list(${target_name}
+            ${host} ${device} ${dialect} ${prefix}
+          )
+
+          # Create a meta target for all targets in this configuration:
+          add_custom_target(${prefix}.all)
+          add_dependencies(thrust.all ${prefix}.all)
+        endif()
+      endforeach() # dialects
+    endforeach() # devices
+  endforeach() # hosts
+
+  list(LENGTH THRUST_TARGETS count)
+  message(STATUS "${count} unique thrust.host.device.dialect configurations generated")
+endfunction()
+
+function(_thrust_build_target_list_singleconfig)
+  set(host ${THRUST_HOST_SYSTEM})
+  set(device ${THRUST_DEVICE_SYSTEM})
+  set(dialect ${THRUST_CPP_DIALECT})
+  set(prefix "thrust") # single config
+
+  _thrust_add_target_to_target_list(thrust ${host} ${device} ${dialect} ${prefix})
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_build_target_list)
+  # Clear the list of targets:
+  _thrust_init_target_list()
+
+  # Generic config flags:
+  set(THRUST_TARGET_FLAGS)
+  macro(add_flag_option flag docstring default)
+    set(opt "THRUST_${flag}")
+    option(${opt} "${docstring}" "${default}")
+    mark_as_advanced(${opt})
+    if (${${opt}})
+      list(APPEND THRUST_TARGET_FLAGS ${flag})
+    endif()
+  endmacro()
+  add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF)
+  add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF)
+  add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF)
+  add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF)
+  add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." OFF)
+
+  # Top level meta-target. Makes it easier to just build thrust targets when
+  # building both CUB and Thrust. Add all project files here so IDEs will be
+  # aware of them. This will not generate build rules.
+  file(GLOB_RECURSE all_sources
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    "${Thrust_SOURCE_DIR}/thrust/*.h"
+    "${Thrust_SOURCE_DIR}/thrust/*.inl"
+  )
+  add_custom_target(thrust.all SOURCES ${all_sources})
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_build_target_list_multiconfig()
+  else()
+    _thrust_build_target_list_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustCompilerHacks.cmake b/cmake/ThrustCompilerHacks.cmake
new file mode 100644
index 000000000..5f7b0d98e
--- /dev/null
+++ b/cmake/ThrustCompilerHacks.cmake
@@ -0,0 +1,110 @@
+# Set up compiler paths and apply temporary hacks to support NVC++.
+# This file must be included before enabling any languages.
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # If using NVC++, don't set CXX compiler
+  if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
+    unset(CMAKE_CXX_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different ISO C++ compiler; NVC++ acts as both, so please"
+      " unset the CMAKE_CXX_COMPILER variable."
+    )
+  endif()
+
+  # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+  # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+  # understand.
+  if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different host ISO C++ compiler; NVC++ acts as both, so"
+      " please unset the CMAKE_CUDA_HOST_COMPILER variable."
+    )
+  endif()
+
+  set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -stdpar")
+  set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_LINK_EXECUTABLE
+    "<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <CMAKE_CUDA_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+
+  # Setup CMAKE_CXX_LIBRARY_ARCHITECTURE on Debian/Ubuntu so that find_package
+  # works properly.
+  if (EXISTS /etc/debian_version)
+    if (NOT CMAKE_CXX_LIBRARY_ARCHITECTURE)
+      file(GLOB files_in_lib RELATIVE /lib /lib/*-linux-gnu* )
+      foreach (file ${files_in_lib})
+        if ("${file}" MATCHES "${CMAKE_LIBRARY_ARCHITECTURE_REGEX}")
+          set(CMAKE_CXX_LIBRARY_ARCHITECTURE ${file})
+          break()
+        endif()
+      endforeach()
+    endif()
+    if (NOT CMAKE_LIBRARY_ARCHITECTURE)
+      set(CMAKE_LIBRARY_ARCHITECTURE ${CMAKE_CXX_LIBRARY_ARCHITECTURE})
+    endif()
+  endif()
+endif()
+
+# We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+# pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+# understand.
+if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}"))
+  if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
+    "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
+    set(tmp "${CMAKE_CUDA_HOST_COMPILER}")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR
+      "For convenience, Thrust's test harness uses CMAKE_CXX_COMPILER for the "
+      "CUDA host compiler. Refusing to overwrite specified "
+      "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this "
+      "variable. Currently:\n"
+      "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n"
+      "CMAKE_CUDA_HOST_COMPILER=${tmp}"
+    )
+  endif ()
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+endif ()
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # Need 3.17 for the properties used below.
+  cmake_minimum_required(VERSION 3.17)
+
+  set(CMAKE_CUDA_STANDARD_DEFAULT 03)
+
+  set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
+
+  include(Internal/FeatureTesting)
+  include(Compiler/CMakeCommonCompilerMacros)
+  cmake_record_cuda_compile_features()
+
+  set(CMAKE_CUDA_COMPILE_FEATURES
+    ${CMAKE_CUDA03_COMPILE_FEATURES}
+    ${CMAKE_CUDA11_COMPILE_FEATURES}
+    ${CMAKE_CUDA14_COMPILE_FEATURES}
+    ${CMAKE_CUDA17_COMPILE_FEATURES}
+    ${CMAKE_CUDA20_COMPILE_FEATURES}
+  )
+endif()
diff --git a/cmake/ThrustCudaConfig.cmake b/cmake/ThrustCudaConfig.cmake
new file mode 100644
index 000000000..a585c7910
--- /dev/null
+++ b/cmake/ThrustCudaConfig.cmake
@@ -0,0 +1,200 @@
+enable_language(CUDA)
+
+set(THRUST_KNOWN_COMPUTE_ARCHS 50 52 53 60 61 62 70 72 75 80 86)
+
+if ("NVIDIA" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 11.7)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 90)
+  endif()
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37)
+  endif()
+else()
+  list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37 90)
+endif()
+
+# Split CUDA_FLAGS into 3 parts:
+#
+# THRUST_CUDA_FLAGS_BASE: Common CUDA flags for all targets.
+# THRUST_CUDA_FLAGS_RDC: Additional CUDA flags for targets compiled with RDC.
+# THRUST_CUDA_FLAGS_NO_RDC: Additional CUDA flags for targets compiled without RDC.
+#
+# This is necessary because CUDA SMs 5.3, 6.2, and 7.2 do not support RDC, but
+# we want to always build some targets (e.g. testing/cuda/*) with RDC.
+# We work around this by building the "always RDC" targets without support for
+# those SMs. This requires two sets of CUDA_FLAGS.
+#
+# Enabling any of those SMs along with the ENABLE_RDC options will result in a
+# configuration error.
+#
+# Because of how CMake handles the CMAKE_CUDA_FLAGS variables, every target
+# generated in a given directory will use the same value for CMAKE_CUDA_FLAGS,
+# which is determined at the end of the directory's scope. This means caution
+# should be used when trying to build different targets with different flags,
+# since they might not behave as expected. This will improve with CMake 3.18,
+# which adds the DEVICE_LINK genex, fixing the issue with using per-target
+# CUDA_FLAGS: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+set(THRUST_CUDA_FLAGS_BASE "${CMAKE_CUDA_FLAGS}")
+set(THRUST_CUDA_FLAGS_RDC)
+set(THRUST_CUDA_FLAGS_NO_RDC)
+
+# Archs that don't support RDC:
+set(no_rdc_archs 53 62 72)
+
+# Find the highest arch (string sort; correct while every arch has two digits):
+list(SORT THRUST_KNOWN_COMPUTE_ARCHS)
+list(LENGTH THRUST_KNOWN_COMPUTE_ARCHS max_idx)
+math(EXPR max_idx "${max_idx} - 1")
+list(GET THRUST_KNOWN_COMPUTE_ARCHS ${max_idx} highest_arch)
+
+option(THRUST_AUTO_DETECT_COMPUTE_ARCHS
+  "If ON, compute architectures for all GPUs in the current system are enabled and all other compute architectures are disabled."
+  OFF
+)
+
+if (THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    message(STATUS "Thrust: Using NVC++ builtin automatic compute architecture detection.")
+  else()
+    set(detect_compute_archs_source ${Thrust_SOURCE_DIR}/cmake/detect_compute_archs.cu)
+    set(detect_compute_archs_exe ${PROJECT_BINARY_DIR}/detect_compute_archs)
+    set(detect_compute_archs_error_log ${PROJECT_BINARY_DIR}/detect_compute_archs.stderr.log)
+    execute_process(
+      COMMAND ${CMAKE_CUDA_COMPILER}
+        -std=c++11
+        -o ${detect_compute_archs_exe}
+        --run
+        ${detect_compute_archs_source}
+      OUTPUT_VARIABLE detected_archs
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_FILE ${detect_compute_archs_error_log})
+    if ("NONE" STREQUAL "${detected_archs}")
+      set(detected_message " none")
+    else()
+      foreach (arch IN LISTS detected_archs)
+        string(APPEND detected_message " sm_${arch}")
+      endforeach()
+    endif()
+    message(STATUS "Thrust: Automatically detected compute architectures:${detected_message}")
+  endif()
+endif()
+
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+option(THRUST_DISABLE_ARCH_BY_DEFAULT
+  "If ON, then all compute architectures are disabled on the initial CMake run."
+  ${option_init}
+)
+
+set(option_init ON)
+if (THRUST_DISABLE_ARCH_BY_DEFAULT OR THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  set(option_init OFF)
+endif()
+
+set(num_archs_enabled 0)
+foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
+  set(this_option_init ${option_init})
+
+  if (${arch} IN_LIST detected_archs)
+    set(this_option_init ON)
+  endif()
+
+  option(THRUST_ENABLE_COMPUTE_${arch}
+    "Enable code generation for tests for sm_${arch}"
+    ${this_option_init}
+  )
+
+  if (NOT THRUST_ENABLE_COMPUTE_${arch})
+    continue()
+  endif()
+
+  math(EXPR num_archs_enabled "${num_archs_enabled} + 1")
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    if (NOT ${num_archs_enabled} EQUAL 1)
+      message(FATAL_ERROR
+        "NVCXX does not support compilation for multiple device architectures "
+        "at once."
+      )
+    endif()
+    set(arch_flag "-gpu=cc${arch}")
+  elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set(arch_flag "--cuda-gpu-arch=sm_${arch}")
+  else()
+    set(arch_flag "-gencode arch=compute_${arch},code=sm_${arch}")
+  endif()
+
+  string(APPEND compute_message " sm_${arch}")
+  string(APPEND THRUST_CUDA_FLAGS_NO_RDC " ${arch_flag}")
+  if (NOT arch IN_LIST no_rdc_archs)
+    string(APPEND THRUST_CUDA_FLAGS_RDC " ${arch_flag}")
+  endif()
+endforeach()
+
+if (NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  option(THRUST_ENABLE_COMPUTE_FUTURE
+    "Enable code generation for tests for compute_${highest_arch}"
+    ${option_init}
+  )
+  if (THRUST_ENABLE_COMPUTE_FUTURE)
+    string(APPEND THRUST_CUDA_FLAGS_BASE
+      " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}"
+    )
+    string(APPEND compute_message " compute_${highest_arch}")
+  endif()
+endif()
+
+message(STATUS "Thrust: Explicitly enabled compute architectures:${compute_message}")
+
+# RDC is off by default in NVCC and on by default in NVC++. Turning off RDC
+# isn't currently supported by NVC++. So, we default to RDC off for NVCC and
+# RDC on for NVC++.
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+
+option(THRUST_ENABLE_TESTS_WITH_RDC
+  "Build all Thrust tests with RDC; tests that require RDC are not affected by this option."
+  ${option_init}
+)
+
+option(THRUST_ENABLE_EXAMPLES_WITH_RDC
+  "Build all Thrust examples with RDC; examples which require RDC are not affected by this option."
+  ${option_init}
+)
+
+# Check for RDC/SM compatibility and error/warn if necessary
+foreach (sm IN LISTS no_rdc_archs)
+  set(sm_opt THRUST_ENABLE_COMPUTE_${sm})
+  if (${sm_opt})
+    foreach (opt IN ITEMS TESTS EXAMPLES)
+      set(rdc_opt THRUST_ENABLE_${opt}_WITH_RDC)
+      if (${rdc_opt})
+        message(FATAL_ERROR
+          "${rdc_opt} is incompatible with ${sm_opt}, since sm_${sm} does not "
+          "support RDC."
+        )
+      endif()
+    endforeach()
+
+    message(NOTICE
+      "sm_${sm} does not support RDC. Targets that require RDC will be built "
+      "without support for this architecture."
+    )
+  endif()
+endforeach()
+
+
+#
+# Clang CUDA options
+#
+if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(THRUST_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions")
+endif()
+
+
+# By default RDC is not used:
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
diff --git a/cmake/ThrustFindThrust.cmake b/cmake/ThrustFindThrust.cmake
new file mode 100644
index 000000000..39a79e4b7
--- /dev/null
+++ b/cmake/ThrustFindThrust.cmake
@@ -0,0 +1,42 @@
+function(_thrust_find_thrust_multiconfig)
+  # Check which systems are enabled by multiconfig:
+  set(req_systems)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+    list(APPEND req_systems CUDA)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP)
+    list(APPEND req_systems CPP)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB)
+    list(APPEND req_systems TBB)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP)
+    list(APPEND req_systems OMP)
+  endif()
+
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+    COMPONENTS ${req_systems}
+  )
+endfunction()
+
+function(_thrust_find_thrust_singleconfig)
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+  )
+  # Create target now to prepare system found flags:
+  thrust_create_target(thrust FROM_OPTIONS ${THRUST_TARGET_FLAGS})
+  thrust_debug_target(thrust "${THRUST_VERSION}")
+endfunction()
+
+# Run find_package(Thrust) from the source tree, using the multi-config or
+# single-config variant depending on THRUST_ENABLE_MULTICONFIG.
+function(thrust_find_thrust)
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_find_thrust_multiconfig()
+  else()
+    _thrust_find_thrust_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustHeaderTesting.cmake b/cmake/ThrustHeaderTesting.cmake
new file mode 100644
index 000000000..3b3e00ca8
--- /dev/null
+++ b/cmake/ThrustHeaderTesting.cmake
@@ -0,0 +1,140 @@
+# For every public header, build a translation unit containing `#include <header>`
+# to let the compiler try to figure out warnings in that header if it is not otherwise
+# included in tests, and also to verify if the headers are modular enough.
+# .inl files are not globbed for, because they are not supposed to be used as public
+# entrypoints.
+
+# Meta target for all configs' header builds:
+add_custom_target(thrust.all.headers)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_systems ${config_host} ${config_device})
+
+  string(TOLOWER "${config_host}" host_lower)
+  string(TOLOWER "${config_device}" device_lower)
+
+  # GLOB ALL THE THINGS
+  set(headers_globs thrust/*.h)
+  set(headers_exclude_systems_globs thrust/system/*/*)
+  set(headers_systems_globs
+    thrust/system/${host_lower}/*
+    thrust/system/${device_lower}/*
+  )
+  set(headers_exclude_details_globs
+    thrust/detail/*
+    thrust/*/detail/*
+    thrust/*/*/detail/*
+  )
+
+  # Get all .h files...
+  file(GLOB_RECURSE headers
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_globs}
+  )
+
+  # ...then remove all system specific headers...
+  file(GLOB_RECURSE headers_exclude_systems
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_systems_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_systems})
+
+  # ...then add all headers specific to the selected host and device systems back again...
+  file(GLOB_RECURSE headers_systems
+    RELATIVE ${Thrust_SOURCE_DIR}/thrust
+    CONFIGURE_DEPENDS
+    ${headers_systems_globs}
+  )
+  list(APPEND headers ${headers_systems})
+
+  # ...and remove all the detail headers (also removing the detail headers from the selected systems).
+ file(GLOB_RECURSE headers_exclude_details + RELATIVE "${Thrust_SOURCE_DIR}/thrust" + CONFIGURE_DEPENDS + ${headers_exclude_details_globs} + ) + list(REMOVE_ITEM headers ${headers_exclude_details}) + + # List of headers that aren't implemented for all backends, but are implemented for CUDA. + set(partially_implemented_CUDA + async/copy.h + async/for_each.h + async/reduce.h + async/scan.h + async/sort.h + async/transform.h + event.h + future.h + ) + + # List of headers that aren't implemented for all backends, but are implemented for CPP. + set(partially_implemented_CPP + ) + + # List of headers that aren't implemented for all backends, but are implemented for TBB. + set(partially_implemented_TBB + ) + + # List of headers that aren't implemented for all backends, but are implemented for OMP. + set(partially_implemented_OMP + ) + + # List of all partially implemented headers. + set(partially_implemented + ${partially_implemented_CUDA} + ${partially_implemented_CPP} + ${partially_implemented_TBB} + ${partially_implemented_OMP} + ) + list(REMOVE_DUPLICATES partially_implemented) + + set(headertest_srcs) + + foreach (header IN LISTS headers) + if ("${header}" IN_LIST partially_implemented) + # This header is partially implemented on _some_ backends... + if (NOT "${header}" IN_LIST partially_implemented_${config_device}) + # ...but not on the selected one. 
+ continue() + endif() + endif() + + set(headertest_src_ext .cpp) + if ("CUDA" STREQUAL "${config_device}") + set(headertest_src_ext .cu) + endif() + + set(headertest_src "headers/${config_prefix}/${header}${headertest_src_ext}") + configure_file("${Thrust_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}") + + list(APPEND headertest_srcs "${headertest_src}") + endforeach() + + set(headertest_target ${config_prefix}.headers) + add_library(${headertest_target} OBJECT ${headertest_srcs}) + target_link_libraries(${headertest_target} PUBLIC ${thrust_target}) + # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: + target_compile_definitions(${headertest_target} PRIVATE + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + ) + thrust_clone_target_properties(${headertest_target} ${thrust_target}) + + # Disable macro checks on TBB; the TBB atomic implementation uses `I` and + # our checks will issue false errors. + if ("TBB" IN_LIST config_systems) + target_compile_definitions(${headertest_target} + PRIVATE THRUST_IGNORE_MACRO_CHECKS + ) + endif() + + thrust_fix_clang_nvcc_build_for(${headertest_target}) + + add_dependencies(thrust.all.headers ${headertest_target}) + add_dependencies(${config_prefix}.all ${headertest_target}) +endforeach() diff --git a/cmake/ThrustInstallRules.cmake b/cmake/ThrustInstallRules.cmake new file mode 100644 index 000000000..98e72e196 --- /dev/null +++ b/cmake/ThrustInstallRules.cmake @@ -0,0 +1,58 @@ +# Bring in CMAKE_INSTALL_LIBDIR +include(GNUInstallDirs) + +# Thrust is a header library; no need to build anything before installing: +set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) + +install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.inl" +) + +install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake/" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust" + PATTERN *.cmake.in EXCLUDE +) +# Need to configure a 
file to store the infix specified in +# CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user +set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/thrust") +configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in" + "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + @ONLY) +install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + DESTINATION "${install_location}") + +# Depending on how Thrust is configured, libcudacxx and CUB's CMake scripts may +# or may not be include()'d, so force include their install rules when requested. +# By default, these projects are installed alongside Thrust. This is controlled by +# THRUST_INSTALL_CUB_HEADERS and THRUST_INSTALL_LIBCUDACXX_HEADERS. +option(THRUST_INSTALL_CUB_HEADERS "Include CUB headers when installing." ON) +if (THRUST_INSTALL_CUB_HEADERS) + # Use a function to limit scope of the CUB_*_DIR vars: + function(_thrust_install_cub_headers) + # Fake these for the logic in CUBInstallRules.cmake: + set(CUB_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/cub/") + set(CUB_BINARY_DIR "${Thrust_BINARY_DIR}/cub-config/") + set(CUB_ENABLE_INSTALL_RULES ON) + set(CUB_IN_THRUST OFF) + include("${Thrust_SOURCE_DIR}/dependencies/cub/cmake/CubInstallRules.cmake") + endfunction() + + _thrust_install_cub_headers() +endif() + +option(THRUST_INSTALL_LIBCUDACXX_HEADERS "Include libcudacxx headers when installing." 
ON) +if (THRUST_INSTALL_LIBCUDACXX_HEADERS) + # Use a function to limit scope of the libcudacxx_*_DIR vars: + function(_thrust_install_libcudacxx_headers) + # Fake these for the logic in libcudacxxInstallRules.cmake: + set(libcudacxx_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/libcudacxx/") + set(libcudacxx_BINARY_DIR "${Thrust_BINARY_DIR}/libcudacxx-config/") + set(libcudacxx_ENABLE_INSTALL_RULES ON) + include("${Thrust_SOURCE_DIR}/dependencies/libcudacxx/cmake/libcudacxxInstallRules.cmake") + endfunction() + + _thrust_install_libcudacxx_headers() +endif() diff --git a/cmake/ThrustMultiConfig.cmake b/cmake/ThrustMultiConfig.cmake new file mode 100644 index 000000000..aa9fc0226 --- /dev/null +++ b/cmake/ThrustMultiConfig.cmake @@ -0,0 +1,129 @@ +# This file defines thrust_configure_multiconfig(), which sets up and handles +# the MultiConfig options that allow multiple host/device/dialect configurations +# to be generated from a single thrust build. + +function(thrust_configure_multiconfig) + option(THRUST_ENABLE_MULTICONFIG "Enable multiconfig options for coverage testing." OFF) + + # Dialects: + set(THRUST_CPP_DIALECT_OPTIONS + 11 14 17 20 + CACHE INTERNAL "C++ dialects supported by Thrust." FORCE + ) + + if (THRUST_ENABLE_MULTICONFIG) + # Handle dialect options: + foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS) + set(default_value OFF) + if (dialect EQUAL 14) # Default to just 14 on: + set(default_value ON) + endif() + option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} + "Generate C++${dialect} build configurations." + ${default_value} + ) + endforeach() + + # Option to enable all standards supported by the CUDA and CXX compilers: + option(THRUST_MULTICONFIG_ENABLE_DIALECT_ALL + "Generate build configurations for all C++ standards supported by the configured compilers." 
+ OFF + ) + + # Option to enable only the most recent supported dialect: + option(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST + "Generate a single build configuration for the most recent C++ standard supported by the configured compilers." + OFF + ) + + # Systems: + option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP "Generate build configurations that use CPP." ON) + option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA "Generate build configurations that use CUDA." ON) + option(THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP "Generate build configurations that use OpenMP." OFF) + option(THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB "Generate build configurations that use TBB." OFF) + + # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3: + if (THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17 AND + THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA) + cmake_minimum_required(VERSION 3.18.3) + endif() + + # Workload: + # - `SMALL`: [3 configs] Minimal coverage and validation of each device system against the `CPP` host. + # - `MEDIUM`: [6 configs] Cheap extended coverage. + # - `LARGE`: [8 configs] Expensive extended coverage. Include all useful build configurations. + # - `FULL`: [12 configs] The complete cross product of all possible build configurations. 
+ # + # Config | Workloads | Value | Expense | Note + # ---------|-----------|------------|-----------|----------------------------- + # CPP/CUDA | F L M S | Essential | Expensive | Validates CUDA against CPP + # CPP/OMP | F L M S | Essential | Cheap | Validates OMP against CPP + # CPP/TBB | F L M S | Essential | Cheap | Validates TBB against CPP + # CPP/CPP | F L M | Important | Cheap | Tests CPP as device + # OMP/OMP | F L M | Important | Cheap | Tests OMP as host + # TBB/TBB | F L M | Important | Cheap | Tests TBB as host + # TBB/CUDA | F L | Important | Expensive | Validates TBB/CUDA interop + # OMP/CUDA | F L | Important | Expensive | Validates OMP/CUDA interop + # TBB/OMP | F | Not useful | Cheap | Mixes CPU-parallel systems + # OMP/TBB | F | Not useful | Cheap | Mixes CPU-parallel systems + # TBB/CPP | F | Not Useful | Cheap | Parallel host, serial device + # OMP/CPP | F | Not Useful | Cheap | Parallel host, serial device + + set(THRUST_MULTICONFIG_WORKLOAD SMALL CACHE STRING + "Limit host/device configs: SMALL (up to 3 h/d combos per dialect), MEDIUM(6), LARGE(8), FULL(12)" + ) + set_property(CACHE THRUST_MULTICONFIG_WORKLOAD PROPERTY STRINGS + SMALL MEDIUM LARGE FULL + ) + set(THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS + CPP_OMP CPP_TBB CPP_CUDA + CACHE INTERNAL "Host/device combos enabled for SMALL workloads." FORCE + ) + set(THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS + ${THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS} + CPP_CPP TBB_TBB OMP_OMP + CACHE INTERNAL "Host/device combos enabled for MEDIUM workloads." FORCE + ) + set(THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS + ${THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS} + OMP_CUDA TBB_CUDA + CACHE INTERNAL "Host/device combos enabled for LARGE workloads." FORCE + ) + set(THRUST_MULTICONFIG_WORKLOAD_FULL_CONFIGS + ${THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS} + OMP_CPP TBB_CPP OMP_TBB TBB_OMP + CACHE INTERNAL "Host/device combos enabled for FULL workloads." 
FORCE + ) + + # Hide the single config options if they exist from a previous run: + if (DEFINED THRUST_HOST_SYSTEM) + set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE INTERNAL) + set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE INTERNAL) + endif() + if (DEFINED THRUST_CPP_DIALECT) + set_property(CACHE THRUST_CPP_DIALECT PROPERTY TYPE INTERNAL) + endif() + + else() # Single config: + # Restore system option visibility if these cache options already exist + # from a previous run. + if (DEFINED THRUST_HOST_SYSTEM) + set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE STRING) + set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING) + endif() + + set(THRUST_CPP_DIALECT 14 + CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}" + ) + set_property(CACHE THRUST_CPP_DIALECT + PROPERTY STRINGS + ${THRUST_CPP_DIALECT_OPTIONS} + ) + + # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3: + if (THRUST_CPP_DIALECT EQUAL 17 AND + THRUST_DEVICE_SYSTEM STREQUAL "CUDA") + cmake_minimum_required(VERSION 3.18.3) + endif() + endif() +endfunction() diff --git a/cmake/ThrustRunExample.cmake b/cmake/ThrustRunExample.cmake new file mode 100644 index 000000000..24e9dd2bb --- /dev/null +++ b/cmake/ThrustRunExample.cmake @@ -0,0 +1,49 @@ +# Inputs: +# +# Variable | Type | Doc +# ---------------------|----------|-------------------------------------- +# EXAMPLE_EXECUTABLE | FilePath | Path to example executable +# FILECHECK_ENABLED | Boolean | Run FileCheck comparison test +# FILECHECK_EXECUTABLE | FilePath | Path to the LLVM FileCheck utility +# REFERENCE_FILE | FilePath | Path to the FileCheck reference file + +if (FILECHECK_ENABLED) + if (NOT EXISTS "${REFERENCE_FILE}") + message(FATAL_ERROR + "FileCheck requested for '${EXAMPLE_EXECUTABLE}', but reference file " + "does not exist at '${REFERENCE_FILE}'." + ) + endif() + + # If the reference file is empty, validate that the example doesn't + # produce any output.
+ file(SIZE "${REFERENCE_FILE}" file_size) + message("${REFERENCE_FILE}: ${file_size} bytes") + + if (file_size EQUAL 0) + set(check_empty_output TRUE) + set(filecheck_command) + else() + set(check_empty_output FALSE) + set(filecheck_command COMMAND "${FILECHECK_EXECUTABLE}" "${REFERENCE_FILE}") + endif() +endif() + +execute_process( + COMMAND "${EXAMPLE_EXECUTABLE}" + ${filecheck_command} + RESULTS_VARIABLE exit_codes + OUTPUT_VARIABLE stdout + ERROR_VARIABLE stderr +) + +if (exit_codes MATCHES "[^0;]") + message(FATAL_ERROR "${EXAMPLE_EXECUTABLE} failed (${exit_codes}):\n${stderr}") +endif() + +if (check_empty_output) + string(LENGTH "${stdout}" stdout_size) + if (NOT stdout_size EQUAL 0) + message(FATAL_ERROR "${EXAMPLE_EXECUTABLE}: output received, but not expected:\n${stdout}") + endif() +endif() diff --git a/cmake/run_test.cmake b/cmake/ThrustRunTest.cmake similarity index 100% rename from cmake/run_test.cmake rename to cmake/ThrustRunTest.cmake diff --git a/cmake/ThrustUtilities.cmake b/cmake/ThrustUtilities.cmake new file mode 100644 index 000000000..6bbb1200a --- /dev/null +++ b/cmake/ThrustUtilities.cmake @@ -0,0 +1,25 @@ +# Given a cu_file (e.g. foo/bar.cu) relative to CMAKE_CURRENT_SOURCE_DIR +# and a thrust_target, create a cpp file that includes the .cu file, and set +# ${cpp_file_var} in the parent scope to the full path of the new file. The new +# file will be generated in: +# ${CMAKE_CURRENT_BINARY_DIR}/<prefix>/${cu_file}.cpp +function(thrust_wrap_cu_in_cpp cpp_file_var cu_file thrust_target) + thrust_get_target_property(prefix ${thrust_target} PREFIX) + set(wrapped_source_file "${CMAKE_CURRENT_SOURCE_DIR}/${cu_file}") + set(cpp_file "${CMAKE_CURRENT_BINARY_DIR}/${prefix}/${cu_file}.cpp") + configure_file("${Thrust_SOURCE_DIR}/cmake/wrap_source_file.cpp.in" "${cpp_file}") + set(${cpp_file_var} "${cpp_file}" PARENT_SCOPE) +endfunction() + +# Enable RDC for a CUDA target.
Encapsulates compiler hacks: +function(thrust_enable_rdc_for_cuda_target target_name) + if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + set_target_properties(${target_name} PROPERTIES + COMPILE_FLAGS "-gpu=rdc" + ) + else() + set_target_properties(${target_name} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) + endif() +endfunction() diff --git a/cmake/common_variables.cmake b/cmake/common_variables.cmake deleted file mode 100644 index 2ff72eb53..000000000 --- a/cmake/common_variables.cmake +++ /dev/null @@ -1 +0,0 @@ -set(THRUST_FILECHECK_DATA_PATH "${THRUST_SOURCE}/internal/test") diff --git a/cmake/detect_compute_archs.cu b/cmake/detect_compute_archs.cu new file mode 100644 index 000000000..1d30dca4b --- /dev/null +++ b/cmake/detect_compute_archs.cu @@ -0,0 +1,43 @@ +/* + * Copyright 2019-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdio> +#include <set> +#include <string> + +int main(int argc, char** argv) { + std::set<std::string> archs; + int devices; + if ((cudaGetDeviceCount(&devices) == cudaSuccess) && (devices > 0)) { + for (int dev = 0; dev < devices; ++dev) { + char buff[32]; + cudaDeviceProp prop; + if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue; + snprintf(buff, sizeof(buff), "%d%d", prop.major, prop.minor); + archs.insert(buff); + } + } + if (archs.empty()) { + printf("NONE"); + } else { + bool first = true; + for(const auto& arch : archs) { + printf(first ?
"%s" : ";%s", arch.c_str()); + first = false; + } + } + printf("\n"); +} diff --git a/cmake/filecheck_smoke_test b/cmake/filecheck_smoke_test new file mode 100644 index 000000000..aad1b0fd1 --- /dev/null +++ b/cmake/filecheck_smoke_test @@ -0,0 +1 @@ +SMOKE diff --git a/cmake/header_test.in b/cmake/header_test.in index 4c8ec00f5..250dd5170 100644 --- a/cmake/header_test.in +++ b/cmake/header_test.in @@ -1,3 +1,61 @@ +// This source file checks that: +// 1) Header compiles without error. +// 2) Common macro collisions with platform/system headers are avoided. + +// Turn off failures for certain configurations: #define THRUST_CPP11_REQUIRED_NO_ERROR +#define THRUST_CPP14_REQUIRED_NO_ERROR #define THRUST_MODERN_GCC_REQUIRED_NO_ERROR -#include + +#ifndef THRUST_IGNORE_MACRO_CHECKS + +// Define THRUST_MACRO_CHECK(macro, header), which emits a diagnostic indicating +// a potential macro collision and halts. +// +// Hacky way to build a string, but it works on all tested platforms. +#define THRUST_MACRO_CHECK(MACRO, HEADER) \ + THRUST_MACRO_CHECK_IMPL(Identifier MACRO should not be used from Thrust \ + headers due to conflicts with HEADER macros.) + +// Use raw platform checks instead of the THRUST_HOST_COMPILER macros since we +// don't want to #include any headers other than the one being tested. +// +// This is only implemented for MSVC/GCC/Clang. 
+#if defined(_MSC_VER) // MSVC + +// Fake up an error for MSVC +#define THRUST_MACRO_CHECK_IMPL(msg) \ + /* Print message that looks like an error: */ \ + __pragma(message(__FILE__ ":" THRUST_MACRO_CHECK_IMPL0(__LINE__) \ + ": error: " #msg)) \ + /* abort compilation due to static_assert or syntax error: */ \ + static_assert(false, #msg); +#define THRUST_MACRO_CHECK_IMPL0(x) THRUST_MACRO_CHECK_IMPL1(x) +#define THRUST_MACRO_CHECK_IMPL1(x) #x + +#elif defined(__clang__) || defined(__GNUC__) + +// GCC/clang are easy: +#define THRUST_MACRO_CHECK_IMPL(msg) THRUST_MACRO_CHECK_IMPL0(GCC error #msg) +#define THRUST_MACRO_CHECK_IMPL0(expr) _Pragma(#expr) + +#endif + +// complex.h conflicts +#define I THRUST_MACRO_CHECK('I', complex.h) + +// windows.h conflicts +#define small THRUST_MACRO_CHECK('small', windows.h) +// We can't enable these checks without breaking some builds -- some standard +// library implementations unconditionally `#undef` these macros, which then +// causes random failures later. +// Leaving these commented out as a warning: Here be dragons. +//#define min(...) THRUST_MACRO_CHECK('min', windows.h) +//#define max(...) 
THRUST_MACRO_CHECK('max', windows.h) + +// termios.h conflicts (NVIDIA/thrust#1547) +#define B0 THRUST_MACRO_CHECK("B0", termios.h) + +#endif // THRUST_IGNORE_MACRO_CHECKS + +#include <thrust/${header}> diff --git a/cmake/run_example.cmake b/cmake/run_example.cmake deleted file mode 100644 index d51152d1e..000000000 --- a/cmake/run_example.cmake +++ /dev/null @@ -1,34 +0,0 @@ -include("${THRUST_SOURCE}/cmake/common_variables.cmake") - -if (THRUST_FILECHECK_ENABLED) - set(DATA_FILE "${THRUST_FILECHECK_DATA_PATH}/${THRUST_EXAMPLE}.filecheck") - file(READ "${DATA_FILE}" CONTENTS) - string(LENGTH "${CONTENTS}" LENGTH) - message(${LENGTH}) - - if (NOT ${LENGTH} EQUAL 0) - set(FILECHECK_COMMAND - COMMAND "${THRUST_FILECHECK}" "${THRUST_FILECHECK_DATA_PATH}/${THRUST_EXAMPLE}.filecheck") - else () - set(CHECK_EMPTY_OUTPUT TRUE) - endif () -endif () - -execute_process( - COMMAND "${THRUST_BINARY}" - ${FILECHECK_COMMAND} - RESULT_VARIABLE EXIT_CODE - OUTPUT_VARIABLE STDOUT - ERROR_VARIABLE STDERR -) - -if (NOT "0" STREQUAL "${EXIT_CODE}") - message(FATAL_ERROR "${THRUST_BINARY} failed (${EXIT_CODE}):\n${STDERR}") -endif () - -if (CHECK_EMPTY_OUTPUT) - string(LENGTH "${OUTPUT_VARIABLE}" LENGTH) - if (NOT ${LENGTH} EQUAL 0) - message(FATAL_ERROR "${THRUST_BINARY}: output received, but not expected.") - endif () -endif () diff --git a/cmake/sanity b/cmake/sanity deleted file mode 100644 index f9db80b7f..000000000 --- a/cmake/sanity +++ /dev/null @@ -1 +0,0 @@ -SANITY diff --git a/cmake/wrap_source_file.cpp.in b/cmake/wrap_source_file.cpp.in new file mode 100644 index 000000000..3015238cc --- /dev/null +++ b/cmake/wrap_source_file.cpp.in @@ -0,0 +1 @@ +#include <${wrapped_source_file}> diff --git a/dependencies/cub b/dependencies/cub new file mode 160000 index 000000000..b2e8bccb8 --- /dev/null +++ b/dependencies/cub @@ -0,0 +1 @@ +Subproject commit b2e8bccb8c0cd15279974fe4b9b8d6fcd1842b57 diff --git a/dependencies/libcudacxx b/dependencies/libcudacxx new file mode 160000 index
000000000..55dd2c993 --- /dev/null +++ b/dependencies/libcudacxx @@ -0,0 +1 @@ +Subproject commit 55dd2c99346baa3a14949a0f7e9c41865e434eda diff --git a/doc/branching.md b/doc/branching.md deleted file mode 100644 index 947ab1062..000000000 --- a/doc/branching.md +++ /dev/null @@ -1,125 +0,0 @@ -# Thrust Branching and Development Model - -The following is a description of how the Thrust development team approaches branching and release tagging. This -is a living document that will evolve as our process evolves. - -## Thrust Version - -Thrust has historically had its own versioning system, independent of the versioning scheme of the CUDA Toolkit. -Today, Thrust is released with the CUDA Toolkit, but we currently still maintain the double versioning scheme. - -The following is a mapping from Thrust versions to CUDA Toolkit versions and vice versa. Note that some Thrust -versions don't directly map to any CUDA Toolkit version. - -| Thrust version | CUDA version | -| ----------------- | ------------- | -| 1.9.5 | 10.1 Update 1 | -| 1.9.4 | 10.1 | -| 1.9.3 | 10.0 | -| 1.9.2 | 9.2 | -| 1.9.1 | 9.1 | -| 1.9.0 | 9.0 | -| 1.8.3 | 8.0 | -| 1.8.2 | 7.5 | -| 1.8.1 | 7.0 | -| 1.8.0 | *N/A* | -| 1.7.2 | 6.5 | -| 1.7.1 | 6.0 | -| 1.7.0 | 5.5 | -| 1.6.0 | *N/A* | -| 1.5.3 | 5.0 | -| 1.5.2 | 4.2 | -| 1.5.1 | 4.1 | -| 1.5.0 | *N/A* | -| 1.4.0 | 4.0 | -| 1.3.0 | 3.2 | -| 1.2.1 | 3.1 | -| 1.2.0 | *N/A* | -| 1.1.1 | *N/A* | -| 1.1.0 | *N/A* | -| 1.0.0 | *N/A* | - -## Repositories - -As Thrust is developed both on GitHub and internally at NVIDIA, there are three main places where code lives: - - * The [public Thrust repository](https://github.com/thrust/thrust), referred to as `github` later in this - document. - * An internal GitLab repository, referred to as `gitlab` later in this document. - * An internal Perforce repository, referred to as `perforce` later in this document.
- -## Branches and Tags - -The following tag names are used in the Thrust project: - - * `github/cuda-X.Y`: the tag that directly corresponds to what has been shipped in the CUDA Toolkit release X.Y. - * `github/A.B.C`: the tag that directly corresponds to a Thrust version A.B.C. - -The following branch names are used in the Thrust project: - - * `github/master`: the Source of Truth development branch of Thrust. - * `github/old-master`: the old Source of Truth branch, before unification of public and internal repositories. - * `perforce/private`: mirrored github/master, plus files necessary for internal NVIDIA testing systems. - * `gitlab/staging/cuda-X.Y`: the branch for a CUDA Toolkit release that has not been released yet. cuda-X.Y should - be tagged on this branch after the final commit freeze (see "Release branches" below). - * `github/maintenance/cuda-Z.W`: the continuation of gitlab/staging/cuda-Z.W, but after release of CUDA Z.W, plus - post-release fixes if any are needed (see "Old release branches" below). - * `gitlab/feature/<name>`: feature branch for internally developed features. - * `gitlab/bug/<bug-system>-<bug-id>`: bug fix branch, where `bug-system` is `github` or `nvbug`. Permits a description - after `bug-id`. - * `gitlab/master`: same as `github/master`, but not yet published, during a freezing period (see "Feature freeze" - below). - -## Development Process Described - -### Normal development - -During regular parts of the development cycle, when we develop features on feature branches, and fix bugs on the -main branch, we can: - - * Merge internal fixes to `github/master` and to `perforce/private`. - * Merge GitHub contributions to `github/master` and to `perforce/private`.
- -### Feature freeze - -In case where we have a new feature for a CUDA Toolkit release: just before the CUDA Toolkit feature freeze for a -new release branch, we should stop merging commits (including public contributions) to `github/master`, and move to -development on `gitlab/master`, and merge the not yet public features there. - -In those cases, we should wait until the new version of the toolkit is released before we push the new updated -`gitlab/master` to `github/master`, roughly at the same time as we push from `gitlab/staging/cuda-X.Y` to -`github/maintenance/cuda-X.Y` and tag `cuda-X.Y`, and the appropriate Thrust version tag. - -If we don't have big, not-public-before-release features landing in X.Y, however, we can avoid having a feature -freeze period. - -The reason for having a freeze period at all is: `github/master` is supposed to be the Source of Truth. We want the -history to follow the same order of commits in both Git and Perforce, and once a change is merged, we cannot rebase -things that went into `perforce/internal` on top of it. Therefore: since we only really commit to Perforce but not -`github/master` when we have a feature that is ready to be delivered, but is only a part of a new release and -shouldn't/can't be public yet, we have to make sure that after it is merged to `gitlab/master` (and to `perforce/internal`), -nothing new lands in `github/master` before we push the feature out. - -To avoid situations like this with bug fixes, when we fix a bug at a not crazy point in the release cycle, we -should develop it on git, merge/push it on Github, and then pull the new commit to Perforce. - -### Release branches - -These are the internal Git branches that map directly to internal CUDA release branches. These branches are primarily -developed in Git, and commits applied to them are then pushed to Perforce. - -After a CUDA Toolkit version is released, these transition to being old release branches. 
- -### Old release branches - -These branches represent a version that has landed in a CUDA Toolkit version, but with bugfixes for things that do -deserve being fixed on a release branch. These shouldn't be groundbreaking; the following are an acceptable set of -fixes to go into these branches, because they can remove annoyances, but shouldn't change behavior: - - * Documentation fixes and updates. - * Thrust build system changes. - * Additional examples, fixes to examples and tests. - * (Possibly:) Fixing missing headers. This one is slightly less obvious, because it makes it possible for users - of standalone Thrust to write programs that won't compile with CUDA Thrust. Determinations will be made on a - case by case basis. - diff --git a/doc/changelog.md b/doc/changelog.md deleted file mode 100644 index 98923388a..000000000 --- a/doc/changelog.md +++ /dev/null @@ -1,1192 +0,0 @@ -# Thrust v1.9.5 (CUDA 10.1 Update 1) - -## Summary - -Thrust 1.9.5 is a minor release accompanying the CUDA 10.1 Update 1 release. - -## Bug Fixes - -- NVBug 2502854: Fixed assignment of - `thrust::device_vector>` between host and device. - -# Thrust 1.9.4 (CUDA 10.1) - -## Summary - -Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new - allocator system including caching allocators and unified memory support, as - well as a variety of other enhancements, mostly related to - C++11/C++14/C++17/C++20 support. -The new asynchronous algorithms in the `thrust::async` namespace return - `thrust::event` or `thrust::future` objects, which can be waited upon to - synchronize with the completion of the parallel operation. - -## Breaking Changes - -Synchronous Thrust algorithms now block until all of their operations have - completed. -Use the new asynchronous Thrust algorithms for non-blocking behavior. 
- -## New Features - -- `thrust::event` and `thrust::future`, uniquely-owned asynchronous handles - consisting of a state (ready or not ready), content (some value; for - `thrust::future` only), and an optional set of objects that should be - destroyed only when the future's value is ready and has been consumed. - - The design is loosely based on C++11's `std::future`. - - They can be `.wait`'d on, and the value of a future can be waited on and - retrieved with `.get` or `.extract`. - - Multiple `thrust::event`s and `thrust::future`s can be combined with - `thrust::when_all`. - - `thrust::future`s can be converted to `thrust::event`s. - - Currently, these primitives are only implemented for the CUDA backend and - are C++11 only. -- New asynchronous algorithms that return `thrust::event`/`thrust::future`s, - implemented as C++20 range style customization points: - - `thrust::async::reduce`. - - `thrust::async::reduce_into`, which takes a target location to store the - reduction result into. - - `thrust::async::copy`, including a two-policy overload that allows - explicit cross system copies which execution policy properties can be - attached to. - - `thrust::async::transform`. - - `thrust::async::for_each`. - - `thrust::async::stable_sort`. - - `thrust::async::sort`. - - By default the asynchronous algorithms use the new caching allocators. - Deallocation of temporary storage is deferred until the destruction of - the returned `thrust::future`. The content of `thrust::future`s is - stored in either device or universal memory and transferred to the host - only upon request to prevent unnecessary data migration. - - Asynchronous algorithms are currently only implemented for the CUDA - system and are C++11 only. -- `exec.after(f, g, ...)`, a new execution policy method that takes a set of - `thrust::event`/`thrust::future`s and returns an execution policy that - operations on that execution policy should depend upon. 
-- New logic and mindset for the type requirements for cross-system sequence - copies (currently only used by `thrust::async::copy`), based on: - - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR` - for detecting/indicating that an iterator points to contiguous storage. - - `thrust::is_trivially_relocatable` and - `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a - type is `memcpy`able (based on principles from - [P1144](https://wg21.link/P1144)). - - The new approach reduces buffering, increases performance, and increases - correctness. - - The fast path is now enabled when copying CUDA `__half` and vector types with - `thrust::async::copy`. -- All Thrust synchronous algorithms for the CUDA backend now actually - synchronize. Previously, any algorithm that did not allocate temporary - storage (counterexample: `thrust::sort`) and did not have a - computation-dependent result (counterexample: `thrust::reduce`) would - actually be launched asynchronously. Additionally, synchronous algorithms - that allocated temporary storage would become asynchronous if a custom - allocator was supplied that did not synchronize on allocation/deallocation, - unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`, - `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some - cases this may be a performance regression; if you need asynchrony, use the - new asynchronous algorithms. -- Thrust's allocator framework has been rewritten. It now uses a memory - resource system, similar to C++17's `std::pmr` but supporting static - polymorphism. Memory resources are objects that allocate untyped storage and - allocators are cheap handles to memory resources in this new model. The new - facilities live in ``. - - `thrust::mr::memory_resource`, the memory resource base class, - which takes a (possibly tagged) pointer to `void` type as a parameter. - - `thrust::mr::allocator`, an allocator backed by a memory - resource object. 
- - `thrust::mr::polymorphic_adaptor_resource`, a type-erased memory - resource adaptor. - - `thrust::mr::polymorphic_allocator`, a C++17-style polymorphic allocator - backed by a type-erased memory resource object. - - New tunable C++17-style caching memory resources, - `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to - cache both small object allocations and large repetitive temporary - allocations. The disjoint variants use separate storage for management of - the pool, which is necessary if the memory being allocated cannot be - accessed on the host (e.g. device memory). - - System-specific allocators were rewritten to use the new memory resource - framework. - - New `thrust::device_memory_resource` for allocating device memory. - - New `thrust::universal_memory_resource` for allocating memory that can be - accessed from both the host and device (e.g. `cudaMallocManaged`). - - New `thrust::universal_host_pinned_memory_resource` for allocating memory - that can be accessed from the host and the device but always resides in - host memory (e.g. `cudaMallocHost`). - - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which - lazily create and retrieve a per-device singleton memory resource. - - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for - `thrust::allocator_traits`. - - `thrust::device_make_unique`, a factory function for creating a - `std::unique_ptr` to a newly allocated object in device memory. - - ``, a C++11 implementation of the C++17 - uninitialized memory algorithms. - - `thrust::allocate_unique` and friends, based on the proposed C++23 - [`std::allocate_unique`](https://wg21.link/P0211). -- New type traits and metaprogramming facilities. Type traits are slowly being - migrated out of `thrust::detail::` and ``; their new home - will be `thrust::` and ``. - - `thrust::is_execution_policy`. 
- - `thrust::is_operator_less_or_greater_function_object`, which detects - `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`. - - `thrust::is_operator_plus_function_object`, which detects `thrust::plus` - and `std::plus`. - - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's - `std::remove_cvref(_t)?`. - - `thrust::void_t`, and various other new type traits. - - `thrust::integer_sequence` and friends, a C++11 implementation of C++14's - `std::integer_sequence`. - - `thrust::conjunction`, `thrust::disjunction`, and `thrust::negation`, a - C++11 implementation of C++17's logical metafunctions. - - Some Thrust type traits (such as `thrust::is_constructible`) have been - redefined in terms of C++11's type traits when they are available. -- `<thrust/detail/tuple_algorithms.h>`, new `std::tuple` algorithms: - - `thrust::tuple_transform`. - - `thrust::tuple_for_each`. - - `thrust::tuple_subset`. -- Miscellaneous new `std::`-like facilities: - - `thrust::optional`, a C++11 implementation of C++17's `std::optional`. - - `thrust::addressof`, an implementation of C++11's `std::addressof`. - - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next` - and `std::prev`. - - `thrust::square`, a `<functional>` style unary function object that - multiplies its argument by itself. - - `<thrust/limits.h>` and `thrust::numeric_limits`, a customized version of - `<limits>` and `std::numeric_limits`. -- `<thrust/detail/preprocessor.h>`, new general purpose preprocessor facilities: - - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens. - - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion. - - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading. - - `THRUST_PP_BOOL`, boolean conversion. - - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement. - - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument. - - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after - the first. - - `THRUST_PP_IIF`, bitwise conditional.
- - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and - detecting comma tokens. - - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary - `__VA_ARGS__`. - - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function. -- New C++11 compatibility macros: - - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best - equivalent otherwise. - - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best - equivalent otherwise. - - `THRUST_OVERRIDE`, expands to `override` when available and the best - equivalent otherwise. - - `THRUST_DEFAULT`, expands to `= default;` when available and the best - equivalent otherwise. - - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best - equivalent otherwise. - - `THRUST_FINAL`, expands to `final` when available and the best equivalent - otherwise. - - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and - the best equivalent otherwise. -- ``, new C++11-only type deduction helpers: - - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable - conditional `noexcept` qualifiers and trailing return types. - - `THRUST_FWD(x)`, expands to `::std::forward(x)`. - - `THRUST_MVCAP`, expands to a lambda move capture. - - `THRUST_RETOF`, expands to a decltype computing the return type of an - invocable. -- New CMake build system. - -## New Examples - -- `mr_basic` demonstrates how to use the new memory resource allocator system. - -## Other Enhancements - -- Tagged pointer enhancements: - - New `thrust::pointer_traits` specialization for `void const*`. - - `nullptr` support to Thrust tagged pointers. - - New `explicit operator bool` for Thrust tagged pointers when using C++11 - for `std::unique_ptr` interoperability. - - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast` - for casting Thrust tagged pointers. 
-- Iterator enhancements: - - `thrust::iterator_system` is now SFINAE friendly. - - Removed cv qualifiers from iterator types when using - `thrust::iterator_system`. -- Static assert enhancements: - - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be - used as the error message when possible. - - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when - it's available. - - Introduce a way to test for static assertions. -- Testing enhancements: - - Additional scalar and sequence types, including non-builtin types and - vectors with unified memory allocators, have been added to the list of - types used by generic unit tests. - - The generation of random input data has been improved to increase the range - of values used and catch more corner cases. - - New `unittest::truncate_to_max_representable` utility for avoiding the - generation of ranges that cannot be represented by the underlying element - type in generic unit test code. - - The test driver now synchronizes with CUDA devices and checks for errors - after each test, when switching devices, and after each raw kernel launch. - - The `warningtester` uber header is now compiled with NVCC to avoid needing - to disable CUDA-specific code with the preprocessor. - - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s. - - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro. - - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro. - - `thrust::system_error` in the CUDA backend now prints out its `cudaError_t` - enumerator in addition to the diagnostic message. - - Stopped using conditionally signed types like `char`. - -## Bug Fixes - -- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas - with `thrust::reduce` on MSVC. -- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill` - isn't operating on const iterators.
-- #919 Fix compilation failure with `thrust::zip_iterator` and - `thrust::complex`. -- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's - `thrust::reduce` to use two functions (one with the pragma for disabling - exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes - a regression with device compilation that started in CUDA 9.2. -- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a - `thrust::complex::operator=` to satisfy GoUDA. -- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element - type being default constructible. -- NVBug 2289115: Remove flaky `simple_cuda_streams` example. -- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an - allocator parameter. -- NVBug 2455740: Update the `range_view` example to not use device-side launch. -- NVBug 2455943: Ensure that sized unit tests that use - `thrust::counting_iterator` perform proper truncation. -- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests. - -# Thrust 1.9.3 (CUDA 10.0) - -## Summary - -Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust. - -## Bug Fixes - -- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix - `thrust::device_reference` swapping. -- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and - refactor temporary memory allocation in the CUDA backend to be exception - and leak safe. -- #886, #894, #914: Various documentation typo fixes. -- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC. -- #878: Optimize `thrust::min/max_element` to only use - `thrust::detail::get_iterator_value` for non-numeric types. -- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison - operators `const`. -- NVBug 2092152: Remove all includes of ``. -- #911: Fix default comparator element type for `thrust::merge_by_key`. 
- -## Acknowledgments - -- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces. -- Thanks to Francisco Facioni for contributing optimizations for - `thrust::min/max_element`. - -# Thrust 1.9.2 (CUDA 9.2) - -## Summary - -Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test - improvements. -CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on - small data types and `thrust::reduce`. -Changes were applied to `complex` to optimize memory access. -Thrust now compiles with compiler warnings enabled and treated as errors. -Additionally, the unit test suite and framework was enhanced to increase - coverage. - -## Breaking Changes - -- The `fallback_allocator` example was removed, as it was buggy and difficult - to support. - -## New Features - -- ``, utilities for memory alignment: - - `thrust::aligned_reinterpret_cast`. - - `thrust::aligned_storage_size`, which computes the amount of storage needed - for an object of a particular size and alignment. - - `thrust::alignment_of`, a C++03 implementation of C++11's - `std::alignment_of`. - - `thrust::aligned_storage`, a C++03 implementation of C++11's - `std::aligned_storage`. - - `thrust::max_align_t`, a C++03 implementation of C++11's - `std::max_align_t`. - -## Bug Fixes -- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug - 2058778: Various compiler warning issues. -- NVBug 200355591: `thrust::reduce` performance issues. -- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be - overlooked but `deallocate` to be called with GCC <= 4.3. -- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`. - -# Thrust 1.9.1 (CUDA 9.1) - -## Summary - -Thrust 1.9.1 integrates version 1.7.4 of CUB and introduces a new CUDA backend -for `thrust::reduce` based on CUB. - -## Bug Fixes - -- NVBug 1965743: Remove unnecessary static qualifiers. 
-- NVBug 1940974: Fix regression causing a compilation error when using - `thrust::merge_by_key` with `thrust::constant_iterator`s. -- NVBug 1904217: Allow callables that take non-const refs to be used with - `thrust::reduce` and `thrust::*_scan`. - -# Thrust 1.9.0 (CUDA 9.0) - -## Summary - -Thrust 1.9.0 replaces the original CUDA backend (bulk) with a new one - written using CUB, a high performance CUDA collectives library. -This brings a substantial performance improvement to the CUDA backend across - the board. - -## Breaking Changes - -- Any code depending on CUDA backend implementation details will likely be - broken. - -## New Features - -- New CUDA backend based on CUB which delivers substantially higher performance. -- `thrust::transform_output_iterator`, a fancy iterator that applies a function - to the output before storing the result. - -## New Examples - -- `transform_output_iterator` demonstrates use of the new fancy iterator - `thrust::transform_output_iterator`. - -## Other Enhancements - -- When C++11 is enabled, functors do not have to inherit from - `thrust::(unary|binary)_function` anymore to be used with - `thrust::transform_iterator`. -- Added C++11 only move constructors and move assignment operators for - `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`, - `thrust::device_vector`, and friends. - -## Bug Fixes - -- `sin(thrust::complex)` no longer has precision loss to float. - -## Acknowledgments - -- Thanks to Manuel Schiller for contributing a C++11 based enhancement - regarding the deduction of functor return types, improving the performance - of `thrust::unique` and implementing `thrust::transform_output_iterator`. -- Thanks to Thibault Notargiacomo for the implementation of move semantics for - the `thrust::vector_base`-based classes. -- Thanks to Duane Merrill for developing CUB and helping to integrate it into - Thrust's backend. - -# Thrust 1.8.3 (CUDA 8.0) - -Thrust 1.8.3 is a small bug fix release. 
- -## New Examples - -- `range_view` demonstrates the use of a view (a non-owning wrapper for an - iterator range with a container-like interface). - -## Bug Fixes - -- `thrust::(min|max|minmax)_element` can now accept raw device pointers when - an explicit device execution policy is used. -- `thrust::clear` operations on vector types no longer require the element - type to have a default constructor. - -# Thrust 1.8.2 (CUDA 7.5) - -Thrust 1.8.2 is a small bug fix release. - -## Bug Fixes - -- Avoid warnings and errors concerning user functions called from - `__host__ __device__` functions. -- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend. -- #651: `thrust::copy` between host and device now accepts execution policies - with streams attached, i.e. `thrust::cuda::par.on(stream)`. -- #664: `thrust::for_each` and algorithms based on it no longer ignore streams - attached to execution policies. - -## Known Issues - -- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute - Capability 5.0 devices. - -# Thrust 1.8.1 (CUDA 7.0) - -Thrust 1.8.1 is a small bug fix release. - -## Bug Fixes - -- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on - large inputs. - -## Known Issues - -- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute - Capability 5.0 devices. - -# Thrust 1.8.0 - -Summary -- Thrust 1.8.0 introduces support for algorithm invocation from CUDA __device__ code, support for CUDA streams, -- and algorithm performance improvements. Users may now invoke Thrust algorithms from CUDA __device__ code, -- providing a parallel algorithms library to CUDA programmers authoring custom kernels, as well as allowing -- Thrust programmers to nest their algorithm calls within functors. The thrust::seq execution policy -- allows users to require sequential algorithm execution in the calling thread and makes a -- sequential algorithms library available to individual CUDA threads.
The .on(stream) syntax allows users to -- request a CUDA stream for kernels launched during algorithm execution. Finally, new CUDA algorithm -- implementations provide substantial performance improvements. - -## New Features -- Algorithms in CUDA __device__ code - Thrust algorithms may now be invoked from CUDA __device__ and __host__ __device__ functions. - - Algorithms invoked in this manner must be invoked with an execution policy as the first parameter: - - __device__ int my_device_sort(int *data, size_t n) - { - thrust::sort(thrust::device, data, data + n); - } - - The following execution policies are supported in CUDA __device__ code: - thrust::seq - thrust::cuda::par - thrust::device, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - - Parallel algorithm execution may not be accelerated unless CUDA Dynamic Parallelism is available. - -- Execution Policies - CUDA Streams - The thrust::cuda::par.on(stream) syntax allows users to request that CUDA __global__ functions launched during algorithm - execution should occur on a given stream: - - // execute for_each on stream s - thrust::for_each(thrust::cuda::par.on(s), begin, end, my_functor); - - Algorithms executed with a CUDA stream in this manner may still synchronize with other streams when allocating temporary - storage or returning results to the CPU. - - thrust::seq - The thrust::seq execution policy allows users to require that an algorithm execute sequentially in the calling thread: - - // execute for_each sequentially in this thread - thrust::for_each(thrust::seq, begin, end, my_functor); - -- Other - The new thrust::complex template provides complex number support. - -## New Examples -- simple_cuda_streams demonstrates how to request a CUDA stream during algorithm execution. -- async_reduce demonstrates ways to achieve algorithm invocations which are asynchronous with the calling thread. 
- -## Other Enhancements -- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for large problem sizes. -- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes. -- CUDA sort performance for primitive types is 50% faster on Tesla K20c for large problem sizes. -- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem sizes. -- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes. -- fallback_allocator example is simpler. - -## Bug Fixes -- #364 iterators with unrelated system tags may be used with algorithms invoked with an execution policy -- #371 do not redefine __CUDA_ARCH__ -- #379 fix crash when dereferencing transform_iterator on the CPU -- #391 avoid use of uppercase variable names -- #392 fix thrust::copy between cusp::complex & std::complex -- #396 program compiled with gcc < 4.3 hangs during comparison sort -- #406 fallback_allocator.cu example checks device for unified addressing support -- #417 avoid using std::less in binary search algorithms -- #418 avoid various warnings -- #443 including version.h no longer configures default systems -- #578 nvcc produces warnings when sequential algorithms are used with cpu systems - -## Known Issues -- When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, & thrust::stable_sort_by_key may -- fail to link in some cases with nvcc -rdc=true. - -- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last element in a segment of equivalent keys instead of the first. - -Acknowledgments -- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan implementations. -- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation. -- Thanks to Filipe Maia for contributing the implementation of thrust::complex. 
- -# Thrust 1.7.2 (CUDA 6.5) - -Summary -- Small bug fixes - -## Bug Fixes -- Avoid use of std::min in generic find implementation - -# Thrust 1.7.1 (CUDA 6.0) - -Summary -- Small bug fixes - -## Bug Fixes -- Eliminate identifiers in set_operations.cu example with leading underscore -- Eliminate unused variable warning in CUDA reduce_by_key implementation -- Avoid deriving function objects from std::unary_function and std::binary_function - -# Thrust 1.7.0 (CUDA 5.5) - -Summary -- Thrust 1.7.0 introduces a new interface for controlling algorithm execution as -- well as several new algorithms and performance improvements. With this new -- interface, users may directly control how algorithms execute as well as details -- such as the allocation of temporary storage. Key/value versions of thrust::merge -- and the set operation algorithms have been added, as well as stencil versions of -- partitioning algorithms. thrust::tabulate has been introduced to tabulate the -- values of functions taking integers. For 32b types, new CUDA merge and set -- operations provide 2-15x faster performance while a new CUDA comparison sort -- provides 1.3-4x faster performance. Finally, a new TBB reduce_by_key implementation -- provides 80% faster performance. - -## Breaking Changes -- Dispatch - Custom user backend systems' tag types must now inherit from the corresponding system's execution_policy template (e.g. thrust::cuda::execution_policy) instead - of the tag struct (e.g. thrust::cuda::tag). Otherwise, algorithm specializations will silently go unfound during dispatch. - See examples/minimal_custom_backend.cu and examples/cuda/fallback_allocator.cu for usage examples. - - thrust::advance and thrust::distance are no longer dispatched based on iterator system type and thus may no longer be customized. - -- Iterators - iterator_facade and iterator_adaptor's Pointer template parameters have been eliminated.
- iterator_adaptor has been moved into the thrust namespace (previously thrust::experimental::iterator_adaptor). - iterator_facade has been moved into the thrust namespace (previously thrust::experimental::iterator_facade). - iterator_core_access has been moved into the thrust namespace (previously thrust::experimental::iterator_core_access). - All iterators' nested pointer typedef (the type of the result of operator->) is now void instead of a pointer type to indicate that such expressions are currently impossible. - Floating point counting_iterators' nested difference_type typedef is now a signed integral type instead of a floating point type. - -- Other - normal_distribution has been moved into the thrust::random namespace (previously thrust::random::experimental::normal_distribution). - Placeholder expressions may no longer include the comma operator. - -## New Features -- Execution Policies - Users may directly control the dispatch of algorithm invocations with optional execution policy arguments. - For example, instead of wrapping raw pointers allocated by cudaMalloc with thrust::device_ptr, the thrust::device execution_policy may be passed as an argument to an algorithm invocation to enable CUDA execution. - The following execution policies are supported in this version: - - thrust::host - thrust::device - thrust::cpp::par - thrust::cuda::par - thrust::omp::par - thrust::tbb::par - -- Algorithms - free - get_temporary_buffer - malloc - merge_by_key - partition with stencil - partition_copy with stencil - return_temporary_buffer - set_difference_by_key - set_intersection_by_key - set_symmetric_difference_by_key - set_union_by_key - stable_partition with stencil - stable_partition_copy with stencil - tabulate - -## New Examples -- uninitialized_vector demonstrates how to use a custom allocator to avoid the automatic initialization of elements in thrust::device_vector. 
- -## Other Enhancements -- Authors of custom backend systems may manipulate arbitrary state during algorithm dispatch by incorporating it into their execution_policy parameter. -- Users may control the allocation of temporary storage during algorithm execution by passing standard allocators as parameters via execution policies such as thrust::device. -- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the device backend. -- CUDA merge performance is 2-15x faster. -- CUDA comparison sort performance is 1.3-4x faster. -- CUDA set operation performance is 1.5-15x faster. -- TBB reduce_by_key performance is 80% faster. -- Several algorithms have been parallelized with TBB. -- Support for user allocators in vectors has been improved. -- The sparse_vector example is now implemented with merge_by_key instead of sort_by_key. -- Warnings have been eliminated in various contexts. -- Warnings about __host__ or __device__-only functions called from __host__ __device__ functions have been eliminated in various contexts. -- Documentation about algorithm requirements have been improved. -- Simplified the minimal_custom_backend example. -- Simplified the cuda/custom_temporary_allocation example. -- Simplified the cuda/fallback_allocator example. - -## Bug Fixes -- #248 fix broken counting_iterator behavior with OpenMP -- #231, #209 fix set operation failures with CUDA -- #187 fix incorrect occupancy calculation with CUDA -- #153 fix broken multigpu behavior with CUDA -- #142 eliminate warning produced by thrust::random::taus88 and MSVC 2010 -- #208 correctly initialize elements in temporary storage when necessary -- #16 fix compilation error when sorting bool with CUDA -- #10 fix ambiguous overloads of reinterpret_tag - -## Known Issues -- g++ versions 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly causing infinite recursion in examples such as cuda/custom_temporary_allocation. 
- -Acknowledgments -- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing a faster merge implementation for CUDA. -- Thanks to Sean Baxter for contributing a faster set operation implementation for CUDA. -- Thanks to Cliff Woolley for contributing a correct occupancy calculation algorithm. - -# Thrust 1.6.0 - -Summary -- Thrust v1.6.0 provides an interface for customization and extension and a new -- backend system based on the Threading Building Blocks library. With this -- new interface, programmers may customize the behavior of specific algorithms -- as well as control the allocation of temporary storage or invent entirely new -- backends. These enhancements also allow multiple different backend systems -- such as CUDA and OpenMP to coexist within a single program. Support for TBB -- allows Thrust programs to integrate more naturally into applications which -- may already employ the TBB task scheduler. - -## Breaking Changes -- The header has been moved to -- thrust::experimental::cuda::pinned_allocator has been moved to thrust::cuda::experimental::pinned_allocator -- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM -- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA -- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP -- thrust::host_space_tag has been renamed thrust::host_system_tag -- thrust::device_space_tag has been renamed thrust::device_system_tag -- thrust::any_space_tag has been renamed thrust::any_system_tag -- thrust::iterator_space has been renamed thrust::iterator_system - - -## New Features -- Backend Systems - Threading Building Blocks (TBB) is now supported -- Functions - for_each_n - raw_reference_cast -- Types - pointer - reference - -## New Examples -- cuda/custom_temporary_allocation -- cuda/fallback_allocator -- device_ptr -- expand -- minimal_custom_backend -- raw_reference_cast -- set_operations - -## Other Enhancements -- 
thrust::for_each now returns the end of the input range similar to most other algorithms -- thrust::pair and thrust::tuple have swap functionality -- all CUDA algorithms now support large data types -- iterators may be dereferenced in user __device__ or __global__ functions -- the safe use of different backend systems is now possible within a single binary - -## Bug Fixes -- #469 min_element and max_element algorithms no longer require a const comparison operator - -## Known Issues -- cudafe++.exe may crash when parsing TBB headers on Windows. - -# Thrust 1.5.3 (CUDA 5.0) - -Summary -- Small bug fixes - -## Bug Fixes -- Avoid warnings about potential race due to __shared__ non-POD variable - -# Thrust 1.5.2 (CUDA 4.2) - -Summary -- Small bug fixes - -## Bug Fixes -- Fixed warning about C-style initialization of structures - -# Thrust 1.5.1 (CUDA 4.1) - -Summary -- Small bug fixes - -## Bug Fixes -- Sorting data referenced by permutation_iterators on CUDA produces invalid results - -# Thrust 1.5.0 - -Summary -- Thrust v1.5.0 introduces new programmer productivity and performance -- enhancements. New functionality for creating anonymous "lambda" functions has -- been added. A faster host sort provides 2-10x faster performance for sorting -- arithmetic types on (single-threaded) CPUs. A new OpenMP sort provides -- 2.5x-3.0x speedup over the host sort using a quad-core CPU. When sorting -- arithmetic types with the OpenMP backend the combined performance improvement -- is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to 14.2x -- (8-bit types). A new CUDA reduce_by_key implementation provides 2-3x faster -- performance. - -## Breaking Changes -- device_ptr<void> no longer unsafely converts to device_ptr<T> without an -- explicit cast. Use the expression -- device_pointer_cast(static_cast<int*>(void_ptr.get())) -- to convert, for example, device_ptr<void> to device_ptr<int>.
- -## New Features -- Functions - stencil-less transform_if - -- Types - lambda placeholders - -## New Examples -- lambda - -## Other Enhancements -- host sort is 2-10x faster for arithmetic types -- OMP sort provides speedup over host sort -- reduce_by_key is 2-3x faster -- reduce_by_key no longer requires O(N) temporary storage -- CUDA scan algorithms are 10-40% faster -- host_vector and device_vector are now documented -- out-of-memory exceptions now provide detailed information from CUDART -- improved histogram example -- device_reference now has a specialized swap -- reduce_by_key and scan algorithms are compatible with discard_iterator - -Removed Functionality - -## Bug Fixes - #44 allow host_vector to compile when value_type uses __align__ -- #198 allow adjacent_difference to permit safe in-situ operation -- #303 make thrust thread-safe -- #313 avoid race conditions in device_vector::insert -- #314 avoid unintended adl invocation when dispatching copy -- #365 fix merge and set operation failures - -## Known Issues -- None - -Acknowledgments -- Thanks to Manjunath Kudlur for contributing his Carbon library, from which the lambda functionality is derived. -- Thanks to Jean-Francois Bastien for suggesting a fix for issue 303. - -# Thrust 1.4.0 (CUDA 4.0) - -Summary -- Thrust v1.4.0 provides support for CUDA 4.0 in addition to many feature -- and performance improvements. New set theoretic algorithms operating on -- sorted sequences have been added. Additionally, a new fancy iterator -- allows discarding redundant or otherwise unnecessary output from -- algorithms, conserving memory storage and bandwidth. 
- -## Breaking Changes -- Eliminations - thrust/is_sorted.h - thrust/utility.h - thrust/set_intersection.h - thrust/experimental/cuda/ogl_interop_allocator.h and the functionality therein - thrust::deprecated::copy_when - thrust::deprecated::absolute_value - -## New Features -- Functions - copy_n - merge - set_difference - set_symmetric_difference - set_union - -- Types - discard_iterator - -- Device support - Compute Capability 2.1 GPUs - -## New Examples -- run_length_decoding - -## Other Enhancements -- Compilation warnings are substantially reduced in various contexts. -- The compilation time of thrust::sort, thrust::stable_sort, thrust::sort_by_key, -- and thrust::stable_sort_by_key are substantially reduced. -- A fast sort implementation is used when sorting primitive types with thrust::greater. -- The performance of thrust::set_intersection is improved. -- The performance of thrust::fill is improved on SM 1.x devices. -- A code example is now provided in each algorithm's documentation. -- thrust::reverse now operates in-place - -Removed Functionality -- thrust::deprecated::copy_when -- thrust::deprecated::absolute_value -- thrust::experimental::cuda::ogl_interop_allocator -- thrust::gather and thrust::scatter from host to device and vice versa are no longer supported. -- Operations which modify the elements of a thrust::device_vector are no longer -- available from source code compiled without nvcc when the device backend is CUDA. -- Instead, use the idiom from the cpp_interop example. - -## Bug Fixes -- #212 set_intersection works correctly for large input sizes. -- #275 counting_iterator and constant_iterator work correctly with OpenMP as the -- backend when compiling with optimization -- #256 min and max correctly return their first argument as a tie-breaker -- #248 NDEBUG is interpreted correctly - -## Known Issues -- nvcc may generate code containing warnings when compiling some Thrust algorithms. 
-- When compiling with -arch=sm_1x, some Thrust algorithms may cause nvcc to issue -- benign pointer advisories. -- When compiling with -arch=sm_1x and -G, some Thrust algorithms may fail to execute correctly. -- thrust::inclusive_scan, thrust::exclusive_scan, thrust::inclusive_scan_by_key, -- and thrust::exclusive_scan_by_key are currently incompatible with thrust::discard_iterator. - -Acknowledgments -- Thanks to David Tarjan for improving the performance of set_intersection. -- Thanks to Duane Merrill for continued help with sort. -- Thanks to Nathan Whitehead for help with CUDA Toolkit integration. - -# Thrust 1.3.0 (CUDA 3.2) - -Summary -- Thrust v1.3.0 provides support for CUDA 3.2 in addition to many feature -- and performance enhancements. - -- Performance of the sort and sort_by_key algorithms is improved by as much -- as 3x in certain situations. The performance of stream compaction algorithms, -- such as copy_if, is improved by as much as 2x. Reduction performance is -- also improved, particularly for small input sizes. - -- CUDA errors are now converted to runtime exceptions using the system_error -- interface. Combined with a debug mode, also new in v1.3, runtime errors -- can be located with greater precision. - -- Lastly, a few header files have been consolidated or renamed for clarity. -- See the deprecations section below for additional details. 
- - -## Breaking Changes -- Promotions - thrust::experimental::inclusive_segmented_scan has been renamed thrust::inclusive_scan_by_key and exposes a different interface - thrust::experimental::exclusive_segmented_scan has been renamed thrust::exclusive_scan_by_key and exposes a different interface - thrust::experimental::partition_copy has been renamed thrust::partition_copy and exposes a different interface - thrust::next::gather has been renamed thrust::gather - thrust::next::gather_if has been renamed thrust::gather_if - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy -- Deprecations - thrust::copy_when has been renamed thrust::deprecated::copy_when - thrust::absolute_value has been renamed thrust::deprecated::absolute_value - The header thrust/set_intersection.h is now deprecated; use thrust/set_operations.h instead - The header thrust/utility.h is now deprecated; use thrust/swap.h instead - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead -- Eliminations - thrust::deprecated::gather - thrust::deprecated::gather_if - thrust/experimental/arch.h and the functions therein - thrust/sorting/merge_sort.h - thrust/sorting/radix_sort.h - -## New Features -- Functions - exclusive_scan_by_key - find - find_if - find_if_not - inclusive_scan_by_key - is_partitioned - is_sorted_until - mismatch - partition_point - reverse - reverse_copy - stable_partition_copy - -- Types - system_error and related types - experimental::cuda::ogl_interop_allocator - bit_and, bit_or, and bit_xor - -- Device support - gf104-based GPUs - -## New Examples -- opengl_interop.cu -- repeated_range.cu -- simple_moving_average.cu -- sparse_vector.cu -- strided_range.cu - -## Other Enhancements -- Performance of thrust::sort and thrust::sort_by_key is substantially improved for primitive key types -- Performance of thrust::copy_if is substantially improved -- Performance of thrust::reduce and related reductions is improved -- THRUST_DEBUG mode added -- 
Callers of Thrust functions may detect error conditions by catching thrust::system_error, which derives from std::runtime_error -- The number of compiler warnings generated by Thrust has been substantially reduced -- Comparison sort now works correctly for input sizes > 32M -- min & max usage no longer collides with definitions -- Compiling against the OpenMP backend no longer requires nvcc -- Performance of device_vector initialized in .cpp files is substantially improved in common cases -- Performance of thrust::sort_by_key on the host is substantially improved - -Removed Functionality -- nvcc 2.3 is no longer supported - -## Bug Fixes -- Debug device code now compiles correctly -- thrust::uninitialized_copy and thrust::unintialized_fill now dispatch constructors on the device rather than the host - -## Known Issues -- #212 set_intersection is known to fail for large input sizes -- partition_point is known to fail for 64b types with nvcc 3.2 - -Acknowledgments -- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation -- Thanks to Erich Elsen for contributing an implementation of find_if -- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP backend to compile in the absence of nvcc -- Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for bug reports -- Thanks to Cliff Woolley for help with testing - -# Thrust 1.2.1 (CUDA 3.1) - -Summary -- Small fixes for compatibility with CUDA 3.1 - -## Known Issues -- inclusive_scan & exclusive_scan may fail with very large types -- the Microsoft compiler may fail to compile code using both sort and binary search algorithms -- uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device -- # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads -- default_random_engine::discard is not 
accelerated with nvcc 2.3 -- nvcc 3.1 may fail to compile code using types derived from thrust::subtract_with_carry_engine, such as thrust::ranlux24 & thrust::ranlux48. - -# Thrust 1.2.0 - -Summary -- Thrust v1.2 introduces support for compilation to multicore CPUs -- and the Ocelot virtual machine, and several new facilities for -- pseudo-random number generation. New algorithms such as set -- intersection and segmented reduction have also been added. Lastly, -- improvements to the robustness of the CUDA backend ensure -- correctness across a broad set of (uncommon) use cases. - -## Breaking Changes -- thrust::gather's interface was incorrect and has been removed. -- The old interface is deprecated but will be preserved for Thrust -- version 1.2 at thrust::deprecated::gather & -- thrust::deprecated::gather_if. The new interface is provided at -- thrust::next::gather & thrust::next::gather_if. The new interface -- will be promoted to thrust:: in Thrust version 1.3. For more details, -- please refer to this thread: -- http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd - -- The thrust::sorting namespace has been deprecated in favor of the -- top-level sorting functions, such as thrust::sort() and -- thrust::sort_by_key(). 
- -## New Features -- Functions - reduce_by_key - set_intersection - tie - unique_copy - unique_by_key - unique_copy_by_key - -- Types - Random Number Generation - discard_block_engine - default_random_engine - linear_congruential_engine - linear_feedback_shift_engine - minstd_rand - minstd_rand0 - normal_distribution (experimental) - ranlux24 - ranlux48 - ranlux24_base - ranlux48_base - subtract_with_carry_engine - taus88 - uniform_int_distribution - uniform_real_distribution - xor_combine_engine - Functionals - project1st - project2nd - -- Fancy Iterators - permutation_iterator - reverse_iterator - -- Device support - Add support for multicore CPUs via OpenMP - Add support for Fermi-class GPUs - Add support for Ocelot virtual machine - -## New Examples -- cpp_integration -- histogram -- mode -- monte_carlo -- monte_carlo_disjoint_sequences -- padded_grid_reduction -- permutation_iterator -- row_sum -- run_length_encoding -- segmented_scan -- stream_compaction -- summary_statistics -- transform_iterator -- word_count - -## Other Enhancements -- vector functions operator!=, rbegin, crbegin, rend, crend, data, & shrink_to_fit -- integer sorting performance is improved when max is large but (max - min) is small and when min is negative -- performance of inclusive_scan() and exclusive_scan() is improved by 20-25% for primitive types -- support for nvcc 3.0 - -Removed Functionality -- removed support for equal between host & device sequences -- removed support for gather() and scatter() between host & device sequences - -## Bug Fixes -- # 8 cause a compiler error if the required compiler is not found rather than a mysterious error at link time -- # 42 device_ptr & device_reference are classes rather than structs, eliminating warnings on certain platforms -- # 46 gather & scatter handle any space iterators correctly -- # 51 thrust::experimental::arch functions gracefully handle unrecognized GPUs -- # 52 avoid collisions with common user macros such as BLOCK_SIZE -- # 62 
provide better documentation for device_reference -- # 68 allow built-in CUDA vector types to work with device_vector in pure C++ mode -- # 102 eliminated a race condition in device_vector::erase -- various compilation warnings eliminated - -## Known Issues - inclusive_scan & exclusive_scan may fail with very large types - the Microsoft compiler may fail to compile code using both sort and binary search algorithms - uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device - # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads - default_random_engine::discard is not accelerated with nvcc 2.3 - -Acknowledgments - Thanks to Gregory Diamos for contributing a CUDA implementation of set_intersection - Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit tests and examples against Ocelot - Thanks to Tom Bradley for contributing an implementation of normal_distribution - Thanks to Joseph Rhoads for contributing the example summary_statistics - -# Thrust 1.1.1 - -Summary -- Small fixes for compatibility with CUDA 2.3a and Mac OSX Snow Leopard. - -# Thrust 1.1.0 - -Summary -- Thrust v1.1 introduces fancy iterators, binary search functions, and -- several specialized reduction functions. Experimental support for -- segmented scan has also been added. 
- -## Breaking Changes -- counting_iterator has been moved into the thrust namespace (previously thrust::experimental) - -## New Features -- Functions - copy_if - lower_bound - upper_bound - vectorized lower_bound - vectorized upper_bound - equal_range - binary_search - vectorized binary_search - all_of - any_of - none_of - minmax_element - advance - inclusive_segmented_scan (experimental) - exclusive_segmented_scan (experimental) - -- Types - pair - tuple - device_malloc_allocator - -- Fancy Iterators - constant_iterator - counting_iterator - transform_iterator - zip_iterator - -## New Examples -- computing the maximum absolute difference between vectors -- computing the bounding box of a two-dimensional point set -- sorting multiple arrays together (lexicographical sorting) -- constructing a summed area table -- using zip_iterator to mimic an array of structs -- using constant_iterator to increment array values - -## Other Enhancements -- added pinned memory allocator (experimental) -- added more methods to host_vector & device_vector (issue #4) -- added variant of remove_if with a stencil argument (issue #29) -- scan and reduce use cudaFuncGetAttributes to determine grid size -- exceptions are reported when temporary device arrays cannot be allocated - -## Bug Fixes - #5 make vector work for larger data types - #9 stable_partition_copy doesn't respect OutputIterator concept semantics -- #10 scans should return OutputIterator -- #16 make algorithms work for larger data types -- #27 dispatch radix_sort even when comp=less is explicitly provided - -## Known Issues -- Using functors with Thrust entry points may not compile on Mac OSX with gcc - 4.0.1. -- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch - constructors on the host rather than the device. -- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`, - `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when - used with large types with the CUDA 3.1 driver. 
- -# Thrust 1.0.0 - -## Breaking Changes -- Rename top level namespace `komrade` to `thrust`. -- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into - `thrust::experimental` namespace until we can easily provide the standard - interface. -- Rename `thrust::range` to `thrust::sequence` to avoid collision with - Boost.Range. -- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences - with C++0x copy_if(). - -## New Features -- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and - `thrust::device_vector`. -- Add `thrust::transform_if` function. -- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`. -- Allow `counting_iterator` to work with `thrust::for_each`. -- Allow types with constructors in comparison `thrust::sort` and - `thrust::reduce`. - -## Other Enhancements -- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster - when executed on the parallel device. - -## Bug Fixes -- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to - crash. -- Komrade 7: Fix an issue where `const_iterator`s could not be passed to - `thrust::transform`. 
- diff --git a/doc/thrust_logo.png b/doc/thrust_logo.png deleted file mode 100644 index 123794b6a..000000000 Binary files a/doc/thrust_logo.png and /dev/null differ diff --git a/doc/thrust_logo.svg b/doc/thrust_logo.svg deleted file mode 100644 index 4fd82acaf..000000000 --- a/doc/thrust_logo.svg +++ /dev/null @@ -1,272 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - Thrust - - diff --git a/docs/doxybook/config.json b/docs/doxybook/config.json new file mode 100644 index 000000000..56b7a238b --- /dev/null +++ b/docs/doxybook/config.json @@ -0,0 +1,49 @@ +{ + "baseUrl": "{{ site.baseurl }}/api/", + "copyImages": true, + "fileExt": "md", + "filesFilter": [], + "folderClassesName": "classes", + "folderExamplesName": "examples", + "folderFilesName": "files", + "folderGroupsName": "groups", + "folderNamespacesName": "namespaces", + "folderRelatedPagesName": "pages", + "imagesFolder": "images", + "indexClassesName": "index_classes", + "indexClassesTitle": "Classes", + "indexExamplesName": "index_examples", + "indexExamplesTitle": "Examples", + "indexFilesName": "index_files", + "indexFilesTitle": "Files", + "indexGroupsName": "index_groups", + "indexGroupsTitle": "Groups", + "indexInFolders": false, + "indexNamespacesName": "index_namespaces", + "indexNamespacesTitle": "namespaces", + "indexRelatedPagesName": "index_pages", + "indexRelatedPagesTitle": "pages", + "linkLowercase": true, + "linkAndInlineCodeAsHTML": true, + "linkSuffix": ".html", + "mainPageInRoot": false, + "mainPageName": "indexpage", + "sort": false, + "templateIndexClasses": "index_classes", + "templateIndexExamples": "index_examples", + "templateIndexFiles": "index_files", + "templateIndexGroups": "index_groups", + "templateIndexNamespaces": "index_namespaces", + "templateIndexRelatedPages": "index_pages", + "templateKindClass": "kind_class", + "templateKindDir": "kind_file", + 
"templateKindExample": "kind_page", + "templateKindFile": "kind_file", + "templateKindGroup": "kind_nonclass", + "templateKindInterface": "kind_class", + "templateKindNamespace": "kind_nonclass", + "templateKindPage": "kind_page", + "templateKindStruct": "kind_class", + "templateKindUnion": "kind_class", + "useFolders": true +} diff --git a/docs/doxybook/templates/class_members.tmpl b/docs/doxybook/templates/class_members.tmpl new file mode 100644 index 000000000..cb5f65f38 --- /dev/null +++ b/docs/doxybook/templates/class_members.tmpl @@ -0,0 +1,210 @@ +{%- if exists("publicClasses") or exists("publicTypes") or exists("publicAttributes") or exists("publicFunctions") or exists("friends") -%} + {%- set has_public_members = true -%} +{%- endif -%} +{%- if exists("protectedClasses") or exists("protectedTypes") or exists("protectedAttributes") or exists("protectedFunctions") -%} + {%- set has_protected_members = true -%} +{%- endif -%} +{%- if exists("baseClasses") -%} + {%- for base in baseClasses -%} + {%- if existsIn(base, "publicClasses") or existsIn(base, "publicTypes") or existsIn(base, "publicAttributes") or existsIn(base, "publicFunctions") or existsIn(base, "friends") -%} + {%- set has_public_members = true -%} + {%- endif -%} + {%- if existsIn(base, "protectedClasses") or existsIn(base, "protectedTypes") or existsIn(base, "protectedAttributes") or existsIn(base, "protectedFunctions") -%} + {%- set has_protected_members = true -%} + {%- endif -%} + {%- endfor -%} +{%- endif -%} + +{%- if exists("includes") -%} + #include {{includes}}{{ noop() -}} +
+{%- endif -%} +{%- include "synopsis_template_parameters.tmpl" -%} +{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} { +{%- set synopsis_indent_width = 2 -%} +{%- set names_qualified = false -%} +{%- if default(has_public_members, false) -%} + public:{{- noop() -}} +{%- endif -%} +{%- if exists("publicTypes") -%} + {%- for child in publicTypes -%} + {%- include "synopsis_type.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "publicTypes") -%} + {%- for child in base.publicTypes -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_type.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("publicClasses") -%} + {%- for child in publicClasses -%} + {%- include "synopsis_class.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "publicClasses") -%} + {%- for child in base.publicClasses -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_class.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("friends") -%} + {%- for child in friends -%} + {%- if child.type == "class" or child.type == "struct" -%} + {%- include "synopsis_friend_class.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endif -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "friends") -%} + {%- for child in base.friends -%} + {%- if child.type == "class" or child.type == "struct" -%} + {%- set synopsis_is_inherited = true 
-%} + {%- include "synopsis_friend_class.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("publicAttributes") -%} + {%- for child in publicAttributes -%} + {%- include "synopsis_variable.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "publicAttributes") -%} + {%- for child in base.publicAttributes -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_variable.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("publicFunctions") -%} + {%- for child in publicFunctions -%} + {%- include "synopsis_function.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "publicFunctions") -%} + {%- for child in base.publicFunctions -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_function.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("friends") -%} + {%- for child in friends -%} + {%- if child.type != "class" and child.type != "struct" -%} + {%- include "synopsis_friend_function.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endif -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "friends") -%} + {%- for child in base.friends -%} + {%- if child.type != "class" and child.type != "struct" -%} + {%- set synopsis_is_inherited = true -%} + {%- include 
"synopsis_friend_function.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if default(has_public_members, false) -%} + {%- if default(has_protected_members, false) -%} +
+ {%- endif -%} +{%- endif -%} +{#- Reset leading line breaks for protected members -#}{{ noop() -}} +{%- set synopsis_needs_leading_line_break = false -%} +{%- if default(has_protected_members, false) -%} + protected:{{- noop() -}} +{%- endif -%} +{%- if exists("protectedTypes") -%} + {%- for child in protectedTypes -%} + {%- include "synopsis_type.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "protectedTypes") -%} + {%- for child in base.protectedTypes -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_type.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("protectedClasses") -%} + {%- for child in protectedClasses -%} + {%- include "synopsis_class.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "protectedClasses") -%} + {%- for child in base.protectedClasses -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_class.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("protectedAttributes") -%} + {%- for child in protectedAttributes -%} + {%- include "synopsis_variable.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "protectedAttributes") -%} + {%- for child in base.protectedAttributes -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_variable.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set 
synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- if exists("protectedFunctions") -%} + {%- for child in protectedFunctions -%} + {%- include "synopsis_function.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("baseClasses") -%}{%- for base in baseClasses -%} + {%- if existsIn(base, "protectedFunctions") -%} + {%- for child in base.protectedFunctions -%} + {%- set synopsis_is_inherited = true -%} + {%- include "synopsis_function.tmpl" -%} + {%- set synopsis_is_inherited = false -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%}{%- endif -%} +{%- set synopsis_indent_width = 0 -%} +}; +
 + diff --git a/docs/doxybook/templates/class_members_details.tmpl b/docs/doxybook/templates/class_members_details.tmpl new file mode 100644 index 000000000..a77eec5ef --- /dev/null +++ b/docs/doxybook/templates/class_members_details.tmpl @@ -0,0 +1,49 @@ +{%- if exists("publicClasses") -%}## Member Classes + + {%- for child in publicClasses -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicTypes") -%}## Member Types + + {%- for child in publicTypes -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicAttributes") %}## Member Variables + + {%- for child in publicAttributes -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicFunctions") %}## Member Functions + + {%- for child in publicFunctions -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("protectedTypes") -%}## Protected Member Types + {%- for child in protectedTypes -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{% endif -%} +{%- if exists("protectedAttributes") -%}## Protected Member Variables + + {%- for child in protectedAttributes -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("protectedFunctions") -%}## Protected Member Functions + + {%- for child in protectedFunctions -%} + {% include "title_member.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} + diff --git a/docs/doxybook/templates/details.tmpl b/docs/doxybook/templates/details.tmpl new file mode 100644 index 000000000..d72119abf --- /dev/null +++ b/docs/doxybook/templates/details.tmpl @@ -0,0 +1,206 @@ +{%- if exists("brief") 
-%}{{brief}} + +{% endif -%} +{%- if exists("details") -%}{{details}} + +{% endif -%} +{%- if exists("inbody") -%}{{inbody}} + +{% endif -%} +{%- if exists("tests") -%}**Test**: + {%- if length(tests) == 1 -%}{{first(tests)}} + {%- else -%} + {%- for item in tests -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("note") -%}**Note**: + {%- if length(note) == 1 -%}{{first(note)}} + {%- else -%} + {%- for item in note -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("remark") -%}**Remark**: + {%- if length(remark) == 1 -%}{{first(remark)}} + {%- else -%} + {%- for item in remark -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("attention") -%}**Attention**: + {%- if length(attention) == 1 -%}{{first(attention)}} + {%- else -%} + {%- for item in attention -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("bugs") -%}**Bug**: + {%- if length(bugs) == 1 -%}{{first(bugs)}} + {%- else -%} + {%- for item in bugs -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("warning") -%}**Warning**: + {%- if length(warning) == 1 -%}{{first(warning)}} + {%- else -%} + {%- for item in warning -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("todos") -%}**TODO**: + {%- if length(todos) == 1 -%}{{first(todos)}} + {%- else -%} + {%- for item in todos -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("templateParamsList") -%}**Template Parameters**: + {%- if length(templateParamsList) == 1 -%}**`{{get(first(templateParamsList), "name")}}`**: {{get(first(templateParamsList), "text")}} + {%- else -%} + {%- for param in templateParamsList -%}* **`{{param.name}}`** {{param.text}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("paramList") -%}**Function Parameters**: + {%- if length(paramList) == 1 -%}**`{{get(first(paramList), "name")}}`**: {{get(first(paramList), "text")}} + {%- else -%} + {%- for param in 
paramList -%}* **`{{param.name}}`** {{param.text}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("pre") -%}**Preconditions**: + {%- if length(pre) == 1 -%}{{first(pre)}} + {%- else -%} + {%- for item in pre -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("post") -%}**Postconditions**: + {%- if length(post) == 1 -%}{{first(post)}} + {%- else -%} + {%- for item in post -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("invariant") -%}**Invariant**: + {%- if length(invariant) == 1 -%}{{first(invariant)}} + {%- else -%} + {%- for item in invariant -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("returns") or exists("returnsList") -%}**Returns**: + {%- if exists("returns") and exists("returnsList") -%} + {%- for item in returns -%}* {{item}} + {%- endfor -%} + {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}} + {%- endfor -%} + {%- else if exists("returns") -%} + {%- if length(returns) == 1 -%}{{first(returns)}} + {%- else -%} + {%- for item in returns -%}* {{item}} + {%- endfor -%} + {%- endif -%} + {%- else if exists("returnsList") -%} + {%- if length(returnsList) == 1 -%}**`{{get(first(returnsList), "name")}}`** {{get(first(returnsList), "text")}} + {%- else -%} + {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}} + {%- endfor -%} + {%- endif -%} + {%- endif %} +{% endif -%} +{%- if exists("exceptionsList") -%}**Exceptions**: + {%- if length(exceptionsList) == 1 -%}**`{{get(first(exceptionsList), "name")}}`**: {{get(first(exceptionsList), "text")}} + {%- else -%} + {%- for param in exceptionsList -%}* **`{{param.name}}`**: {{param.text}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("deprecated") -%}**Deprecated**: {{deprecated}} + +{% endif -%} +{%- if exists("authors") -%}**Author**: + {%- if length(authors) == 1 -%}{{first(authors)}} + {%- else -%} + {%- for item in authors -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% 
endif -%} +{%- if exists("copyright") -%}**Copyright**: + {%- if length(copyright) == 1 -%}{{first(copyright)}} + {%- else -%} + {%- for item in copyright -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("version") -%}**Version**: + {%- if length(version) == 1 -%}{{first(version)}} + {%- else -%} + {%- for item in version -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("since") -%}**Since**: + {%- if length(since) == 1 -%}{{first(since)}} + {%- else -%} + {%- for item in since -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("date") -%}**Date**: + {%- if length(date) == 1 -%}{{first(date)}} + {%- else -%} + {%- for item in date -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("baseClasses") -%}**Inherits From**: + {%- if length(baseClasses) == 1 -%} + {%- if existsIn(first(baseClasses), "url") -%}[`{{get(first(baseClasses), "name")}}`]({{get(first(baseClasses), "url")}}) + {%- else -%}`{{get(first(baseClasses), "name")}}` + {%- endif -%} + {%- else -%} + {%- for base in baseClasses -%} + {%- if existsIn(base, "url") -%}* [`{{base.name}}`]({{base.url}}) + {%- else -%}* `{{base.name}}` + {%- endif -%} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("derivedClasses") -%}**Inherited By**: + {%- if length(derivedClasses) == 1 -%} + {%- if existsIn(first(derivedClasses), "url") -%}[`{{get(first(derivedClasses), "name")}}`]({{get(first(derivedClasses), "url")}}) + {%- else -%}`{{get(first(derivedClasses), "name")}}` + {%- endif -%} + {%- else -%} + {%- for derived in derivedClasses -%} + {%- if existsIn(derived, "url") -%}* [`{{derived.name}}`]({{derived.url}}) + {%- else -%}* `{{derived.name}}`{%- endif -%} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("reimplements") -%}**Implements**: [`{{reimplements.name}}`]({{reimplements.url}}) + +{% endif -%} +{%- if exists("reimplementedBy") -%}**Implemented By**: + {%- if 
length(reimplementedBy) == 1 -%} + {%- if existsIn(first(reimplementedBy), "url") -%}[`{{get(first(reimplementedBy), "name")}}`]({{get(first(reimplementedBy), "url")}}) + {%- else -%}`{{get(first(reimplementedBy), "name")}}` + {%- endif -%} + {%- else -%} + {%- for impl in reimplementedBy -%} + {%- if existsIn(first(reimplementedBy), "url") -%}* [`{{impl.name}}`]({{impl.url}}) + {%- else -%}* `{{impl.name}}` + {%- endif -%} + {%- endfor -%} + {%- endif %} +{% endif -%} +{%- if exists("see") -%}**See**: + {%- if length(see) == 1 -%}{{first(see)}} + {%- else -%} + {%- for item in see -%}* {{item}} + {%- endfor -%} + {%- endif %} +{% endif -%} diff --git a/docs/doxybook/templates/frontmatter.tmpl b/docs/doxybook/templates/frontmatter.tmpl new file mode 100644 index 000000000..d3b1e5b4f --- /dev/null +++ b/docs/doxybook/templates/frontmatter.tmpl @@ -0,0 +1,43 @@ +--- +{%- if exists("title") -%} + title: {{title}} +{%- else if exists("name") -%} + title: {{name}} +{%- endif -%} +{%- if exists("summary") -%} + summary: {{summary}} +{%- endif -%} +{%- if exists("moduleBreadcrumbs") -%} + {%- if length(moduleBreadcrumbs) > 0 -%} + parent: {{ get(last(moduleBreadcrumbs), "title") }} + {%- endif -%} + {%- if length(moduleBreadcrumbs) > 1 -%} + grand_parent: {{ get(index(moduleBreadcrumbs, -2), "title") }} + {%- else if length(moduleBreadcrumbs) == 1 and exists("kind") and kind == "group" -%} + grand_parent: API + {%- endif -%} +{%- else if exists("kind") and kind == "group" -%} + parent: API +{%- endif -%} +{%- if exists("kind") and kind == "group" -%} + nav_exclude: false +{%- else -%} + nav_exclude: true +{%- endif -%} +has_children: true +has_toc: false +--- + +{%- if exists("title") -%} + {%- if exists("kind") and kind in ["class", "struct", "namespace"] -%} + # {{title(kind)}} `{{title}}` + {%- else -%} + # {{title}} + {%- endif -%} +{%- else if exists("name") -%} + {%- if exists("kind") and kind != "page" -%} + # {{name}} {{title(kind)}} Reference + {%- else -%} + # 
{{name}} + {%- endif -%} +{%- endif %} diff --git a/docs/doxybook/templates/index.tmpl b/docs/doxybook/templates/index.tmpl new file mode 100644 index 000000000..e28f37729 --- /dev/null +++ b/docs/doxybook/templates/index.tmpl @@ -0,0 +1,14 @@ +{%- if exists("children") -%}{%- for child in children -%} + {%- for i in range(default(index_depth, 0)) -%} + {{- noop() }} {{ noop() -}} + {%- endfor -%} + * {{ noop() -}} + {{ render("name_qualified.tmpl", child) }}{{ noop() -}} + {%- if existsIn(child, "brief") -%} + {{- noop() }}
{{ child.brief -}} + {%- endif %} + {%- if existsIn(child, "children") -%} + {%- set child.index_depth = default(index_depth, 0) + 1 -%} + {{- render("index.tmpl", child) -}} + {%- endif -%} +{%- endfor -%}{%- endif -%} diff --git a/docs/doxybook/templates/index_classes.tmpl b/docs/doxybook/templates/index_classes.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_classes.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git a/docs/doxybook/templates/index_examples.tmpl b/docs/doxybook/templates/index_examples.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_examples.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git a/docs/doxybook/templates/index_files.tmpl b/docs/doxybook/templates/index_files.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_files.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git a/docs/doxybook/templates/index_groups.tmpl b/docs/doxybook/templates/index_groups.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_groups.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git a/docs/doxybook/templates/index_namespaces.tmpl b/docs/doxybook/templates/index_namespaces.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_namespaces.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git a/docs/doxybook/templates/index_pages.tmpl b/docs/doxybook/templates/index_pages.tmpl new file mode 100644 index 000000000..1ccdf71e9 --- /dev/null +++ b/docs/doxybook/templates/index_pages.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{% include "index.tmpl" -%} diff --git 
a/docs/doxybook/templates/kind_class.tmpl b/docs/doxybook/templates/kind_class.tmpl new file mode 100644 index 000000000..e5650b69b --- /dev/null +++ b/docs/doxybook/templates/kind_class.tmpl @@ -0,0 +1,4 @@ +{% include "frontmatter.tmpl" -%} +{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%} +{% include "class_members.tmpl" -%} +{% include "class_members_details.tmpl" -%} diff --git a/docs/doxybook/templates/kind_example.tmpl b/docs/doxybook/templates/kind_example.tmpl new file mode 100644 index 000000000..48501318b --- /dev/null +++ b/docs/doxybook/templates/kind_example.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{%- if exists("details") -%}{{details}}{%- endif -%} diff --git a/docs/doxybook/templates/kind_file.tmpl b/docs/doxybook/templates/kind_file.tmpl new file mode 100644 index 000000000..c883442f1 --- /dev/null +++ b/docs/doxybook/templates/kind_file.tmpl @@ -0,0 +1,10 @@ +{% include "frontmatter.tmpl" -%} +{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%} +{% include "nonclass_members_details.tmpl" -%} +{% include "nonclass_members.tmpl" -%} +{%- if exists("programlisting") -%} + +```cpp +{{programlisting}} +``` +{%- endif -%} diff --git a/docs/doxybook/templates/kind_group.tmpl b/docs/doxybook/templates/kind_group.tmpl new file mode 100644 index 000000000..1ff7342a4 --- /dev/null +++ b/docs/doxybook/templates/kind_group.tmpl @@ -0,0 +1,4 @@ +{% include "frontmatter.tmpl" -%} +{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%} +{% include "nonclass_members.tmpl" -%} +{% include "nonclass_members_details.tmpl" -%} diff --git a/docs/doxybook/templates/kind_nonclass.tmpl b/docs/doxybook/templates/kind_nonclass.tmpl new file mode 100644 index 000000000..299208c41 --- /dev/null +++ b/docs/doxybook/templates/kind_nonclass.tmpl @@ -0,0 +1,8 @@ +{% include "frontmatter.tmpl" -%} +{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%} +{% if kind == "namespace" -%} + {%- include "namespace_members.tmpl" 
-%} +{%- else -%} + {%- include "nonclass_members.tmpl" -%} +{%- endif -%} +{% include "nonclass_members_details.tmpl" -%} diff --git a/docs/doxybook/templates/kind_page.tmpl b/docs/doxybook/templates/kind_page.tmpl new file mode 100644 index 000000000..48501318b --- /dev/null +++ b/docs/doxybook/templates/kind_page.tmpl @@ -0,0 +1,2 @@ +{% include "frontmatter.tmpl" -%} +{%- if exists("details") -%}{{details}}{%- endif -%} diff --git a/docs/doxybook/templates/member_details.tmpl b/docs/doxybook/templates/member_details.tmpl new file mode 100644 index 000000000..14b34dcfc --- /dev/null +++ b/docs/doxybook/templates/member_details.tmpl @@ -0,0 +1,39 @@ +{%- if exists("type") and type in ["class", "struct"] -%} + + {%- include "synopsis_class.tmpl" -%} + +{%- else if kind == "enum" -%} + {%- include "table_header_enum.tmpl" -%} + {%- for enumerator in enumvalues -%}{{- render("table_row_enum.tmpl", enumerator) -}} + {%- endfor %} +{%- else if kind in ["typedef", "using"] -%} + + {%- include "synopsis_template_parameters.tmpl" -%} + {%- include "synopsis_kind.tmpl" -%}{{name}}{%- include "synopsis_initializer.tmpl" -%};{{- noop() -}} + +{%- else if kind in ["variable", "property"] -%} + + {%- include "synopsis_template_parameters.tmpl" -%} + {%- include "synopsis_type_and_leading_specifiers.tmpl" -%}{{name}}{%- include "synopsis_initializer.tmpl" -%};{{- noop() -}} + +{%- else if kind in ["function", "slot", "signal", "event"] -%} + + {%- include "synopsis_template_parameters.tmpl" -%} + {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%} + {{name}}({%- include "synopsis_function_parameters.tmpl" -%}){%- include "synopsis_function_trailing_specifiers.tmpl" -%};{{- noop() -}} + +{%- else if kind == "friend" -%} + {%- if type != "class" and type != "struct" -%} + + {% include "synopsis_template_parameters.tmpl" -%} + {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%} + {{name}}({%- include "synopsis_function_parameters.tmpl" -%}){%- 
include "synopsis_function_trailing_specifiers.tmpl" -%};{{- noop() -}} + + {%- endif -%} +{%- else if kind == "define" -%} + {#- We have no way to get the parameters to function-like -#}{{ noop() -}} + {#- macros, and the macro definitions in `initializer` fields -#}{{ noop() -}} + {#- don't have line breaks. So we can't render a useful -#}{{ noop() -}} + {#- synopsis. -#}{{ noop() -}} +{% endif -%} +{% include "details.tmpl" -%} diff --git a/docs/doxybook/templates/name.tmpl b/docs/doxybook/templates/name.tmpl new file mode 100644 index 000000000..09f15420e --- /dev/null +++ b/docs/doxybook/templates/name.tmpl @@ -0,0 +1,5 @@ +{%- if default(names_qualified, true) -%} + {{- render("name_qualified.tmpl", child) -}} +{%- else -%} + {{- render("name_unqualified.tmpl", child) -}} +{%- endif -%} diff --git a/docs/doxybook/templates/name_qualified.tmpl b/docs/doxybook/templates/name_qualified.tmpl new file mode 100644 index 000000000..da088dd34 --- /dev/null +++ b/docs/doxybook/templates/name_qualified.tmpl @@ -0,0 +1,7 @@ +{%- if exists("qualifiedname") -%} + {{- escape(qualifiedname) -}} +{%- else if exists("name") -%} + {{- escape(name) -}} +{%- else -%} + {{- escape(title) -}} +{%- endif -%} diff --git a/docs/doxybook/templates/name_unqualified.tmpl b/docs/doxybook/templates/name_unqualified.tmpl new file mode 100644 index 000000000..2a0d73725 --- /dev/null +++ b/docs/doxybook/templates/name_unqualified.tmpl @@ -0,0 +1,5 @@ +{%- if exists("name") -%} + {{- escape(stripNamespace(name)) -}} +{%- else -%} + {{- escape(stripNamespace(title)) -}} +{%- endif -%} diff --git a/docs/doxybook/templates/namespace_members.tmpl b/docs/doxybook/templates/namespace_members.tmpl new file mode 100644 index 000000000..8bb4bdffc --- /dev/null +++ b/docs/doxybook/templates/namespace_members.tmpl @@ -0,0 +1,43 @@ + +{%- if exists("includes") -%} + #include {{includes}}{{ noop() -}} +
+{%- endif -%} +{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} { +{%- set synopsis_needs_leading_line_break = true -%} +{%- set names_qualified = false -%} +{%- if exists("namespaces") -%} + {%- for child in namespaces -%} + {%- include "synopsis_namespace_abbreviated.tmpl" -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicClasses") -%} + {%- for child in publicClasses -%} + {%- include "synopsis_class.tmpl" -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicTypes") -%} + {%- for child in publicTypes -%} + {%- include "synopsis_type.tmpl" -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicAttributes") -%} + {%- for child in publicAttributes -%} + {%- include "synopsis_variable.tmpl" -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicFunctions") -%} + {%- for child in publicFunctions -%} + {%- if existsIn(child, "type") -%} + {#- If the child doesn't have a type, it's probably a -#}{{- noop() -}} + {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}} + {#- due to a bug whose nature is beyond me. -#}{{- noop() -}} + {%- include "synopsis_function.tmpl" -%} + {%- endif -%} + {%- endfor -%} +{%- endif -%} +} {{ noop() -}} + /* {%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} */{{ noop() -}} + +
+ diff --git a/docs/doxybook/templates/nonclass_members.tmpl b/docs/doxybook/templates/nonclass_members.tmpl new file mode 100644 index 000000000..af3d39c17 --- /dev/null +++ b/docs/doxybook/templates/nonclass_members.tmpl @@ -0,0 +1,60 @@ +{%- if exists("groups") %}## Groups + + {%- for child in sort(groups) -%}* **[{{ child.title }}]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %} + {%- endfor %} +{% endif -%} +{%- if exists("dirs") %}## Directories + + {%- for child in dirs -%}* **[`{{ child.name }}`]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %} + {%- endfor %} +{% endif -%} +{%- if exists("files") %}## Files + + {%- include "table_header_brief.tmpl" -%} + {%- for child in files -%}{{- render("table_row_brief.tmpl", child) -}} + {%- endfor %} +{% endif -%} + +{%- if exists("namespaces") -%} + {%- for child in namespaces -%} + {%- include "synopsis_namespace_abbreviated.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicClasses") -%} + {%- for child in publicClasses -%} + {%- include "synopsis_class.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicTypes") -%} + {%- for child in publicTypes -%} + {%- include "synopsis_type.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicAttributes") -%} + {%- for child in publicAttributes -%} + {%- include "synopsis_variable.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("publicFunctions") -%} + {%- for child in publicFunctions -%} + {%- if existsIn(child, "type") -%} + {#- If the child doesn't have a type, it's probably a -#}{{- noop() -}} + {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}} + {#- due to a bug whose nature is beyond me. 
-#}{{- noop() -}} + {%- include "synopsis_function.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endif -%} + {%- endfor -%} +{%- endif -%} +{%- if exists("defines") -%} + {%- for child in defines -%} + {%- include "synopsis_macro.tmpl" -%} + {%- set synopsis_needs_leading_line_break = true -%} + {%- endfor -%} +{%- endif -%} + + diff --git a/docs/doxybook/templates/nonclass_members_details.tmpl b/docs/doxybook/templates/nonclass_members_details.tmpl new file mode 100644 index 000000000..c941f22f7 --- /dev/null +++ b/docs/doxybook/templates/nonclass_members_details.tmpl @@ -0,0 +1,35 @@ +{%- if exists("publicClasses") -%}## Member Classes + + {%- for child in publicClasses -%} + {% include "title_nonmember.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicTypes") -%}## Types + + {%- for child in publicTypes -%} + {% include "title_nonmember.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicAttributes") %}## Variables + + {%- for child in publicAttributes -%} + {% include "title_nonmember.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("publicFunctions") %}## Functions + + {%- for child in publicFunctions -%} + {% include "title_nonmember.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} +{%- if exists("defines") %}## Macros + + {%- for child in defines -%} + {% include "title_nonmember.tmpl" %} + {{- render("member_details.tmpl", child) -}} + {%- endfor %} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_brief.tmpl b/docs/doxybook/templates/synopsis_brief.tmpl new file mode 100644 index 000000000..2f48cec1d --- /dev/null +++ b/docs/doxybook/templates/synopsis_brief.tmpl @@ -0,0 +1,8 @@ +{%- if exists("brief") -%} + {{ noop() -}} + {%- if default(synopsis_indent_width, 0) != 0 -%} + {%- include "synopsis_indent.tmpl" -%} + 
{%- endif -%} + /* {{ brief }} */{{ noop() -}} + {{ noop() -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_class.tmpl b/docs/doxybook/templates/synopsis_class.tmpl new file mode 100644 index 000000000..a5492997c --- /dev/null +++ b/docs/doxybook/templates/synopsis_class.tmpl @@ -0,0 +1,16 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{#- The Doxygen metadata that a parent has on its nested -#}{{ noop() -}} +{#- classes doesn't include their template parameters. -#}{{ noop() -}} +{#- Fortunately, we have the refid of the nested class, so -#}{{ noop() -}} +{#- we can just load the data from their page. -#}{{ noop() -}} +{%- set child_class = load(child.refid) -%} +{%- set child_class.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_template_parameters.tmpl", child_class) -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {{- render("synopsis_kind_abbreviated.tmpl", child) -}} + {% include "name.tmpl" %};{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_friend_class.tmpl b/docs/doxybook/templates/synopsis_friend_class.tmpl new file mode 100644 index 000000000..39f23bb09 --- /dev/null +++ b/docs/doxybook/templates/synopsis_friend_class.tmpl @@ -0,0 +1,14 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{- render("synopsis_template_parameters.tmpl", child) -}} +{#- Unfortunately, the refid and URL for a friend class -#}{{ noop() -}} +{#- incorrectly refers to a definition on the local -#}{{ noop() -}} +{#- page, instead of the friend class's own page. -#}{{ noop() -}} +{#- So we don't link to friend classes. 
-#}{{ noop() -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {{- render("synopsis_kind_abbreviated.tmpl", child) -}} + {{- render("name_qualified.tmpl", child) -}};{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_friend_function.tmpl b/docs/doxybook/templates/synopsis_friend_function.tmpl new file mode 100644 index 000000000..440989c23 --- /dev/null +++ b/docs/doxybook/templates/synopsis_friend_function.tmpl @@ -0,0 +1,19 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{- render("synopsis_template_parameters.tmpl", child) -}} +{#- Unfortunately, the refid and URL for a friend class -#}{{ noop() -}} +{#- incorrectly refers to a definition on the local -#}{{ noop() -}} +{#- page, instead of the friend class's own page. -#}{{ noop() -}} +{#- So we don't link to friend classes. -#}{{ noop() -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + friend {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}} + +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {{- render("name_qualified.tmpl", child) -}}{{ noop() -}} + ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}} + {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_function.tmpl b/docs/doxybook/templates/synopsis_function.tmpl new file mode 100644 index 000000000..93a3e822e --- /dev/null +++ b/docs/doxybook/templates/synopsis_function.tmpl @@ -0,0 +1,12 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{- render("synopsis_template_parameters.tmpl", child) -}} +{{- 
render("synopsis_function_type_and_leading_specifiers.tmpl", child) -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {% include "name.tmpl" %}{{ noop() -}} + ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}} + {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{- noop() -}} + diff --git a/docs/doxybook/templates/synopsis_function_parameters.tmpl b/docs/doxybook/templates/synopsis_function_parameters.tmpl new file mode 100644 index 000000000..204a52c50 --- /dev/null +++ b/docs/doxybook/templates/synopsis_function_parameters.tmpl @@ -0,0 +1,11 @@ +{%- for param in params -%} + {%- if not loop.is_first -%}  {%- endif -%} + {{- param.type -}} + {%- if not isEmpty(param.name) %} {% endif -%} + {{- param.name -}} + {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%} + {%- if not loop.is_last -%} + , + {{- noop() }}{% include "synopsis_indent.tmpl" -%} + {%- endif -%} +{%- endfor -%} diff --git a/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl new file mode 100644 index 000000000..bbde0f1dd --- /dev/null +++ b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl @@ -0,0 +1,5 @@ +{%- if const %} const{% endif -%} +{%- if override %} override{% endif -%} +{%- if default %} = default{% endif -%} +{%- if deleted %} = deleted{% endif -%} +{%- if pureVirtual %} = 0{% endif -%} diff --git a/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl new file mode 100644 index 000000000..5cde64d28 --- /dev/null +++ b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl @@ -0,0 +1,6 @@ +{%- if default(virtual, false) or default(static, false) or default(explicit, false) or default(type, false) -%} + {{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {%- include 
"synopsis_type_and_leading_specifiers.tmpl" -%} + {{ noop() -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_indent.tmpl b/docs/doxybook/templates/synopsis_indent.tmpl new file mode 100644 index 000000000..a2d7193a6 --- /dev/null +++ b/docs/doxybook/templates/synopsis_indent.tmpl @@ -0,0 +1,5 @@ +{%- if default(synopsis_indent_width, false) -%} + {%- for i in range(synopsis_indent_width) -%} +  {{ noop() -}} + {%- endfor -%} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_inherited_from.tmpl b/docs/doxybook/templates/synopsis_inherited_from.tmpl new file mode 100644 index 000000000..fd88b649c --- /dev/null +++ b/docs/doxybook/templates/synopsis_inherited_from.tmpl @@ -0,0 +1,4 @@ +{%- if default(synopsis_is_inherited, false) != false -%} + {%- set base.synopsis_indent_width = default(synopsis_indent_width, 0) -%} + {{- render("synopsis_inherited_from_comment.tmpl", base) -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl new file mode 100644 index 000000000..4afda1250 --- /dev/null +++ b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl @@ -0,0 +1,8 @@ +{{ noop() -}} + {%- if default(synopsis_indent_width, 0) != 0 -%} + {%- include "synopsis_indent.tmpl" -%} + {%- endif -%} + /* Inherited from {{ noop() -}} + {%- include "name_qualified.tmpl" -%}{{ noop() -}} + */{{ noop() -}} +{{ noop() -}} diff --git a/docs/doxybook/templates/synopsis_initializer.tmpl b/docs/doxybook/templates/synopsis_initializer.tmpl new file mode 100644 index 000000000..dd159979d --- /dev/null +++ b/docs/doxybook/templates/synopsis_initializer.tmpl @@ -0,0 +1,3 @@ +{%- if kind == "using" %} = {{ escape(type) -}} +{%- else if exists("initializer") %} {{ escape(initializer) -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl new file mode 100644 
index 000000000..2bc4d4856 --- /dev/null +++ b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl @@ -0,0 +1 @@ +{% if kind == "using" or exists("initializer") %} = see below{% endif -%} diff --git a/docs/doxybook/templates/synopsis_kind.tmpl b/docs/doxybook/templates/synopsis_kind.tmpl new file mode 100644 index 000000000..34cd602a9 --- /dev/null +++ b/docs/doxybook/templates/synopsis_kind.tmpl @@ -0,0 +1,9 @@ +{%- if kind == "interface" %}class {{ noop() -}} +{%- else if kind == "namespace" %}namespace {{ noop() -}} +{%- else if kind == "typedef" %}typedef {{ type -}} +{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%} {{ noop() -}} +{%- else if kind == "friend" %}friend {{ noop() -}} + {%- if type == "class" or type == "struct" %}{{ type }} {% endif -%} +{%- else if kind == "define" %}#define {{ noop() -}} +{%- else %}{{ kind }} {{ noop() -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl new file mode 100644 index 000000000..881582773 --- /dev/null +++ b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl @@ -0,0 +1,9 @@ +{%- if kind == "interface" %}class {{ noop() -}} +{%- else if kind == "namespace" %}namespace {{ noop() -}} +{%- else if kind == "typedef" %}typedef see below {{ noop() -}} +{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%} +{%- else if kind == "friend" %}friend {{ noop() -}} + {%- if type == "class" or type == "struct" %}{{type}} {% endif -%} +{%- else if kind == "define" %}#define {{ noop() -}} +{%- else %}{{ kind }} {{ noop() -}} +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_leading_line_break.tmpl b/docs/doxybook/templates/synopsis_leading_line_break.tmpl new file mode 100644 index 000000000..13a1574e3 --- /dev/null +++ b/docs/doxybook/templates/synopsis_leading_line_break.tmpl @@ -0,0 +1,3 @@ +{%- if default(synopsis_needs_leading_line_break, false) -%} +
+{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_macro.tmpl b/docs/doxybook/templates/synopsis_macro.tmpl new file mode 100644 index 000000000..612773439 --- /dev/null +++ b/docs/doxybook/templates/synopsis_macro.tmpl @@ -0,0 +1,7 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{ noop() -}} + {{- render("synopsis_kind.tmpl", child) -}} + {{- render("name_qualified.tmpl", child) -}}{{ noop() -}} + {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl new file mode 100644 index 000000000..682f615c9 --- /dev/null +++ b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl @@ -0,0 +1,7 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{ noop() -}} + {{- render("synopsis_kind_abbreviated.tmpl", child) -}} + {{- render("name_qualified.tmpl", child) -}} { }{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl new file mode 100644 index 000000000..682f615c9 --- /dev/null +++ b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl @@ -0,0 +1,7 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{ noop() -}} + {{- render("synopsis_kind_abbreviated.tmpl", child) -}} + {{- render("name_qualified.tmpl", child) -}} { }{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_template_parameters.tmpl b/docs/doxybook/templates/synopsis_template_parameters.tmpl new file mode 100644 index 000000000..4391c3d99 --- /dev/null +++ 
b/docs/doxybook/templates/synopsis_template_parameters.tmpl @@ -0,0 +1,14 @@ +{%- if exists("templateParams") -%} + {% include "synopsis_indent.tmpl" -%}template <{{ noop() -}} + {%- for param in templateParams -%} + {%- if not loop.is_first %}{% include "synopsis_indent.tmpl" -%}  {% endif -%} + {{- param.type -}} + {%- if not isEmpty(param.name) %} {% endif -%} + {{- param.name -}} + {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%} + {%- if not loop.is_last -%} + , + {{- noop() }}{% include "synopsis_indent.tmpl" -%} + {%- endif -%} + {%- endfor -%}> +{%- endif -%} diff --git a/docs/doxybook/templates/synopsis_type.tmpl b/docs/doxybook/templates/synopsis_type.tmpl new file mode 100644 index 000000000..586555f08 --- /dev/null +++ b/docs/doxybook/templates/synopsis_type.tmpl @@ -0,0 +1,11 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{- render("synopsis_template_parameters.tmpl", child) -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {{- render("synopsis_kind_abbreviated.tmpl", child) -}} + {% include "name.tmpl" %}{{ noop() -}} + {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}} + diff --git a/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl new file mode 100644 index 000000000..12136020f --- /dev/null +++ b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl @@ -0,0 +1,4 @@ +{%- if default(virtual, false) %}virtual {% endif -%} +{%- if default(static, false) %}static {% endif -%} +{%- if default(explicit, false) %}explicit {% endif -%} +{%- if exists("type") %}{{ type }} {% endif -%} diff --git a/docs/doxybook/templates/synopsis_variable.tmpl b/docs/doxybook/templates/synopsis_variable.tmpl new file mode 
100644 index 000000000..52c48da50 --- /dev/null +++ b/docs/doxybook/templates/synopsis_variable.tmpl @@ -0,0 +1,11 @@ +{%- include "synopsis_leading_line_break.tmpl" -%} +{%- include "synopsis_inherited_from.tmpl" -%} +{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%} +{{- render("synopsis_brief.tmpl", child) -}} +{{- render("synopsis_template_parameters.tmpl", child) -}} +{{ noop() -}} + {%- include "synopsis_indent.tmpl" -%} + {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}} + {% include "name.tmpl" %}{{ noop() -}} + {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}} + diff --git a/docs/doxybook/templates/table_header_brief.tmpl b/docs/doxybook/templates/table_header_brief.tmpl new file mode 100644 index 000000000..ed13f970f --- /dev/null +++ b/docs/doxybook/templates/table_header_brief.tmpl @@ -0,0 +1,2 @@ +| Name | Description | +|------|-------------| diff --git a/docs/doxybook/templates/table_header_enum.tmpl b/docs/doxybook/templates/table_header_enum.tmpl new file mode 100644 index 000000000..cdf95bc6f --- /dev/null +++ b/docs/doxybook/templates/table_header_enum.tmpl @@ -0,0 +1,2 @@ +| Enumerator | Value | Description | +|------------|-------|-------------| diff --git a/docs/doxybook/templates/table_row_brief.tmpl b/docs/doxybook/templates/table_row_brief.tmpl new file mode 100644 index 000000000..1d599755f --- /dev/null +++ b/docs/doxybook/templates/table_row_brief.tmpl @@ -0,0 +1 @@ +| **[`{{name}}`]({{url}})** | {% if exists("brief") %}{{brief}}{% endif %} | diff --git a/docs/doxybook/templates/table_row_enum.tmpl b/docs/doxybook/templates/table_row_enum.tmpl new file mode 100644 index 000000000..77c205be3 --- /dev/null +++ b/docs/doxybook/templates/table_row_enum.tmpl @@ -0,0 +1 @@ +| `{{ name }}` | {% if exists("initializer") -%}`{{ escape(replace(initializer, "= ", "")) }}`{%- endif %} | {% if exists("brief") -%}{{ brief }}{%- endif %} | diff --git 
a/docs/doxybook/templates/title_kind.tmpl b/docs/doxybook/templates/title_kind.tmpl new file mode 100644 index 000000000..100db2e84 --- /dev/null +++ b/docs/doxybook/templates/title_kind.tmpl @@ -0,0 +1,4 @@ +{%- if child.kind == "using" %}Type Alias{{ noop() -}} +{%- else -%}{{ title(child.kind) -}} +{%- endif -%} +{%- if child.kind == "enum" and child.strong %} Class{%- endif -%} diff --git a/docs/doxybook/templates/title_leading.tmpl b/docs/doxybook/templates/title_leading.tmpl new file mode 100644 index 000000000..54eb7e967 --- /dev/null +++ b/docs/doxybook/templates/title_leading.tmpl @@ -0,0 +1,4 @@ +

+{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%} + {{ noop() -}} +{%- endif -%} diff --git a/docs/doxybook/templates/title_member.tmpl b/docs/doxybook/templates/title_member.tmpl new file mode 100644 index 000000000..50e70f378 --- /dev/null +++ b/docs/doxybook/templates/title_member.tmpl @@ -0,0 +1,4 @@ +{%- include "title_leading.tmpl" -%} + {%- include "title_kind.tmpl" -%} + {{- noop() }} {% include "name_qualified.tmpl" %}::{{ render("name_unqualified.tmpl", child) }} +{%- include "title_trailing.tmpl" -%} diff --git a/docs/doxybook/templates/title_nonmember.tmpl b/docs/doxybook/templates/title_nonmember.tmpl new file mode 100644 index 000000000..4ea9797fd --- /dev/null +++ b/docs/doxybook/templates/title_nonmember.tmpl @@ -0,0 +1,5 @@ +{%- include "title_leading.tmpl" -%} + {%- include "title_kind.tmpl" -%} + {{- noop() }} {{render("name_qualified.tmpl", child)}} +{%- include "title_trailing.tmpl" -%} + diff --git a/docs/doxybook/templates/title_trailing.tmpl b/docs/doxybook/templates/title_trailing.tmpl new file mode 100644 index 000000000..fcc4f24e6 --- /dev/null +++ b/docs/doxybook/templates/title_trailing.tmpl @@ -0,0 +1,4 @@ +{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%} + +{%- endif -%} +

diff --git a/doc/thrust.dox b/docs/doxygen/config.dox similarity index 82% rename from doc/thrust.dox rename to docs/doxygen/config.dox index b74f436f5..7e06e3545 100644 --- a/doc/thrust.dox +++ b/docs/doxygen/config.dox @@ -1,4 +1,4 @@ -# Doxyfile 1.8.13 +# Doxyfile 1.9.3 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = thrust +PROJECT_NAME = Thrust # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. 
-OUTPUT_DIRECTORY = doc +OUTPUT_DIRECTORY = # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -187,7 +187,17 @@ SHORT_NAMES = NO # description.) # The default value is: NO. -JAVADOC_AUTOBRIEF = NO +JAVADOC_AUTOBRIEF = YES + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If @@ -209,6 +219,14 @@ QT_AUTOBRIEF = NO MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. @@ -220,7 +238,7 @@ INHERIT_DOCS = YES # of the file/class/namespace that contains it. # The default value is: NO. -SEPARATE_MEMBER_PAGES = YES +SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. @@ -232,20 +250,19 @@ TAB_SIZE = 8 # the documentation. 
An alias has the form: # name=value # For example adding -# "sideeffect=@par Side Effects:\n" +# "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all @@ -274,28 +291,40 @@ OPTIMIZE_FOR_FORTRAN = NO OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. 
Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. 
# The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -307,7 +336,7 @@ MARKDOWN_SUPPORT = YES # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 0. +# Minimum value: 0, maximum value: 99, default value: 5. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 @@ -337,7 +366,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -423,6 +452,19 @@ TYPEDEF_HIDES_STRUCT = NO LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will base this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1.
+ +NUM_PROC_THREADS = 1 + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -443,6 +485,12 @@ EXTRACT_ALL = NO EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -480,6 +528,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -497,11 +552,11 @@ HIDE_UNDOC_MEMBERS = NO HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. -HIDE_FRIEND_COMPOUNDS = NO +HIDE_FRIEND_COMPOUNDS = YES # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these @@ -517,11 +572,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. 
If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -540,6 +602,12 @@ HIDE_SCOPE_NAMES = NO HIDE_COMPOUND_REFERENCE= NO +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. + +SHOW_HEADERFILE = YES + # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. @@ -563,7 +631,7 @@ FORCE_LOCAL_INCLUDES = NO # documentation for inline members. # The default value is: YES.
-INLINE_INFO = YES +INLINE_INFO = NO # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member @@ -666,21 +734,21 @@ MAX_INITIALIZER_LINES = 30 # list will mention the files that were used to generate the documentation. # The default value is: YES. -SHOW_USED_FILES = YES +SHOW_USED_FILES = NO # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. -SHOW_FILES = YES +SHOW_FILES = NO # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. -SHOW_NAMESPACES = YES +SHOW_NAMESPACES = NO # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from @@ -697,7 +765,8 @@ FILE_VERSION_FILTER = # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE @@ -708,7 +777,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. 
See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -743,23 +812,35 @@ WARNINGS = YES WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. 
If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = NO @@ -776,7 +857,10 @@ WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When - is given as +# the file name, the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -790,14 +874,13 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = thrust \ - examples +INPUT = thrust # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -810,11 +893,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings.
+# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. +# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, +# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C +# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = @@ -831,7 +918,7 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = examples +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -847,13 +934,13 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = */detail/* +EXCLUDE_PATTERNS = *detail* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test +# ANamespace::AClass, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* @@ -969,7 +1056,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. 
# The default value is: NO. REFERENCED_BY_RELATION = YES @@ -1001,12 +1088,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1028,25 +1115,6 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. 
- -CLANG_OPTIONS = - #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1056,14 +1124,7 @@ CLANG_OPTIONS = # classes, structs, unions or interfaces. # The default value is: YES. -ALPHABETICAL_INDEX = NO - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 +ALPHABETICAL_INDEX = YES # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag @@ -1080,7 +1141,7 @@ IGNORE_PREFIX = # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. -GENERATE_HTML = YES +GENERATE_HTML = NO # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1088,7 +1149,7 @@ GENERATE_HTML = YES # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_OUTPUT = html +HTML_OUTPUT = build_docs/doxygen/html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). @@ -1164,8 +1225,8 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# this color. Hue is specified as an angle on a color-wheel, see +# https://en.wikipedia.org/wiki/Hue for more information. 
For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1174,7 +1235,7 @@ HTML_EXTRA_FILES = HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A +# in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1201,6 +1262,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consist of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1224,13 +1296,14 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory.
Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1244,6 +1317,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1269,8 +1349,12 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# on Windows. In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline (the HTML help workshop was already many years +# in maintenance mode).
You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1300,7 +1384,7 @@ CHM_FILE = HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1345,7 +1429,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1353,8 +1438,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1362,30 +1447,30 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). 
+# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1419,7 +1504,7 @@ ECLIPSE_DOC_ID = org.doxygen.Project # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -DISABLE_INDEX = NO +DISABLE_INDEX = YES # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag @@ -1428,16 +1513,28 @@ DISABLE_INDEX = NO # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. +# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. 
# @@ -1462,6 +1559,17 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML @@ -1471,7 +1579,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1482,8 +1590,14 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. 
When # enabled you may also need to install MathJax separately and configure the path @@ -1493,11 +1607,29 @@ FORMULA_TRANSPARENT = YES USE_MATHJAX = NO +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + # When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# the MathJax output. For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. +# compatibility. This is the name for MathJax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for MathJax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1510,22 +1642,29 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax.
However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example +# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html +# #tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1553,7 +1692,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = NO # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. 
When EXTERNAL_SEARCH is enabled the indexing @@ -1572,7 +1711,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1585,8 +1725,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1637,21 +1778,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the latter case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_CMD_NAME = latex +LATEX_CMD_NAME = # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1667,7 +1822,7 @@ COMPACT_LATEX = NO # The default value is: a4. # This tag requires that the tag GENERATE_LATEX is set to YES. -PAPER_TYPE = a4wide +PAPER_TYPE = a4 # The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names # that should be included in the LaTeX output. The package can be specified just @@ -1681,29 +1836,31 @@ PAPER_TYPE = a4wide EXTRA_PACKAGES = -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for +# the generated LaTeX document. The header should contain everything until the +# first chapter. If it is left blank doxygen will generate a standard header. It +# is highly recommended to start with a default header using +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty +# and then modify the file new_header.tex. 
See also section "Doxygen usage" for +# information on how to generate the default header that doxygen normally uses. # -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. +# Note: Only use a user-defined header if you know what you are doing! +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. The following +# commands have a special meaning inside the header (and footer): For a +# description of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HEADER = -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. If it is left blank doxygen will generate a standard footer. See +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for +# the generated LaTeX document. The footer should contain everything after the +# last chapter. If it is left blank doxygen will generate a standard footer. See # LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! +# special commands can be used inside the footer. See also section "Doxygen +# usage" for information on how to generate the default footer that doxygen +# normally uses. Note: Only use a user-defined footer if you know what you are +# doing! # This tag requires that the tag GENERATE_LATEX is set to YES. 
LATEX_FOOTER = @@ -1734,20 +1891,21 @@ LATEX_EXTRA_FILES = # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. -PDF_HYPERLINKS = NO +PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. -USE_PDFLATEX = NO +USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode # command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. +# if errors occur, instead of asking the user for help. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1760,19 +1918,9 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_SOURCE_CODE = NO - # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. 
# This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1786,6 +1934,14 @@ LATEX_BIB_STYLE = plain LATEX_TIMESTAMP = NO +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1825,9 +1981,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1836,22 +1992,12 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. 
-# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_SOURCE_CODE = NO - #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -1904,7 +2050,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = NO +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1912,7 +2058,7 @@ GENERATE_XML = NO # The default directory is: xml. # This tag requires that the tag GENERATE_XML is set to YES. -XML_OUTPUT = xml +XML_OUTPUT = build_docs/doxygen/xml # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program # listings (including syntax highlighting and cross-referencing information) to @@ -1923,6 +2069,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1941,23 +2094,14 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. Note that enabling this will significantly -# increase the size of the DOCBOOK output. 
-# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_PROGRAMLISTING = NO - #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2057,15 +2201,12 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = THRUST_NOEXCEPT=noexcept \ - "THRUST_DEFAULT={}" \ - "THRUST_NODISCARD=[[nodiscard]]" \ - "THRUST_MR_DEFAULT_ALIGNMENT=alignof(max_align_t)" \ - "THRUST_FINAL=final" \ - "THRUST_OVERRIDE=" \ - "THRUST_BEGIN_NS=namespace thrust {" \ - "THRUST_END_NS=}" \ - "cuda_cub=system::cuda" +PREDEFINED = THRUST_DOXYGEN \ + THRUST_CPP_DIALECT=2017 \ + THRUST_NODISCARD=[[nodiscard]] \ + THRUST_MR_DEFAULT_ALIGNMENT=alignof(std::max_align_t) \ + "THRUST_NAMESPACE_BEGIN=namespace thrust {" \ + THRUST_NAMESPACE_END=} # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2130,36 +2271,12 @@ EXTERNAL_GROUPS = YES # be listed. # The default value is: YES. -EXTERNAL_PAGES = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). 
-# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl +EXTERNAL_PAGES = NO #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2178,7 +2295,7 @@ HIDE_UNDOC_RELATIONS = YES # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: YES. +# The default value is: NO. HAVE_DOT = NO @@ -2216,13 +2333,16 @@ DOT_FONTSIZE = 10 DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. 
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. -CLASS_GRAPH = YES +CLASS_GRAPH = NO # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation @@ -2231,14 +2351,14 @@ CLASS_GRAPH = YES # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -COLLABORATION_GRAPH = YES +COLLABORATION_GRAPH = NO # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for # groups, showing the direct groups dependencies. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -GROUP_GRAPHS = YES +GROUP_GRAPHS = NO # If the UML_LOOK tag is set to YES, doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling @@ -2257,10 +2377,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. 
If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will be wrapped across multiple lines. Some heuristics are applied +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2276,7 +2418,7 @@ TEMPLATE_RELATIONS = NO # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -INCLUDE_GRAPH = YES +INCLUDE_GRAPH = NO # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing @@ -2285,7 +2427,7 @@ INCLUDE_GRAPH = YES # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -INCLUDED_BY_GRAPH = YES +INCLUDED_BY_GRAPH = NO # If the CALL_GRAPH tag is set to YES then doxygen will generate a call # dependency graph for every global function or class method. @@ -2316,7 +2458,7 @@ CALLER_GRAPH = NO # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -GRAPHICAL_HIERARCHY = YES +GRAPHICAL_HIERARCHY = NO # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way.
The @@ -2325,7 +2467,14 @@ GRAPHICAL_HIERARCHY = YES # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. -DIRECTORY_GRAPH = YES +DIRECTORY_GRAPH = NO + +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section @@ -2334,9 +2483,7 @@ DIRECTORY_GRAPH = YES # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. @@ -2382,10 +2529,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. 
Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2447,14 +2594,18 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc temporary +# files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/docs/generate_markdown.bash b/docs/generate_markdown.bash new file mode 100755 index 000000000..3b711db10 --- /dev/null +++ b/docs/generate_markdown.bash @@ -0,0 +1,106 @@ +#! /usr/bin/env bash + +############################################################################### +# Copyright (c) 2018-2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### + +set -e + +function usage { + echo "Usage: ${0} [flags...]" + echo + echo "Generate Thrust documentation markdown with Doxygen and Doxybook that " + echo "can be served with Jekyll." + echo + echo "-h, -help, --help" + echo " Print this message." + echo + echo "-c, --clean" + echo " Delete the all existing build artifacts before generating the " + echo " markdown." + + exit -3 +} + +LOCAL=0 +CLEAN=0 + +while test ${#} != 0 +do + case "${1}" in + -h) ;& + -help) ;& + --help) usage ;; + -c) ;& + --clean) CLEAN=1 ;; + esac + shift +done + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +REPO_PATH=${SCRIPT_PATH}/.. + +BUILD_DOCS_PATH=build_docs +BUILD_DOXYGEN_PATH=${BUILD_DOCS_PATH}/doxygen +BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages + +cd ${REPO_PATH} + +if [[ "${CLEAN}" == 1 ]]; then + rm -rf ${BUILD_DOXYGEN_PATH} + rm -rf ${BUILD_GITHUB_PAGES_PATH} +fi + +mkdir -p ${BUILD_DOXYGEN_PATH}/xml +mkdir -p ${BUILD_GITHUB_PAGES_PATH} +mkdir -p ${BUILD_GITHUB_PAGES_PATH}/api +mkdir -p ${BUILD_GITHUB_PAGES_PATH}/contributing +mkdir -p ${BUILD_GITHUB_PAGES_PATH}/releases + +# Copy all the documentation sources and Jekyll configuration into +# `{BUILD_GITHUB_PAGES_PATH}`. +cp -ur docs/github_pages/* ${BUILD_GITHUB_PAGES_PATH}/ +cp README.md ${BUILD_GITHUB_PAGES_PATH}/overview.md +cp CODE_OF_CONDUCT.md ${BUILD_GITHUB_PAGES_PATH}/contributing/code_of_conduct.md +cp CHANGELOG.md ${BUILD_GITHUB_PAGES_PATH}/releases/changelog.md + +doxygen docs/doxygen/config.dox + +# `--debug-templates` will cause JSON output to be generated, which is useful +# for debugging. 
+doxybook2 --config docs/doxybook/config.json \ + --templates docs/doxybook/templates \ + --debug-templates \ + --input ${BUILD_DOXYGEN_PATH}/xml \ + --output ${BUILD_GITHUB_PAGES_PATH}/api + +# Doxygen and Doxybook don't give us a way to disable all the things we'd like, +# so it's important to purge Doxybook Markdown output that we don't need: +# 0) We want our Jekyll build to be as fast as possible and avoid wasting time +# on stuff we don't need. +# 1) We don't want content that we don't plan to use to either show up on the +# site index or appear in search results. +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/files +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_files.md +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/pages +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_pages.md +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/examples +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_examples.md +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/images +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_namespaces.md +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_groups.md +rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_classes.md + diff --git a/docs/github_pages/Gemfile b/docs/github_pages/Gemfile new file mode 100644 index 000000000..09d948e17 --- /dev/null +++ b/docs/github_pages/Gemfile @@ -0,0 +1,10 @@ +source "https://rubygems.org" +gem "just-the-docs" +group :jekyll_plugins do + gem "github-pages" # GitHub Pages. + gem "jekyll-optional-front-matter" # GitHub Pages. + gem "jekyll-default-layout" # GitHub Pages. + gem "jekyll-titles-from-headings" # GitHub Pages. + gem "jekyll-relative-links" # GitHub Pages. 
+ gem "jekyll-include-cache" +end diff --git a/docs/github_pages/_config.yml b/docs/github_pages/_config.yml new file mode 100644 index 000000000..c131e84fb --- /dev/null +++ b/docs/github_pages/_config.yml @@ -0,0 +1,47 @@ +title: Thrust + +repository: nvidia/thrust + +remote_theme: pmarsceill/just-the-docs + +color_scheme: nvidia +logo: /assets/images/nvidia_logo.png + +search_enabled: true +search.heading_level: 4 + +incremental: true + +# just-the-docs ignores these filenames by default. +include: [ "contributing.md", "code_of_conduct.md" ] + +exclude: [ "node_modules", "doxybook_templates", + "generate_markdown.bash", "serve_docs_locally.bash" ] + +plugins: + - jekyll-optional-front-matter # GitHub Pages. + - jekyll-default-layout # GitHub Pages. + - jekyll-titles-from-headings # GitHub Pages. + - jekyll-relative-links # GitHub Pages. + - jekyll-include-cache + +defaults: + - + scope: + path: overview.md + values: + title: Overview + nav_order: 0 + permalink: / + - + scope: + path: contributing/code_of_conduct.md + values: + parent: Contributing + nav_order: 2 + - + scope: + path: releases/changelog.md + values: + parent: Releases + nav_order: 0 diff --git a/docs/github_pages/_sass/color_schemes/nvidia.scss b/docs/github_pages/_sass/color_schemes/nvidia.scss new file mode 100644 index 000000000..4b44fa222 --- /dev/null +++ b/docs/github_pages/_sass/color_schemes/nvidia.scss @@ -0,0 +1,145 @@ +$body-line-height: 1.4; +$content-line-height: 1.4; +.highlight { line-height: 1.0 !important; } + +/* h1 size. We make this smaller so the README title fits on one line. */ +$font-size-9: 30px; + +/* Inline code. */ +code, +code.highlighter-rouge +{ font-size: 0.85em !important; } + +/* Code blocks. */ +pre.highlight code { font-size: 0.9em !important; } + +/* Doxybook generated code snippets. */ +code.doxybook { display: block; } + +/* Line wrap with an indent of four characters in Doxybook-generated code snippets. 
*/ +code.doxybook span +{ display: block; text-indent: -4ex !important; padding-left: 4ex !important; } + +/* Line wrap with an indent of eight characters in Doxybook-generated code snippets. */ +code.doxybook span +{ display: block; text-indent: -8ex !important; padding-left: 8ex !important; } + +/* Disable line wrap for indent s. */ +code.doxybook +{ display: block; text-indent: 0ex !important; padding-left: 0ex !important; } + +h3 { margin-bottom: 1.0em !important; } + +$nav-width: 300px; + +$body-background-color: $grey-dk-300; +$sidebar-color: $grey-dk-300; +$border-color: $grey-dk-200; + +$body-text-color: $grey-lt-300; +$body-heading-color: $grey-lt-000; +$nav-child-link-color: $grey-dk-000; +$search-result-preview-color: $grey-dk-000; + +$link-color: #76b900; +$btn-primary-color: #76b900; +$base-button-color: $grey-dk-250; + +$code-background-color: $grey-dk-250; +$search-background-color: $grey-dk-250; +$table-background-color: $grey-dk-250; +$feedback-color: darken($sidebar-color, 3%); + +div.highlighter-rouge, +pre.highlight code, +code.doxybook +{ background-color: #111 !important; } + +span.doxybook-comment code +{ background-color: #111 !important; border: none !important; } + +.highlight span.err { color: #ff0000; font-weight: bold; } /* Error */ + +.highlight span.ow, /* Operator.Word */ +.highlight span.k, /* Keyword */ +.highlight span.kc, /* Keyword.Constant */ +.highlight span.kd, /* Keyword.Declaration */ +.highlight span.kp, /* Keyword.Pseudo */ +.highlight span.kr, /* Keyword.Reserved */ +.highlight span.bp, /* Name.Builtin.Pseudo */ +.highlight span.vc, /* Name.Variable.Class */ +.highlight span.vg, /* Name.Variable.Global */ +.highlight span.vi /* Name.Variable.Instance */ +{ color: #76b900; font-weight: bold; } + +.highlight span.n, /* Name */ +.highlight span.h, /* Name */ +.highlight span.na, /* Name.Attribute */ +.highlight span.nb, /* Name.Builtin */ +.highlight span.nc, /* Name.Class */ +.highlight span.no, /* Name.Constant */ 
+.highlight span.nd, /* Name.Decorator */ +.highlight span.ni, /* Name.Entity */ +.highlight span.ne, /* Name.Exception */ +.highlight span.nf, /* Name.Function */ +.highlight span.nl, /* Name.Label */ +.highlight span.nn, /* Name.Namespace */ +.highlight span.nx, /* Name.Other */ +.highlight span.py, /* Name.Property */ +.highlight span.nt, /* Name.Tag */ +.highlight span.nv, /* Name.Variable */ +.highlight span.kt /* Keyword.Type */ +{ color: $grey-lt-300 } + +.highlight span.c, /* Comment */ +.highlight span.cm, /* Comment.Multiline */ +.highlight span.c1, /* Comment.Single */ +.highlight span.cs, /* Comment.Special */ +span.doxybook-comment +{ color: #009966; font-family: $body-font-family; font-style: italic; } + +.highlight span.cp, /* Preprocessor */ +.highlight span.kn /* Keyword.Namespace */ +{ color: $grey-dk-000 } + +.highlight span.o, /* Operator */ +.highlight span.p /* Punctuation */ +{ color: #00ff00; } + +.highlight span.ge { font-style: italic; } /* Generic.Emph */ + +.highlight span.gs { font-weight: bold; } /* Generic.Strong */ + +.highlight span.l, /* Literal */ +.highlight span.ld, /* Literal.Date */ +.highlight span.m, /* Literal.Number */ +.highlight span.mf, /* Literal.Number.Float */ +.highlight span.mh, /* Literal.Number.Hex */ +.highlight span.mi, /* Literal.Number.Integer */ +.highlight span.mo, /* Literal.Number.Oct */ +.highlight span.il, /* Literal.Number.Integer.Long */ +.highlight span.s, /* Literal.String */ +.highlight span.sb, /* Literal.String.Backtick */ +.highlight span.sc, /* Literal.String.Char */ +.highlight span.sd, /* Literal.String.Doc */ +.highlight span.s2, /* Literal.String.Double */ +.highlight span.se, /* Literal.String.Escape */ +.highlight span.sh, /* Literal.String.Heredoc */ +.highlight span.si, /* Literal.String.Interpol */ +.highlight span.sx, /* Literal.String.Other */ +.highlight span.sr, /* Literal.String.Regex */ +.highlight span.s1, /* Literal.String.Single */ +.highlight span.ss /* Literal.String.Symbol
*/ +{ color: #119911; } + +.highlight span.w { color: #00cc00; } /* Text.Whitespace */ + +.highlight span.gh, /* Generic.Heading */ +.highlight span.gp, /* Generic.Prompt */ +.highlight span.gu /* Generic.Subheading */ +{ color: #00ff00; font-weight: bold; } + +.highlight span.gd { color: #ff0000; } /* Generic.Deleted */ +.highlight span.gi { color: #00ff00; } /* Generic.Inserted */ + +.search-input { color: $body-text-color; } diff --git a/docs/github_pages/api.md b/docs/github_pages/api.md new file mode 100644 index 000000000..6a2d1af43 --- /dev/null +++ b/docs/github_pages/api.md @@ -0,0 +1,8 @@ +--- +has_children: true +has_toc: true +nav_order: 2 +--- + +# API + diff --git a/docs/github_pages/assets/images/nvidia_logo.png b/docs/github_pages/assets/images/nvidia_logo.png new file mode 100644 index 000000000..6b005a283 Binary files /dev/null and b/docs/github_pages/assets/images/nvidia_logo.png differ diff --git a/docs/github_pages/contributing.md b/docs/github_pages/contributing.md new file mode 100644 index 000000000..6539768c4 --- /dev/null +++ b/docs/github_pages/contributing.md @@ -0,0 +1,10 @@ +--- +has_children: true +has_toc: true +nav_order: 4 +--- + +# Contributing + +We welcome contributions - just send us a pull request! + diff --git a/docs/github_pages/contributing/release_process.md b/docs/github_pages/contributing/release_process.md new file mode 100644 index 000000000..db21f60b4 --- /dev/null +++ b/docs/github_pages/contributing/release_process.md @@ -0,0 +1,85 @@ +--- +parent: Contributing +nav_order: 1 +--- + +# Release Process + +## Create a Changelog Entry + +Every release must have a changelog entry. +The changelog entry should include: +* A summary of the major accomplishments of the release. +* A list of all the changes in the release. +* A list of all the bugs fixed by the release. + +Contributions from new collaborators should be acknowledged in the changelog. 
+ +## Create Git Annotated Tags and GitHub Releases + +Each release needs to have a Git annotated tag and a GitHub release for that tag. +The changelog for the release should be used for the text of the GitHub release. + +## Update Compiler Explorer + +Thrust and CUB are bundled together on +[Compiler Explorer](https://www.godbolt.org/) (CE) as libraries for the CUDA +language. When releasing a new version of these projects, CE will need to be +updated. + +There are two files in two repos that need to be updated: + +### libraries.yaml + +- Repo: https://github.com/compiler-explorer/infra +- Path: bin/yaml/libraries.yaml + +This file tells CE how to pull in library files and defines which versions to +fetch. Look for the `thrustcub:` section: + +```yaml + thrustcub: + type: github + method: clone_branch + repo: NVIDIA/thrust + check_file: dependencies/cub/cub/cub.cuh + targets: + - 1.9.9 + - 1.9.10 + - 1.9.10-1 + - 1.10.0 +``` + +Simply add the new version tag to the list of `targets:`. This will check out the +specified tag to `/opt/compiler-explorer/libs/thrustcub/<tag>/`. + +### cuda.amazon.properties + +- Repo: https://github.com/compiler-explorer/compiler-explorer +- File: etc/config/cuda.amazon.properties + +This file defines the library versions displayed in the CE UI and maps them +to a set of include directories.
Look for the `libs.thrustcub` section: + +```yaml +libs.thrustcub.name=Thrust+CUB +libs.thrustcub.description=CUDA collective and parallel algorithms +libs.thrustcub.versions=trunk:109090:109100:109101:110000 +libs.thrustcub.url=http://www.github.com/NVIDIA/thrust +libs.thrustcub.versions.109090.version=1.9.9 +libs.thrustcub.versions.109090.path=/opt/compiler-explorer/libs/thrustcub/1.9.9:/opt/compiler-explorer/libs/thrustcub/1.9.9/dependencies/cub +libs.thrustcub.versions.109100.version=1.9.10 +libs.thrustcub.versions.109100.path=/opt/compiler-explorer/libs/thrustcub/1.9.10:/opt/compiler-explorer/libs/thrustcub/1.9.10/dependencies/cub +libs.thrustcub.versions.109101.version=1.9.10-1 +libs.thrustcub.versions.109101.path=/opt/compiler-explorer/libs/thrustcub/1.9.10-1:/opt/compiler-explorer/libs/thrustcub/1.9.10-1/dependencies/cub +libs.thrustcub.versions.110000.version=1.10.0 +libs.thrustcub.versions.110000.path=/opt/compiler-explorer/libs/thrustcub/1.10.0:/opt/compiler-explorer/libs/thrustcub/1.10.0/dependencies/cub +libs.thrustcub.versions.trunk.version=trunk +libs.thrustcub.versions.trunk.path=/opt/compiler-explorer/libs/thrustcub/trunk:/opt/compiler-explorer/libs/thrustcub/trunk/dependencies/cub +``` + +Add a new version identifier to the `libs.thrustcub.versions` key, using the +convention `X.Y.Z-W -> XXYYZZWW`. Then add a corresponding UI label (the +`version` key) and set of colon-separated include paths for Thrust and CUB +(`path`). The version used in the `path` entries must exactly match the tag +specified in `libraries.yaml`. 
diff --git a/docs/github_pages/contributing/submitting_a_pr.md b/docs/github_pages/contributing/submitting_a_pr.md new file mode 100644 index 000000000..9c1757655 --- /dev/null +++ b/docs/github_pages/contributing/submitting_a_pr.md @@ -0,0 +1,295 @@ +--- +parent: Contributing +nav_order: 0 +--- + +# Submitting a PR + +Thrust uses Github to manage all open-source development, including bug +tracking, pull requests, and design discussions. This document details how to get +started as a Thrust contributor. + +An overview of this process is: + +1. [Clone the Thrust repository](#clone-the-thrust-repository) +1. [Setup a fork of Thrust](#setup-a-fork-of-thrust) +1. [Setup your environment](#setup-your-environment) +1. [Create a development branch](#create-a-development-branch) +1. [Local development loop](#local-development-loop) +1. [Push development branch to your fork](#push-development-branch-to-your-fork) +1. [Create pull request](#create-pull-request) +1. [Address feedback and update pull request](#address-feedback-and-update-pull-request) +1. [When your PR is approved...](#when-your-pr-is-approved) + +## Clone the Thrust Repository + +To get started, clone the main repository to your local computer. Thrust should +be cloned recursively to set up the CUB submodule (required for `CUDA` +acceleration). + +``` +git clone --recursive https://github.com/NVIDIA/thrust.git +cd thrust +``` + +## Setup a Fork of Thrust + +You'll need a fork of Thrust on Github to create a pull request. To set up your +fork: + +1. Create a Github account (if needed) +2. Go to [the Thrust Github page](https://github.com/NVIDIA/thrust) +3. Click "Fork" and follow any prompts that appear. + +Once your fork is created, set up a new remote repo in your local Thrust clone: + +``` +git remote add github-fork git@github.com:<GITHUB_USERNAME>/thrust.git +``` + +If you need to modify CUB, too, go to +[the CUB Github page](https://github.com/NVIDIA/cub) and repeat this process.
+Create CUB's `github-fork` remote in the `thrust/dependencies/cub` submodule. + +## Setup Your Environment + +### Git Environment + +If you haven't already, this is a good time to tell git who you are. This +information is used to fill out authorship information on your git commits. + +``` +git config --global user.name "John Doe" +git config --global user.email johndoe@example.com +``` + +### Configure CMake builds + +Thrust uses [CMake](https://www.cmake.org) for its primary build system. To +configure, build, and test your checkout of Thrust: + +``` +# Create build directory: +mkdir build +cd build + +# Configure -- use one of the following: +cmake .. # Command line interface +cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON .. # Enables CUB development targets +ccmake .. # ncurses GUI (Linux only) +cmake-gui # Graphical UI, set source/build directories in the app + +# Build: +cmake --build . -j # invokes make (or ninja, etc) + +# Run tests and examples: +ctest +``` + +See [CMake Options](../setup/cmake_options.md) for details on customizing the build. To +enable CUB tests and examples, set the `THRUST_INCLUDE_CUB_CMAKE` option to +`ON`. Additional CMake options for CUB are listed +[here](https://github.com/NVIDIA/cub/blob/main/CONTRIBUTING.md#cmake-options). + +## Create a Development Branch + +All work should be done in a development branch (also called a "topic branch") +and not directly in the `main` branch. This makes it easier to manage multiple +in-progress patches at once, and provides a descriptive label for your patch +as it passes through the review system. 
+ +To create a new branch based on the current `main`: + +``` +# Checkout local main branch: +cd /path/to/thrust/sources +git checkout main + +# Sync local main branch with github: +git pull + +# Create a new branch named `my_descriptive_branch_name` based on main: +git checkout -b my_descriptive_branch_name + +# Verify that the branch has been created and is currently checked out: +git branch +``` + +Thrust branch names should follow a particular pattern: + +- For new features, name the branch `feature/<name>` +- For bugfixes associated with a github issue, use `bug/github/<bug-description>-<bug-id>` + - Internal nvidia and gitlab bugs should use `nvidia` or `gitlab` in place of + `github`. + +If you plan to work on CUB as part of your patch, repeat this process in the +`thrust/dependencies/cub` submodule. + +## Local Development Loop + +### Edit, Build, Test, Repeat + +Once the topic branch is created, you're all set to start working on Thrust +code. Make some changes, then build and test them: + +``` +# Implement changes: +cd /path/to/thrust/sources +emacs thrust/some_file.h # or whatever editor you prefer + +# Create / update a unit test for your changes: +emacs testing/some_test.cu + +# Check that everything builds and tests pass: +cd /path/to/thrust/build/directory +cmake --build .
-j +ctest +``` + +### Creating a Commit + +Once you're satisfied with your patch, commit your changes: + +#### Thrust-only Changes + +``` +# Manually add changed files and create a commit: +cd /path/to/thrust +git add thrust/some_file.h +git add testing/some_test.cu +git commit + +# Or, if possible, use git-gui to review your changes while building your patch: +git gui +``` + +#### Thrust and CUB Changes + +``` +# Create CUB patch first: +cd /path/to/thrust/dependencies/cub +# Manually add changed files and create a commit: +git add cub/some_file.cuh +git commit + +# Create Thrust patch, including submodule update: +cd /path/to/thrust/ +git add dependencies/cub # Updates submodule info +git add thrust/some_file.h +git add testing/some_test.cu +git commit + +# Or, if possible, use git-gui to review your changes while building your patch: +cd /path/to/thrust/dependencies/cub +git gui +cd /path/to/thrust +git gui # Include dependencies/cub as part of your commit + +``` + +#### Writing a Commit Message + +Your commit message will communicate the purpose and rationale behind your +patch to other developers, and will be used to populate the initial description +of your Github pull request. + +When writing a commit message, the following standard format should be used, +since tools in the git ecosystem are designed to parse this correctly: + +``` +First line of commit message is a short summary (<80 char) + +Detailed description of change begins on third line. This portion can +span multiple lines, try to manually wrap them at something reasonable. + +Blank lines can be used to separate multiple paragraphs in the description. + +If your patch is associated with another pull request or issue in the main +Thrust repository, you should reference it with a `#` symbol, e.g. +#1023 for issue 1023. + +For issues / pull requests in a different github repo, reference them using +the full syntax, e.g. NVIDIA/cub#4 for issue 4 in the NVIDIA/cub repo. 
+ +Markdown is recommended for formatting more detailed messages, as these will +be nicely rendered on Github, etc. +``` + +## Push Development Branch to your Fork + +Once you've committed your changes to a local development branch, it's time to +push them to your fork: + +``` +cd /path/to/thrust/checkout +git checkout my_descriptive_branch_name # if not already checked out +git push --set-upstream github-fork my_descriptive_branch_name +``` + +`--set-upstream github-fork` tells git that future pushes/pulls on this branch +should target your `github-fork` remote by default. + +If you have CUB changes to commit as part of your patch, repeat this process in the +`thrust/dependencies/cub` submodule. + +## Create Pull Request + +To create a pull request for your freshly pushed branch, open your github fork +in a browser by going to `https://www.github.com/<GITHUB_USERNAME>/thrust`. A +prompt may automatically appear asking you to create a pull request if you've +recently pushed a branch. + +If there's no prompt, go to "Code" > "Branches" and click the appropriate +"New pull request" button for your branch. + +If you would like a specific developer to review your patch, feel free to +request them as a reviewer at this time. + +The Thrust team will review your patch, test it on NVIDIA's internal CI, and +provide feedback. + + +If you have CUB changes to commit as part of your patch, repeat this process with +your CUB branch and fork. + +## Address Feedback and Update Pull Request + +If the reviewers request changes to your patch, use the following process to +update the pull request: + +``` +# Make changes: +cd /path/to/thrust/sources +git checkout my_descriptive_branch_name +emacs thrust/some_file.h +emacs testing/some_test.cu + +# Build + test +cd /path/to/thrust/build/directory +cmake --build .
-j +ctest + +# Amend commit: +cd /path/to/thrust/sources +git add thrust/some_file.h +git add testing/some_test.cu +git commit --amend +# Or +git gui # Check the "Amend Last Commit" box + +# Update the branch on your fork: +git push -f +``` + +At this point, the pull request should show your recent changes. + +If you have CUB changes to commit as part of your patch, repeat this process in the +`thrust/dependencies/cub` submodule, and be sure to include any CUB submodule +updates as part of your commit. + +## When Your PR is Approved + +Once your pull request is approved by the Thrust team, no further action is +needed from you. We will handle integrating it since we must coordinate changes +to `main` with NVIDIA's internal Perforce repository. + diff --git a/docs/github_pages/favicon.ico b/docs/github_pages/favicon.ico new file mode 100644 index 000000000..424df8720 Binary files /dev/null and b/docs/github_pages/favicon.ico differ diff --git a/docs/github_pages/releases.md b/docs/github_pages/releases.md new file mode 100644 index 000000000..81a5f2f3d --- /dev/null +++ b/docs/github_pages/releases.md @@ -0,0 +1,60 @@ +--- +has_children: true +has_toc: true +nav_order: 3 +--- + +# Releases + +| Version | Included In | +|-----------------|-------------------------------------------| +| 2.0.1 | CUDA Toolkit 12.0 | +| 2.0.0 | TBD | +| 1.17.2 | TBD | +| 1.17.1 | TBD | +| 1.17.0 | TBD | +| 1.16.0 | TBD | +| 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 | +| 1.14.0 | NVIDIA HPC SDK 21.9 | +| 1.13.1 | CUDA Toolkit 11.5 | +| 1.13.1 | CUDA Toolkit 11.5 | +| 1.13.0 | NVIDIA HPC SDK 21.7 | +| 1.12.1 | CUDA Toolkit 11.4 | +| 1.12.0 | NVIDIA HPC SDK 21.3 | +| 1.11.0 | CUDA Toolkit 11.3 | +| 1.10.0 | NVIDIA HPC SDK 20.9 & CUDA Toolkit 11.2 | +| 1.9.10-1 | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1 | +| 1.9.10 | NVIDIA HPC SDK 20.5 | +| 1.9.9 | CUDA Toolkit 11.0 | +| 1.9.8-1 | NVIDIA HPC SDK 20.3 | +| 1.9.8 | CUDA Toolkit 11.0 Early Access | +| 1.9.7-1 | CUDA Toolkit 10.2 for Tegra |
+| 1.9.7 | CUDA Toolkit 10.2 | +| 1.9.6-1 | NVIDIA HPC SDK 20.3 | +| 1.9.6 | CUDA Toolkit 10.1 Update 2 | +| 1.9.5 | CUDA Toolkit 10.1 Update 1 | +| 1.9.4 | CUDA Toolkit 10.1 | +| 1.9.3 | CUDA Toolkit 10.0 | +| 1.9.2 | CUDA Toolkit 9.2 | +| 1.9.1-2 | CUDA Toolkit 9.1 | +| 1.9.0-5 | CUDA Toolkit 9.0 | +| 1.8.3 | CUDA Toolkit 8.0 | +| 1.8.2 | CUDA Toolkit 7.5 | +| 1.8.1 | CUDA Toolkit 7.0 | +| 1.8.0 | | +| 1.7.2 | CUDA Toolkit 6.5 | +| 1.7.1 | CUDA Toolkit 6.0 | +| 1.7.0 | CUDA Toolkit 5.5 | +| 1.6.0 | | +| 1.5.3 | CUDA Toolkit 5.0 | +| 1.5.2 | CUDA Toolkit 4.2 | +| 1.5.1 | CUDA Toolkit 4.1 | +| 1.5.0 | | +| 1.4.0 | CUDA Toolkit 4.0 | +| 1.3.0 | | +| 1.2.1 | | +| 1.2.0 | | +| 1.1.1 | | +| 1.1.0 | | +| 1.0.0 | | + diff --git a/docs/github_pages/releases/versioning.md b/docs/github_pages/releases/versioning.md new file mode 100644 index 000000000..e5f0e8eb1 --- /dev/null +++ b/docs/github_pages/releases/versioning.md @@ -0,0 +1,71 @@ +--- +parent: Releases +nav_order: 1 +--- + +# Versioning + +Thrust has its own versioning system for releases, independent of the + versioning scheme of the NVIDIA HPC SDK or the CUDA Toolkit. + +Today, Thrust version numbers have a specific [semantic meaning](https://semver.org/). +Releases prior to 1.10.0 largely, but not strictly, followed these semantic + meanings. + +The version number for a Thrust release uses the following format: + `MMM.mmm.ss-ppp`, where: + +* `THRUST_VERSION_MAJOR`/`MMM`: Major version, up to 3 decimal digits. + It is incremented when changes that are API-backwards-incompatible are made. +* `THRUST_VERSION_MINOR`/`mmm`: Minor version, up to 3 decimal digits. + It is incremented when breaking API, ABI, or semantic changes are made. +* `THRUST_VERSION_SUBMINOR`/`ss`: Subminor version, up to 2 decimal digits. + It is incremented when notable new features or bug fixes or features that are + API-backwards-compatible are made. +* `THRUST_PATCH_NUMBER`/`ppp`: Patch number, up to 3 decimal digits. 
+ This is no longer used and will be zero for all future releases. + +The `<thrust/version.h>` header defines `THRUST_*` macros for all of the + version components mentioned above. +Additionally, a `THRUST_VERSION` macro is defined, which is an integer literal + containing all of the version components except for `THRUST_PATCH_NUMBER`. + +## Trunk Based Development + +Thrust uses [trunk based development](https://trunkbaseddevelopment.com). +There is a single long-lived branch called `main`, which is public and the + "source of truth". +All other branches are downstream from `main`. +Engineers may create branches for feature development. +Such branches always merge into `main`. +There are no release branches. +Releases are produced by taking a snapshot of `main` ("snapping"). +After a release has been snapped from `main`, it will never be changed. + +## Branches and Tags + +The following tag names are used in the Thrust project: + +* `nvhpc-X.Y`: the tag that directly corresponds to what has been + shipped in the NVIDIA HPC SDK release X.Y. +* `cuda-X.Y`: the tag that directly corresponds to what has been shipped + in the CUDA Toolkit release X.Y. +* `A.B.C`: the tag that directly corresponds to Thrust version A.B.C. +* `A.B.C-rcN`: the tag that directly corresponds to Thrust version A.B.C + release candidate N. + +The following branch names are used in the Thrust project: + +* `main`: the "source of truth" development branch of Thrust. +* `old-master`: the old "source of truth" branch, before unification of + public and internal repositories. +* `feature/<name>`: feature branch for a feature under development. +* `bug/<bug-system>/<bug-description>-<bug-id>`: bug fix branch, where + `bug-system` is `github` or `nvidia`. + +On the rare occasion that we cannot do work in the open, for example when + developing a change specific to an unreleased product, these branches may + exist on an internal NVIDIA GitLab instance instead of the public GitHub.
+By default, everything should be in the open on GitHub unless there is a strong + motivation for it to not be open. + diff --git a/docs/github_pages/setup.md b/docs/github_pages/setup.md new file mode 100644 index 000000000..edbef2e5c --- /dev/null +++ b/docs/github_pages/setup.md @@ -0,0 +1,7 @@ +--- +has_children: true +has_toc: true +nav_order: 1 +--- + +# Setup diff --git a/docs/github_pages/setup/cmake_options.md b/docs/github_pages/setup/cmake_options.md new file mode 100644 index 000000000..b62faddeb --- /dev/null +++ b/docs/github_pages/setup/cmake_options.md @@ -0,0 +1,139 @@ +--- +parent: Setup +nav_order: 1 +--- + +# CMake Options + +A Thrust build is configured using CMake options. These may be passed to CMake +using + +``` +cmake -D<option_name>=<value> /path/to/thrust/sources +``` + +or configured interactively with the `ccmake` or `cmake-gui` interfaces. + +Thrust supports two build modes. By default, a single configuration is built +that targets a specific host system, device system, and C++ dialect. +When `THRUST_ENABLE_MULTICONFIG` is `ON`, multiple configurations +targeting a variety of systems and dialects are generated. + +The CMake options are divided into these categories: + +1. [Generic CMake Options](#generic-cmake-options): Options applicable to all + Thrust builds. +1. [Single Config CMake Options](#single-config-cmake-options): Options + applicable only when `THRUST_ENABLE_MULTICONFIG` is disabled. +1. [Multi Config CMake Options](#multi-config-cmake-options): Options applicable + only when `THRUST_ENABLE_MULTICONFIG` is enabled. +1. [CUDA Specific CMake Options](#cuda-specific-cmake-options): Options that + control CUDA compilation. Only available when one or more configurations + target the CUDA system. +1. [TBB Specific CMake Options](#tbb-specific-cmake-options): Options that + control TBB compilation. Only available when one or more configurations + target the TBB system.
+ +## Generic CMake Options + +- `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}` + - Standard CMake build option. Default: `RelWithDebInfo` +- `THRUST_ENABLE_HEADER_TESTING={ON, OFF}` + - Whether to test compile public headers. Default is `ON`. +- `THRUST_ENABLE_TESTING={ON, OFF}` + - Whether to build unit tests. Default is `ON`. +- `THRUST_ENABLE_EXAMPLES={ON, OFF}` + - Whether to build examples. Default is `ON`. +- `THRUST_ENABLE_MULTICONFIG={ON, OFF}` + - Toggles single-config and multi-config modes. Default is `OFF` (single config). +- `THRUST_ENABLE_EXAMPLE_FILECHECK={ON, OFF}` + - Enable validation of example outputs using the LLVM FileCheck utility. + Default is `OFF`. +- `THRUST_ENABLE_INSTALL_RULES={ON, OFF}` + - If true, installation rules will be generated for thrust. Default is `ON`. + +## Single Config CMake Options + +- `THRUST_HOST_SYSTEM={CPP, TBB, OMP}` + - Selects the host system. Default: `CPP` +- `THRUST_DEVICE_SYSTEM={CUDA, TBB, OMP, CPP}` + - Selects the device system. Default: `CUDA` +- `THRUST_CPP_DIALECT={11, 14, 17}` + - Selects the C++ standard dialect to use. Default is `14` (C++14). + +## Multi Config CMake Options + +- `THRUST_MULTICONFIG_ENABLE_DIALECT_CPPXX={ON, OFF}` + - Toggle whether a specific C++ dialect will be targeted. + - Possible values of `XX` are `{11, 14, 17}`. + - By default, only C++14 is enabled. +- `THRUST_MULTICONFIG_ENABLE_SYSTEM_XXXX={ON, OFF}` + - Toggle whether a specific system will be targeted. + - Possible values of `XXXX` are `{CPP, CUDA, TBB, OMP}` + - By default, only `CPP` and `CUDA` are enabled. +- `THRUST_MULTICONFIG_WORKLOAD={SMALL, MEDIUM, LARGE, FULL}` + - Restricts the host/device combinations that will be targeted. + - By default, the `SMALL` workload is used. + - The full cross product of `host x device` systems results in 12 + configurations, some of which are more important than others. + This option can be used to prune some of the less important ones. 
+ - `SMALL`: (3 configs) Minimal coverage and validation of each device system against the `CPP` host. + - `MEDIUM`: (6 configs) Cheap extended coverage. + - `LARGE`: (8 configs) Expensive extended coverage. Includes all useful build configurations. + - `FULL`: (12 configs) The complete cross product of all possible build configurations. + +| Config | Workloads | Value | Expense | Note | +|----------|-----------|------------|-----------|------------------------------| +| CPP/CUDA | `F L M S` | Essential | Expensive | Validates CUDA against CPP | +| CPP/OMP | `F L M S` | Essential | Cheap | Validates OMP against CPP | +| CPP/TBB | `F L M S` | Essential | Cheap | Validates TBB against CPP | +| CPP/CPP | `F L M ` | Important | Cheap | Tests CPP as device | +| OMP/OMP | `F L M ` | Important | Cheap | Tests OMP as host | +| TBB/TBB | `F L M ` | Important | Cheap | Tests TBB as host | +| TBB/CUDA | `F L ` | Important | Expensive | Validates TBB/CUDA interop | +| OMP/CUDA | `F L ` | Important | Expensive | Validates OMP/CUDA interop | +| TBB/OMP | `F ` | Not useful | Cheap | Mixes CPU-parallel systems | +| OMP/TBB | `F ` | Not useful | Cheap | Mixes CPU-parallel systems | +| TBB/CPP | `F ` | Not Useful | Cheap | Parallel host, serial device | +| OMP/CPP | `F ` | Not Useful | Cheap | Parallel host, serial device | + +## CUDA Specific CMake Options + +- `THRUST_INCLUDE_CUB_CMAKE={ON, OFF}` + - If enabled, the CUB project will be built as part of Thrust. Default is + `OFF`. + - This adds CUB tests, etc. Useful for working on both CUB and Thrust + simultaneously. + - CUB configurations will be generated for each C++ dialect targeted by + the current Thrust build. +- `THRUST_INSTALL_CUB_HEADERS={ON, OFF}` + - If enabled, the CUB project's headers will be installed through Thrust's + installation rules. Default is `ON`. + - This option depends on `THRUST_ENABLE_INSTALL_RULES`. 
+- `THRUST_ENABLE_COMPUTE_XX={ON, OFF}` + - Controls the targeted CUDA architecture(s). + - Multiple options may be selected when using NVCC as the CUDA compiler. + - Valid values of `XX` are: + `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}` + - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`. +- `THRUST_ENABLE_COMPUTE_FUTURE={ON, OFF}` + - If enabled, CUDA objects will target the most recent virtual architecture + in addition to the real architectures specified by the + `THRUST_ENABLE_COMPUTE_XX` options. + - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`. +- `THRUST_DISABLE_ARCH_BY_DEFAULT={ON, OFF}` + - When `ON`, all `THRUST_ENABLE_COMPUTE_*` options are initially `OFF`. + - Default: `OFF` (meaning all architectures are enabled by default) +- `THRUST_ENABLE_TESTS_WITH_RDC={ON, OFF}` + - Whether to enable Relocatable Device Code when building tests. + Default is `OFF`. +- `THRUST_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}` + - Whether to enable Relocatable Device Code when building examples. + Default is `OFF`. + +## TBB Specific CMake Options + +- `THRUST_TBB_ROOT=<path>` + - When the TBB system is requested, set this to the root of the TBB installation + (e.g. the location of `lib/`, `bin/` and `include/` for the TBB libraries). + diff --git a/docs/github_pages/setup/requirements.md b/docs/github_pages/setup/requirements.md new file mode 100644 index 000000000..9d5316456 --- /dev/null +++ b/docs/github_pages/setup/requirements.md @@ -0,0 +1,82 @@ +--- +parent: Setup +nav_order: 0 +--- + +# Requirements + +All requirements are applicable to the `main` branch on GitHub. +For details on specific releases, please see the [CHANGELOG.md]. + +## Usage Requirements + +To use Thrust and CUB, you must meet the following + requirements. + +### System Software + +Thrust and CUB require either the [NVIDIA HPC SDK] or the [CUDA Toolkit]. + +Releases of Thrust and CUB are only tested against the latest releases of NVHPC + and CUDA.
+It may be possible to use newer versions of Thrust and CUB with an older NVHPC or + CUDA installation by using a Thrust and CUB release from GitHub, but please + be aware that this is not officially supported. + +### C++ Dialects + +Thrust and CUB support the following C++ dialects: + +- C++11 (deprecated) +- C++14 +- C++17 + +### Compilers + +Thrust and CUB support the following compilers when used in conjunction with + NVCC: + +- NVCC (latest version) +- NVC++ (latest version) +- GCC 5+ +- Clang 7+ +- MSVC 2019+ (19.20/16.0/14.20) + +Unsupported versions may emit deprecation warnings, which can be + silenced by defining `THRUST_IGNORE_DEPRECATED_COMPILER` during compilation. + +### Device Architectures + +Thrust and CUB support all NVIDIA device architectures since SM 35. + +### Host Architectures + +Thrust and CUB support the following host architectures: + +- aarch64. +- x86-64. +- ppc64le. + +### Host Operating Systems + +Thrust and CUB support the following host operating systems: + +- Linux. +- Windows. + +## Build and Test Requirements + +To build and test Thrust and CUB yourself, you will need the following in + addition to the above requirements: + +- [CMake]. + + + +[CHANGELOG.md]: ./releases/changelog.md + +[NVIDIA HPC SDK]: https://developer.nvidia.com/hpc-sdk +[CUDA Toolkit]: https://developer.nvidia.com/cuda-toolkit + +[CMake]: https://cmake.org + diff --git a/docs/serve_docs_locally.bash b/docs/serve_docs_locally.bash new file mode 100755 index 000000000..f438795e4 --- /dev/null +++ b/docs/serve_docs_locally.bash @@ -0,0 +1,35 @@ +#! /usr/bin/env bash + +############################################################################### +# Copyright (c) 2018-2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### + +SCRIPT_PATH=$(cd "$(dirname "${0}")" && pwd -P) + +REPO_PATH=${SCRIPT_PATH}/.. + +BUILD_DOCS_PATH=build_docs +BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages + +cd "${REPO_PATH}/${BUILD_GITHUB_PAGES_PATH}" || exit 1 + +bundle install +bundle exec jekyll serve \ + --verbose \ + --incremental \ + --profile \ + --baseurl "/thrust" \ + "${@}" + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..306ecb7a3 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,157 @@ +# Setup FileCheck if requested and available: +option(THRUST_ENABLE_EXAMPLE_FILECHECK + "Check example output with the LLVM FileCheck utility." + OFF +) +set(filecheck_data_path "${Thrust_SOURCE_DIR}/internal/test") + +if (THRUST_ENABLE_EXAMPLE_FILECHECK) + # TODO this should go into a find module + find_program(THRUST_FILECHECK_EXECUTABLE + DOC "Path to the LLVM FileCheck utility." + NAMES + FileCheck + FileCheck-3.9 + FileCheck-4.0 + FileCheck-5.0 + FileCheck-6.0 + FileCheck-7 + FileCheck-8 + FileCheck-9 + ) + + if (NOT THRUST_FILECHECK_EXECUTABLE) + message(FATAL_ERROR + "Could not find the LLVM FileCheck utility. Set THRUST_FILECHECK_EXECUTABLE manually, " + "or disable THRUST_ENABLE_EXAMPLE_FILECHECK."
+ ) + endif() + + execute_process( + COMMAND "${THRUST_FILECHECK_EXECUTABLE}" "${filecheck_data_path}/thrust.smoke.filecheck" + INPUT_FILE "${Thrust_SOURCE_DIR}/cmake/filecheck_smoke_test" + RESULT_VARIABLE exit_code + ) + + if (0 EQUAL exit_code) + message(STATUS "FileCheck enabled: ${THRUST_FILECHECK_EXECUTABLE}") + else() + message(FATAL_ERROR + "The current THRUST_FILECHECK_EXECUTABLE ('${THRUST_FILECHECK_EXECUTABLE}') " + "does not seem to be a valid FileCheck executable." + ) + endif() +endif() + +# Create meta targets that build all examples for a single configuration: +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + set(config_meta_target ${config_prefix}.examples) + add_custom_target(${config_meta_target}) + add_dependencies(${config_prefix}.all ${config_meta_target}) +endforeach() + +# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake -- +# these flag variables behave unintuitively: +if (THRUST_ENABLE_EXAMPLES_WITH_RDC) + set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}") +else() + set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}") +endif() + +## thrust_add_example +# +# Add an example executable and register it with ctest. +# +# target_name_var: Variable name to overwrite with the name of the example +# target. Useful for post-processing target information per-backend. +# example_name: The name of the example minus ".example." For +# instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu +# would be "cuda.copy". +# example_src: The source file that implements the example. +# thrust_target: The reference thrust target with configuration information. 
+# +function(thrust_add_example target_name_var example_name example_src thrust_target) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Wrap the .cu file in .cpp for non-CUDA backends + if ("CUDA" STREQUAL "${config_device}") + set(real_example_src "${example_src}") + else() + thrust_wrap_cu_in_cpp(real_example_src "${example_src}" ${thrust_target}) + endif() + + # The actual name of the example's target: + set(example_target ${config_prefix}.example.${example_name}) + set(${target_name_var} ${example_target} PARENT_SCOPE) + + # Related target names: + set(config_meta_target ${config_prefix}.examples) + set(example_meta_target thrust.all.example.${example_name}) + + add_executable(${example_target} "${real_example_src}") + target_link_libraries(${example_target} ${thrust_target}) + target_include_directories(${example_target} PRIVATE "${Thrust_SOURCE_DIR}/examples") + thrust_clone_target_properties(${example_target} ${thrust_target}) + thrust_fix_clang_nvcc_build_for(${example_target}) + + # Add to the active configuration's meta target + add_dependencies(${config_meta_target} ${example_target}) + + # Meta target that builds examples with this name for all configurations: + if (NOT TARGET ${example_meta_target}) + add_custom_target(${example_meta_target}) + endif() + add_dependencies(${example_meta_target} ${example_target}) + + if ("CUDA" STREQUAL "${config_device}" AND + THRUST_ENABLE_EXAMPLES_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${example_target}) + endif() + + if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + target_compile_definitions(${example_target} PRIVATE THRUST_EXAMPLE_DEVICE_SIDE) + endif() + + # Get the name of FileCheck input by stripping out the config name. + # (e.g.
"thrust.cpp.cuda.cpp14.example.xxx" -> "thrust.example.xxx.filecheck") + string(REPLACE "${config_prefix}" "thrust" + filecheck_reference_file + "${example_target}.filecheck" + ) + + add_test(NAME ${example_target} + COMMAND "${CMAKE_COMMAND}" + "-DEXAMPLE_EXECUTABLE=$<TARGET_FILE:${example_target}>" + "-DFILECHECK_ENABLED=${THRUST_ENABLE_EXAMPLE_FILECHECK}" + "-DFILECHECK_EXECUTABLE=${THRUST_FILECHECK_EXECUTABLE}" + "-DREFERENCE_FILE=${filecheck_data_path}/${filecheck_reference_file}" + -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunExample.cmake" + ) + + # Run OMP/TBB tests in serial. Multiple OMP processes will massively + # oversubscribe the machine with GCC's OMP, and we want to test these with + # the full CPU available to each unit test. + set(config_systems ${config_host} ${config_device}) + if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems)) + set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON) + endif() +endfunction() + +file(GLOB example_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + foreach(example_src IN LISTS example_srcs) + get_filename_component(example_name "${example_src}" NAME_WLE) + thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target}) + endforeach() +endforeach() + +add_subdirectory(cmake) +add_subdirectory(cuda) diff --git a/examples/README b/examples/README.md similarity index 56% rename from examples/README rename to examples/README.md index 4188534fe..8a43897bb 100644 --- a/examples/README +++ b/examples/README.md @@ -4,8 +4,4 @@ norm example.
$ nvcc norm.cu -o norm These examples are also available online: - https://github.com/thrust/thrust/tree/master/examples - -For additional information refer to the Quick Start Guide: - https://github.com/thrust/thrust/wiki/Quick-Start-Guide - + https://github.com/NVIDIA/thrust/tree/main/examples diff --git a/examples/arbitrary_transformation.cu b/examples/arbitrary_transformation.cu index d1a15096f..be22c2e5a 100644 --- a/examples/arbitrary_transformation.cu +++ b/examples/arbitrary_transformation.cu @@ -3,6 +3,12 @@ #include #include +#include + +#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#include +#endif // >= C++11 + // This example shows how to implement an arbitrary transformation of // the form output[i] = F(first[i], second[i], third[i], ... ). // In this example, we use a function with 3 inputs and 1 output. @@ -22,6 +28,10 @@ // D[i] = A[i] + B[i] * C[i]; // by invoking arbitrary_functor() on each of the tuples using for_each. // +// If we are using a functor that is not designed for zip iterators by taking a +// tuple instead of individual arguments we can adapt this function using the +// zip_function adaptor (C++11 only). +// // Note that we could extend this example to implement functions with an // arbitrary number of input arguments by zipping more sequence together. // With the same approach we can have multiple *output* sequences, if we @@ -31,7 +41,7 @@ // // The possibilities are endless! 
:) -struct arbitrary_functor +struct arbitrary_functor1 { template __host__ __device__ @@ -42,6 +52,17 @@ struct arbitrary_functor } }; +#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +struct arbitrary_functor2 +{ + __host__ __device__ + void operator()(const float& a, const float& b, const float& c, float& d) + { + // D[i] = A[i] + B[i] * C[i]; + d = a + b * c; + } +}; +#endif // >= C++11 int main(void) { @@ -49,7 +70,7 @@ int main(void) thrust::device_vector A(5); thrust::device_vector B(5); thrust::device_vector C(5); - thrust::device_vector D(5); + thrust::device_vector D1(5); // initialize input vectors A[0] = 3; B[0] = 6; C[0] = 2; @@ -59,12 +80,26 @@ int main(void) A[4] = 2; B[4] = 8; C[4] = 3; // apply the transformation - thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin())), - thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end(), D.end())), - arbitrary_functor()); + thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D1.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end(), D1.end())), + arbitrary_functor1()); + + // print the output + std::cout << "Tuple functor" << std::endl; + for(int i = 0; i < 5; i++) + std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i] << std::endl; + + // apply the transformation using zip_function +#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) + thrust::device_vector D2(5); + thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D2.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end(), D2.end())), + thrust::make_zip_function(arbitrary_functor2())); // print the output + std::cout << "N-ary functor" << std::endl; for(int i = 0; i < 5; i++) - std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl; + std::cout << A[i] << " + " << B[i] << " * " << 
C[i] << " = " << D2[i] << std::endl; +#endif // >= C++11 } diff --git a/examples/cmake/CMakeLists.txt b/examples/cmake/CMakeLists.txt new file mode 100644 index 000000000..25d2a2f95 --- /dev/null +++ b/examples/cmake/CMakeLists.txt @@ -0,0 +1,28 @@ +thrust_update_system_found_flags() + +set(extra_cmake_flags) + +# Need to pass these when testing NVC++. +if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + set(extra_cmake_flags + -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}" + -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}" + ) +endif() + +if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND) + # Do a basic check of the cmake/ThrustAddSubdir.cmake mechanism: + add_test( + NAME thrust.example.cmake.add_subdir + COMMAND "${CMAKE_COMMAND}" + --log-level=VERBOSE + -G "${CMAKE_GENERATOR}" + -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir" + -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir" + -D "THRUST_ROOT=${Thrust_SOURCE_DIR}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" + -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + ${extra_cmake_flags} + ) +endif() diff --git a/examples/cmake/add_subdir/CMakeLists.txt b/examples/cmake/add_subdir/CMakeLists.txt new file mode 100644 index 000000000..96283699f --- /dev/null +++ b/examples/cmake/add_subdir/CMakeLists.txt @@ -0,0 +1,91 @@ +# This example demonstrates / tests adding thrust via a CMake add_subdirectory +# call from a parent project. +# +# The variables THRUST_REQUIRED_SYSTEMS and THRUST_OPTIONAL_SYSTEMS must be +# set prior to add_subdirectory(thrust), and afterwards the thrust_create_target +# function may be used to create targets with the desired systems. See +# NVIDIA/thrust/cmake/README.md for more details on thrust_create_target. 
+ +cmake_minimum_required(VERSION 3.15) + +# Silence warnings about empty CUDA_ARCHITECTURES properties on example targets: +if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + cmake_policy(SET CMP0104 OLD) +endif() + +project(ThrustAddSubDirExample CXX) + +# Add required Thrust systems to THRUST_REQUIRED_SYSTEMS. +# Options are: CPP, CUDA, TBB or OMP. +# Multiple systems may be specified. +# An error is emitted if the system is not found. +set(THRUST_REQUIRED_SYSTEMS CPP) + +# Add optional Thrust systems to THRUST_OPTIONAL_SYSTEMS. +# Options are: CPP, CUDA, TBB or OMP. +# Multiple systems may be specified. +# No error is emitted if not found. +set(THRUST_OPTIONAL_SYSTEMS CUDA) + +# Use your project's checkout of Thrust here, for most cases +# `add_subdirectory(thrust)` will be sufficient. +add_subdirectory("${THRUST_ROOT}" thrust) + +# Create a thrust target that only uses the serial CPP backend. +# See thrust/thrust/cmake/README.md for details and additional options: +thrust_create_target(ThrustCPP HOST CPP DEVICE CPP) + +# Create an executable that uses the CPP-only thrust target: +add_executable(ExecWithCPP dummy.cpp) +target_link_libraries(ExecWithCPP ThrustCPP) + +# To test for optional systems, first call thrust_update_system_found_flags to +# set the THRUST_${system}_FOUND flags in current scope. +# Required due to CMake scoping rules. 
+thrust_update_system_found_flags() + +# Create and use a Thrust target configured to use CUDA acceleration if CUDA +# is available: +if (THRUST_CUDA_FOUND) + enable_language(CUDA) + thrust_create_target(ThrustCUDA HOST CPP DEVICE CUDA) + add_executable(ExecWithCUDA dummy.cu) + target_link_libraries(ExecWithCUDA ThrustCUDA) +endif() + +# +# Validation +# + +function(assert_boolean var_name expect) + if (expect) + if (NOT ${var_name}) + message(FATAL_ERROR "'${var_name}' is false, expected true.") + endif() + else() + if (${var_name}) + message(FATAL_ERROR "'${var_name}' is true, expected false.") + endif() + endif() +endfunction() + +function(assert_target target_name) + if (NOT TARGET "${target_name}") + message(FATAL_ERROR "Target '${target_name}' not defined.") + endif() +endfunction() + +assert_boolean(THRUST_CPP_FOUND TRUE) +assert_boolean(THRUST_CUDA_FOUND TRUE) +assert_boolean(THRUST_OMP_FOUND FALSE) +assert_boolean(THRUST_TBB_FOUND FALSE) + +assert_target(ThrustCPP) +assert_target(ThrustCUDA) +assert_target(ExecWithCPP) +assert_target(ExecWithCUDA) + +thrust_debug_target(ThrustCPP "") +thrust_debug_target(ThrustCUDA "") +thrust_debug_target(ExecWithCPP "") +thrust_debug_target(ExecWithCUDA "") diff --git a/examples/cmake/add_subdir/dummy.cpp b/examples/cmake/add_subdir/dummy.cpp new file mode 100644 index 000000000..ad7b9435f --- /dev/null +++ b/examples/cmake/add_subdir/dummy.cpp @@ -0,0 +1,32 @@ +#include + +#include + +int main() +{ + std::cout << "Hello from Thrust version " << THRUST_VERSION << ":\n" + + << "Host system: " +#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP + << "CPP\n" +#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP + << "OMP\n" +#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB + << "TBB\n" +#else + << "Unknown\n" +#endif + + << "Device system: " +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP + << "CPP\n"; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + << "CUDA\n"; +#elif THRUST_DEVICE_SYSTEM == 
THRUST_DEVICE_SYSTEM_OMP + << "OMP\n"; +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB + << "TBB\n"; +#else + << "Unknown\n"; +#endif +} diff --git a/examples/cmake/add_subdir/dummy.cu b/examples/cmake/add_subdir/dummy.cu new file mode 100644 index 000000000..b5645fc3d --- /dev/null +++ b/examples/cmake/add_subdir/dummy.cu @@ -0,0 +1 @@ +#include "dummy.cpp" diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt new file mode 100644 index 000000000..bd72c58c0 --- /dev/null +++ b/examples/cuda/CMakeLists.txt @@ -0,0 +1,18 @@ +file(GLOB example_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (NOT config_device STREQUAL "CUDA") + continue() + endif() + + foreach(example_src IN LISTS example_srcs) + get_filename_component(example_name "${example_src}" NAME_WLE) + string(PREPEND example_name "cuda.") + thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target}) + endforeach() +endforeach() diff --git a/examples/cuda/async_reduce.cu b/examples/cuda/async_reduce.cu index ca21c88cb..6e1584bcc 100644 --- a/examples/cuda/async_reduce.cu +++ b/examples/cuda/async_reduce.cu @@ -1,9 +1,10 @@ +#include #include #include #include #include -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 #include #endif @@ -20,11 +21,13 @@ // std::future to wait for the result of the reduction. This method requires a compiler which supports // C++11-capable language and library constructs.
+#ifdef THRUST_EXAMPLE_DEVICE_SIDE template __global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result) { *result = thrust::reduce(thrust::cuda::par, first, last, init, binary_op); } +#endif int main() { @@ -39,7 +42,11 @@ int main() cudaStreamCreate(&s); // launch a CUDA kernel with only 1 thread on our stream +#ifdef THRUST_EXAMPLE_DEVICE_SIDE reduce_kernel<<<1,1,0,s>>>(data.begin(), data.end(), 0, thrust::plus(), result.data()); +#else + result[0] = thrust::reduce(thrust::cuda::par, data.begin(), data.end(), 0, thrust::plus()); +#endif // wait for the stream to finish cudaStreamSynchronize(s); @@ -52,7 +59,7 @@ int main() // reset the result result[0] = 0; -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 // method 2: use std::async to create asynchrony // copy all the algorithm parameters diff --git a/examples/cuda/custom_temporary_allocation.cu b/examples/cuda/custom_temporary_allocation.cu index fe08e5f95..7bba0fa9e 100644 --- a/examples/cuda/custom_temporary_allocation.cu +++ b/examples/cuda/custom_temporary_allocation.cu @@ -10,13 +10,16 @@ #include #include -// This example demonstrates how to intercept calls to get_temporary_buffer -// and return_temporary_buffer to control how Thrust allocates temporary storage -// during algorithms such as thrust::sort. The idea will be to create a simple -// cache of allocations to search when temporary storage is requested. If a hit -// is found in the cache, we quickly return the cached allocation instead of -// resorting to the more expensive thrust::cuda::malloc. -// +// This example demonstrates how to control how Thrust allocates temporary +// storage during algorithms such as thrust::sort. The idea will be to create a +// simple cache of allocations to search when temporary storage is requested. +// If a hit is found in the cache, we quickly return the cached allocation +// instead of resorting to the more expensive thrust::cuda::malloc. 
+ +// Note: Thrust now has its own caching allocator layer; if you just need a +// caching allocator, you ought to use that. This example is still useful +// as a demonstration of how to use a Thrust custom allocator. + // Note: this implementation cached_allocator is not thread-safe. If multiple // (host) threads use the same cached_allocator then they should gain exclusive // access to the allocator before accessing its methods. diff --git a/examples/cuda/explicit_cuda_stream.cu b/examples/cuda/explicit_cuda_stream.cu new file mode 100644 index 000000000..303a14723 --- /dev/null +++ b/examples/cuda/explicit_cuda_stream.cu @@ -0,0 +1,80 @@ +#include +#include // For thrust::device +#include +#include + +#include + +#include + +// This example shows how to execute a Thrust device algorithm on an explicit +// CUDA stream. The simple program below fills a vector with the numbers +// [0, 1000) (thrust::sequence) and then performs a scan operation +// (thrust::inclusive_scan) on them. Both algorithms are executed on the same +// custom CUDA stream using the CUDA execution policies. +// +// Thrust provides two execution policies that accept CUDA streams that differ +// in when/if they synchronize the stream: +// 1. thrust::cuda::par.on(stream) +// - `stream` will *always* be synchronized before an algorithm returns. +// - This is the default `thrust::device` policy when compiling with the +// CUDA device backend. +// 2. thrust::cuda::par_nosync.on(stream) +// - `stream` will only be synchronized when necessary for correctness +// (e.g., returning a result from `thrust::reduce`). This is a hint that +// may be ignored by an algorithm's implementation. 
+ +int main() +{ + thrust::device_vector d_vec(1000); + + // Create the stream: + cudaStream_t custom_stream; + cudaError_t err = cudaStreamCreate(&custom_stream); + if (err != cudaSuccess) + { + std::cerr << "Error creating stream: " << cudaGetErrorString(err) << "\n"; + return 1; + } + + // Construct a new `nosync` execution policy with the custom stream + auto nosync_exec_policy = thrust::cuda::par_nosync.on(custom_stream); + + // Fill the vector with sequential data. + // This will execute using the custom stream and the stream will *not* be + // synchronized before the function returns, meaning asynchronous work may + // still be executing after returning and the contents of `d_vec` are + // undefined. Synchronization is not needed here because the following + // `inclusive_scan` is executed on the same stream and is therefore guaranteed + // to be ordered after the `sequence` + thrust::sequence(nosync_exec_policy, d_vec.begin(), d_vec.end()); + + // Construct a new *synchronous* execution policy with the same custom stream + auto sync_exec_policy = thrust::cuda::par.on(custom_stream); + + // Compute in-place inclusive sum scan of data in the vector. + // This also executes in the custom stream, but the execution policy ensures + // the stream is synchronized before the algorithm returns. This guarantees + // there is no pending asynchronous work and the contents of `d_vec` are + // immediately accessible. 
+ thrust::inclusive_scan(sync_exec_policy, + d_vec.cbegin(), + d_vec.cend(), + d_vec.begin()); + + // This access is only valid because the stream has been synchronized + int sum = d_vec.back(); + + // Free the stream: + err = cudaStreamDestroy(custom_stream); + if (err != cudaSuccess) + { + std::cerr << "Error destroying stream: " << cudaGetErrorString(err) << "\n"; + return 1; + } + + // Print the sum: + std::cout << "sum is " << sum << std::endl; + + return 0; +} diff --git a/examples/cuda/global_device_vector.cu b/examples/cuda/global_device_vector.cu index 1419cae62..a99566796 100644 --- a/examples/cuda/global_device_vector.cu +++ b/examples/cuda/global_device_vector.cu @@ -1,3 +1,4 @@ +#include #include // If you create a global `thrust::device_vector` with the default allocator, @@ -20,7 +21,7 @@ typedef thrust::system::cuda::detail::cuda_memory_resource< thrust::cuda::pointer > device_ignore_shutdown_memory_resource; -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 template using device_ignore_shutdown_allocator = thrust::mr::stateless_resource_allocator< diff --git a/examples/cuda/range_view.cu b/examples/cuda/range_view.cu index e863a6199..2ede62047 100644 --- a/examples/cuda/range_view.cu +++ b/examples/cuda/range_view.cu @@ -226,7 +226,7 @@ int main() // print values from original device_vector Z // to ensure that range view was mapped to this vector - for (int i = 0, n = Z.size(); i < n; ++i) + for (std::size_t i = 0, n = Z.size(); i < n; ++i) { cout << "z[" << i << "]= " << Z[i] << endl; } diff --git a/examples/discrete_voronoi.cu b/examples/discrete_voronoi.cu index 93e7e5622..bfbf2242d 100644 --- a/examples/discrete_voronoi.cu +++ b/examples/discrete_voronoi.cu @@ -4,10 +4,10 @@ #include #include #include -#include +#include #include -#include +#include #include #include "include/timer.h" @@ -135,21 +135,26 @@ void generate_random_sites(thrust::host_vector &t, int Nb, int m, int n) //Export the tab to PGM image format void 
vector_to_pgm(thrust::host_vector &t, int m, int n, const char *out) { - FILE *f; + assert(static_cast(t.size()) == m * n && + "Vector size does not match image dims."); - f=fopen(out,"w+t"); - fprintf(f,"P2\n"); - fprintf(f,"%d %d\n 253\n",m,n); + std::fstream f(out, std::fstream::out); + f << "P2\n"; + f << m << " " << n << "\n"; + f << "253\n"; + + //Hash function to map values to [0,255] + auto to_grey_level = [](int in_value) -> int + { + return (71 * in_value) % 253; + }; - for(int j = 0; j < n ; j++) + for (int value : t) { - for(int i = 0; i < m ; i++) - { - fprintf(f,"%d ",(int)(71*t[j*m+i])%253); //Hash function to map values to [0,255] - } + f << to_grey_level(value) << " "; } - fprintf(f,"\n"); - fclose(f); + f << "\n"; + f.close(); } /************Main Jfa loop********************/ diff --git a/examples/dot_products_with_zip.cu b/examples/dot_products_with_zip.cu index 52e33d8e6..81ff7ac12 100644 --- a/examples/dot_products_with_zip.cu +++ b/examples/dot_products_with_zip.cu @@ -6,9 +6,9 @@ #include -// This example shows how thrust::zip_iterator can be used to create a -// 'virtual' array of structures. In this case the structure is a 3d -// vector type (Float3) whose (x,y,z) components will be stored in +// This example shows how thrust::zip_iterator can be used to create a +// 'virtual' array of structures. In this case the structure is a 3d +// vector type (Float3) whose (x,y,z) components will be stored in // three separate float arrays. The zip_iterator "zips" these arrays // into a single virtual Float3 array. @@ -54,17 +54,17 @@ int main(void) // We'll store the components of the 3d vectors in separate arrays. One set of // arrays will store the 'A' vectors and another set will store the 'B' vectors. - // This 'structure of arrays' (SoA) approach is usually more efficient than the + // This 'structure of arrays' (SoA) approach is usually more efficient than the // 'array of structures' (AoS) approach. 
The primary reason is that structures, // like Float3, don't always obey the memory coalescing rules, so they are not // efficiently transferred to and from memory. Another reason to prefer SoA to // AoS is that we don't aways want to process all members of the structure. For - // example, if we only need to look at first element of the structure then it + // example, if we only need to look at first element of the structure then it // is wasteful to load the entire structure from memory. With the SoA approach, // we can chose which elements of the structure we wish to read. thrust::device_vector A0 = random_vector(N); // x components of the 'A' vectors - thrust::device_vector A1 = random_vector(N); // y components of the 'A' vectors + thrust::device_vector A1 = random_vector(N); // y components of the 'A' vectors thrust::device_vector A2 = random_vector(N); // z components of the 'A' vectors thrust::device_vector B0 = random_vector(N); // x components of the 'B' vectors @@ -78,7 +78,7 @@ int main(void) // We'll now illustrate two ways to use zip_iterator to compute the dot // products. The first method is verbose but shows how the parts fit together. // The second method hides these details and is more concise. - + // METHOD #1 // Defining a zip_iterator type can be a little cumbersome ... 
@@ -87,24 +87,24 @@ int main(void) typedef thrust::zip_iterator Float3Iterator; // Now we'll create some zip_iterators for A and B - Float3Iterator A_first = thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())); - Float3Iterator A_last = thrust::make_zip_iterator(make_tuple(A0.end(), A1.end(), A2.end())); - Float3Iterator B_first = thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())); - + Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())); + Float3Iterator A_last = thrust::make_zip_iterator(thrust::make_tuple(A0.end(), A1.end(), A2.end())); + Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())); + // Finally, we pass the zip_iterators into transform() as if they // were 'normal' iterators for a device_vector. thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct()); // METHOD #2 - // Alternatively, we can avoid creating variables for X_first, X_last, + // Alternatively, we can avoid creating variables for X_first, X_last, // and Y_first and invoke transform() directly. 
- thrust::transform( thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())), - thrust::make_zip_iterator(make_tuple(A0.end(), A1.end(), A2.end())), - thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())), + thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A0.end(), A1.end(), A2.end())), + thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())), result.begin(), DotProduct() ); - + // Finally, we'll print a few results @@ -126,8 +126,8 @@ int main(void) std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")"; std::cout << " = "; std::cout << dot << std::endl; - } + } return 0; } - + diff --git a/examples/expand.cu b/examples/expand.cu index 4547bcd13..f61edec8f 100644 --- a/examples/expand.cu +++ b/examples/expand.cu @@ -51,7 +51,6 @@ OutputIterator expand(InputIterator1 first1, thrust::maximum()); // gather input values according to index array (output = first2[output_indices]) - OutputIterator output_end = output; thrust::advance(output_end, output_size); thrust::gather(output_indices.begin(), output_indices.end(), first2, diff --git a/examples/raw_reference_cast.cu b/examples/raw_reference_cast.cu index ec9a9783f..d6c854590 100644 --- a/examples/raw_reference_cast.cu +++ b/examples/raw_reference_cast.cu @@ -84,11 +84,9 @@ int main(void) typedef Vector::iterator Iterator; typedef thrust::device_system_tag System; - size_t N = 5; - // allocate device memory - Vector A(N); - Vector B(N); + Vector A(5); + Vector B(5); // initialize A and B thrust::sequence(A.begin(), A.end()); @@ -100,7 +98,7 @@ int main(void) // note: we must specify the System to ensure correct execution thrust::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(N), + thrust::counting_iterator(5), copy_iterators(A.begin(), B.begin())); std::cout << "After A->B Copy" << 
std::endl; diff --git a/examples/scan_matrix_by_rows.cu b/examples/scan_matrix_by_rows.cu index df303d8bd..2cf1986e9 100644 --- a/examples/scan_matrix_by_rows.cu +++ b/examples/scan_matrix_by_rows.cu @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -20,7 +21,7 @@ void scan_matrix_by_rows0(thrust::device_vector& u, int n, int m) { // We can batch the operation using `thrust::inclusive_scan_by_key`, which // scans each group of consecutive equal keys. All we need to do is generate -// the right key sequence. We want the keys for elements on the same row to +// the right key sequence. We want the keys for elements on the same row to // be identical. // So first, we define an unary function object which takes the index of an diff --git a/examples/sort.cu b/examples/sort.cu index 700fc5f3f..1bbb5d897 100644 --- a/examples/sort.cu +++ b/examples/sort.cu @@ -41,7 +41,7 @@ void initialize(thrust::device_vector& v1, thrust::device_vector& v2) for(size_t i = 0; i < v1.size(); i++) { v1[i] = dist(rng); - v2[i] = i; + v2[i] = static_cast(i); } } diff --git a/examples/sorting_aos_vs_soa.cu b/examples/sorting_aos_vs_soa.cu index 1bf990982..649a78ab1 100644 --- a/examples/sorting_aos_vs_soa.cu +++ b/examples/sorting_aos_vs_soa.cu @@ -1,3 +1,4 @@ +#include #include #include #include @@ -7,7 +8,7 @@ // This examples compares sorting performance using Array of Structures (AoS) // and Structure of Arrays (SoA) data layout. Legacy applications will often -// store data in C/C++ structs, such as MyStruct defined below. Although +// store data in C/C++ structs, such as MyStruct defined below. Although // Thrust can process array of structs, it is typically less efficient than // the equivalent structure of arrays layout. 
In this particular example, // the optimized SoA approach is approximately *five times faster* than the @@ -57,7 +58,7 @@ int main(void) { size_t N = 2 * 1024 * 1024; - // Sort Key-Value pairs using Array of Structures (AoS) storage + // Sort Key-Value pairs using Array of Structures (AoS) storage { thrust::device_vector structures(N); @@ -71,7 +72,7 @@ int main(void) std::cout << "AoS sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl; } - // Sort Key-Value pairs using Structure of Arrays (SoA) storage + // Sort Key-Value pairs using Structure of Arrays (SoA) storage { thrust::device_vector keys(N); thrust::device_vector values(N); diff --git a/examples/sparse_vector.cu b/examples/sparse_vector.cu index c7528cff2..463bfa008 100644 --- a/examples/sparse_vector.cu +++ b/examples/sparse_vector.cu @@ -11,7 +11,6 @@ template B_index(6); thrust::device_vector B_value(6); @@ -97,7 +95,7 @@ int main(void) // compute sparse vector C = A + B thrust::device_vector C_index; thrust::device_vector C_value; - + sum_sparse_vectors(A_index, A_value, B_index, B_value, C_index, C_value); std::cout << "Computing C = A + B for sparse vectors A and B" << std::endl; diff --git a/examples/transform_input_output_iterator.cu b/examples/transform_input_output_iterator.cu new file mode 100644 index 000000000..afdccc35a --- /dev/null +++ b/examples/transform_input_output_iterator.cu @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include + +// Base 2 fixed point +class ScaledInteger +{ + int value_; + int scale_; + +public: + __host__ __device__ + ScaledInteger(int value, int scale): value_{value}, scale_{scale} {} + + __host__ __device__ + int value() const { return value_; } + + __host__ __device__ + ScaledInteger rescale(int scale) const + { + int shift = scale - scale_; + int result = shift < 0 ? 
value_ << (-shift) : value_ >> shift; + return ScaledInteger{result, scale}; + } + + __host__ __device__ + friend ScaledInteger operator+(ScaledInteger a, ScaledInteger b) + { + // Rescale inputs to the lesser of the two scales + if (b.scale_ < a.scale_) + a = a.rescale(b.scale_); + else if (a.scale_ < b.scale_) + b = b.rescale(a.scale_); + return ScaledInteger{a.value_ + b.value_, a.scale_}; + } +}; + +struct ValueToScaledInteger +{ + int scale; + + __host__ __device__ + ScaledInteger operator()(const int& value) const + { + return ScaledInteger{value, scale}; + } +}; + +struct ScaledIntegerToValue +{ + int scale; + + __host__ __device__ + int operator()(const ScaledInteger& scaled) const + { + return scaled.rescale(scale).value(); + } +}; + +int main(void) +{ + const size_t size = 4; + thrust::device_vector A(size); + thrust::device_vector B(size); + thrust::device_vector C(size); + + thrust::sequence(A.begin(), A.end(), 1); + thrust::sequence(B.begin(), B.end(), 5); + + const int A_scale = 16; // Values in A are left shifted by 16 + const int B_scale = 8; // Values in B are left shifted by 8 + const int C_scale = 4; // Values in C are left shifted by 4 + + auto A_begin = thrust::make_transform_input_output_iterator(A.begin(), + ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale}); + auto A_end = thrust::make_transform_input_output_iterator(A.end(), + ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale}); + auto B_begin = thrust::make_transform_input_output_iterator(B.begin(), + ValueToScaledInteger{B_scale}, ScaledIntegerToValue{B_scale}); + auto C_begin = thrust::make_transform_input_output_iterator(C.begin(), + ValueToScaledInteger{C_scale}, ScaledIntegerToValue{C_scale}); + + // Sum A and B as ScaledIntegers, storing the scaled result in C + thrust::transform(A_begin, A_end, B_begin, C_begin, thrust::plus{}); + + thrust::host_vector A_h(A); + thrust::host_vector B_h(B); + thrust::host_vector C_h(C); + + std::cout << std::hex; + + std::cout << 
"Expected [ "; + for (size_t i = 0; i < size; i++) { + const int expected = ((A_h[i] << A_scale) + (B_h[i] << B_scale)) >> C_scale; + std::cout << expected << " "; + } + std::cout << "] \n"; + + std::cout << "Result [ "; + for (size_t i = 0; i < size; i++) { + std::cout << C_h[i] << " "; + } + std::cout << "] \n"; + + return 0; +} + diff --git a/examples/uninitialized_vector.cu b/examples/uninitialized_vector.cu index 5f522a809..90e8141fa 100644 --- a/examples/uninitialized_vector.cu +++ b/examples/uninitialized_vector.cu @@ -29,6 +29,10 @@ template __host__ ~uninitialized_allocator() {} +#if THRUST_CPP_DIALECT >= 2011 + uninitialized_allocator & operator=(const uninitialized_allocator &) = default; +#endif + // for correctness, you should also redefine rebind when you inherit // from an allocator type; this way, if the allocator is rebound somewhere, // it's going to be rebound to the correct type - and not to its base diff --git a/generate_mk.py b/generate_mk.py index 46042036c..84071338c 100755 --- a/generate_mk.py +++ b/generate_mk.py @@ -6,6 +6,7 @@ # A single example or unit test source file generates its own executable # This program is called by a top level Makefile, but can also be used stand-alone for debugging # This program also generates testing.mk, examples.mk and dependencies.mk +from __future__ import print_function import sys import shutil as sh import os @@ -31,7 +32,7 @@ def Glob(pattern, directory,exclude='\B'): def generate_test_mk(mk_path, test_path, group, TEST_DIR): - print 'Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"' + print('Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"') src_cu = Glob("*.cu", test_path, ".*testframework.cu$") src_cxx = Glob("*.cpp", test_path) src_cu.sort(); @@ -52,7 +53,7 @@ def generate_test_mk(mk_path, test_path, group, TEST_DIR): return [tests_all, dependencies_all] def generate_example_mk(mk_path, example_path, group, EXAMPLE_DIR): - print 'Generating makefiles in 
"'+mk_path+'" for examples in "'+example_path+'"' + print('Generating makefiles in "'+mk_path+'" for examples in "'+example_path+'"') src_cu = Glob("*.cu", example_path) src_cxx = Glob("*.cpp", example_path) src_cu.sort(); diff --git a/internal/benchmark/CMakeLists.txt b/internal/benchmark/CMakeLists.txt new file mode 100644 index 000000000..8c59747b8 --- /dev/null +++ b/internal/benchmark/CMakeLists.txt @@ -0,0 +1,30 @@ +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # MSVC builds fail at runtime. Benchmarks are linux-only for now. + message(STATUS "Thrust benchmarking is not available on MSVC.") + return() +endif() + +add_custom_target(thrust.all.bench) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Skip non cpp.cuda targets: + if (NOT config_host STREQUAL "CPP" OR + NOT config_device STREQUAL "CUDA") + continue() + endif() + + set(bench_target ${config_prefix}.bench) + + add_executable(${bench_target} bench.cu) + target_link_libraries(${bench_target} PRIVATE ${thrust_target}) + target_include_directories(${bench_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") + thrust_clone_target_properties(${bench_target} ${thrust_target}) + thrust_fix_clang_nvcc_build_for(${bench_target}) + + add_dependencies(thrust.all.bench ${bench_target}) + add_dependencies(${config_prefix}.all ${bench_target}) +endforeach() diff --git a/internal/benchmark/bench.cu b/internal/benchmark/bench.cu index eba49f608..38d1d647a 100644 --- a/internal/benchmark/bench.cu +++ b/internal/benchmark/bench.cu @@ -4,6 +4,14 @@ #include #include #include +#include + +#if THRUST_CPP_DIALECT >= 2011 +#include +#include + +#include +#endif #include #include @@ -42,7 +50,7 @@ // We don't use THRUST_NOEXCEPT because it's new, and we want this benchmark to // be backwards-compatible to older versions of 
Thrust. -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 #define NOEXCEPT noexcept #else #define NOEXCEPT throw() @@ -393,7 +401,6 @@ struct experiment_driver ); #endif -/* stl_average_walltime = round_to_precision( stl_average_walltime, stl_walltime_precision ); @@ -417,7 +424,6 @@ struct experiment_driver tbb_walltime_uncertainty, tbb_walltime_precision ); #endif -*/ // Round the average throughput and throughput uncertainty to the // significant figure of the throughput uncertainty. @@ -436,7 +442,6 @@ struct experiment_driver ); #endif -/* stl_average_throughput = round_to_precision( stl_average_throughput, stl_throughput_precision ); @@ -460,7 +465,6 @@ struct experiment_driver tbb_throughput_uncertainty, tbb_throughput_precision ); #endif -*/ std::cout << THRUST_VERSION // Thrust Version. << "," << test_name // Algorithm. @@ -695,6 +699,21 @@ struct copy_trial_base : trial_base } }; +#if THRUST_CPP_DIALECT >= 2011 +template +struct shuffle_trial_base : trial_base +{ + Container input; + + void setup(uint64_t elements) + { + input.resize(elements); + + randomize(input); + } +}; +#endif + /////////////////////////////////////////////////////////////////////////////// template @@ -890,6 +909,37 @@ struct copy_tester #endif }; +#if THRUST_CPP_DIALECT >= 2011 +template +struct shuffle_tester +{ + static char const* test_name() { return "shuffle"; } + + struct std_trial : shuffle_trial_base, baseline_trial> + { + std::default_random_engine g; + void operator()() + { + std::shuffle(this->input.begin(), this->input.end(), this->g); + } + }; + + struct thrust_trial : shuffle_trial_base > + { + thrust::default_random_engine g; + void operator()() + { + thrust::shuffle(this->input.begin(), this->input.end(), this->g); + #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + throw thrust::error_code(err, thrust::cuda_category()); + #endif + } + }; +}; +#endif + 
/////////////////////////////////////////////////////////////////////////////// template < @@ -941,6 +991,14 @@ void run_core_primitives_experiments_for_type() , BaselineTrials , RegularTrials >::run_experiment(); + + experiment_driver< + shuffle_tester + , ElementMetaType + , Elements / sizeof(typename ElementMetaType::type) + , BaselineTrials + , RegularTrials + >::run_experiment(); } /////////////////////////////////////////////////////////////////////////////// diff --git a/internal/build/common_build.mk b/internal/build/common_build.mk index a77a5e940..25cee6bb4 100644 --- a/internal/build/common_build.mk +++ b/internal/build/common_build.mk @@ -1,41 +1,18 @@ USE_NEW_PROJECT_MK := 1 +CCCL_ENABLE_DEPRECATIONS := 1 + ifeq ($(OS),Linux) LIBRARIES += m endif -include $(ROOTDIR)/thrust/internal/build/common_warnings.mk +include $(ROOTDIR)/thrust/internal/build/common_compiler.mk # Add /bigobj to Windows build flag to workaround building Thrust with debug ifeq ($(OS),win32) CUDACC_FLAGS += -Xcompiler "/bigobj" endif -ARCH_NEG_FILTER += 20 21 -# Determine which SASS to generate -# if DVS (either per-CL or on-demand) -ifneq ($(or $(THRUST_DVS),$(THRUST_DVS_NIGHTLY)),) - # DVS doesn't run Thrust on fermi so filter out SM 2.0/2.1 - # DVS doesn't run Thrust on mobile so filter those out as well - # DVS doesn't have PASCAL configs at the moment - ARCH_NEG_FILTER += 20 21 32 37 53 60 -else - # If building for ARMv7 (32-bit ARM), build only mobile SASS since no dGPU+ARM32 are supported anymore - ifeq ($(TARGET_ARCH),ARMv7) - ARCH_FILTER = 32 53 62 - endif - # If its androideabi, we know its mobile, so can target specific SASS - ifeq ($(OS),Linux) - ifeq ($(ABITYPE), androideabi) - ARCH_FILTER = 32 53 62 - ifeq ($(THRUST_TEST),1) - NVCC_OPTIONS += -include "$(ROOTDIR)/cuda/tools/demangler/demangler.h" - LIBRARIES += demangler - endif - endif - endif -endif - # Add -mthumb for Linux on ARM to work around bug in arm cross compiler from p4 ifeq ($(TARGET_ARCH),ARMv7) ifneq 
($(HOST_ARCH),ARMv7) @@ -80,8 +57,15 @@ ifndef BUILD_AGAINST_RELEASE else INCLUDES_ABSPATH += $(ROOTDIR)/thrust endif + + # CUB includes + ifdef VULCAN + INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/cub + else + INCLUDES_ABSPATH += $(ROOTDIR)/cub + endif else - # CUDA and Thrust includes + # CUDA, CUB, and Thrust includes INCLUDES_ABSPATH += $(GPGPU_COMPILER_EXPORT)/include ifeq ($(TARGET_ARCH),ARMv7) @@ -95,6 +79,8 @@ ifdef VULCAN LIBDIRS_ABSPATH += $(VULCAN_BUILD_DIR)/bin/$(VULCAN_ARCH)_$(VULCAN_OS)$(VULCAN_ABI)_$(VULCAN_BUILD) endif +USES_CUDA_DRIVER_HEADERS := 1 + ifdef VULCAN_TOOLKIT_BASE include $(VULCAN_TOOLKIT_BASE)/build/common.mk else diff --git a/internal/build/common_warnings.mk b/internal/build/common_compiler.mk similarity index 64% rename from internal/build/common_warnings.mk rename to internal/build/common_compiler.mk index 7809d3752..020159365 100644 --- a/internal/build/common_warnings.mk +++ b/internal/build/common_compiler.mk @@ -3,10 +3,18 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin)) CUDACC_FLAGS += -Xcompiler "-Wall -Wextra -Werror" ifdef USEXLC + CXX_STD := c++14 + # GCC does not warn about unused parameters in uninstantiated # template functions, but xlC does. This causes xlC to choke on the # OMP backend, which is mostly #ifdef'd out when you aren't using it. CUDACC_FLAGS += -Xcompiler "-Wno-unused-parameter" + + # xlC is unreasonable about unused functions in a translation unit + # when this warning is enabled; this includes warning on most functions + # that are defined as static inline in cuda_fp16.h. Disable this warning + # entirely under xlC. + CUDACC_FLAGS += -Xcompiler "-Wno-unused-function" else # GCC, ICC or Clang AKA the sane ones. # XXX Enable -Wcast-align. 
CUDACC_FLAGS += -Xcompiler "-Winit-self -Woverloaded-virtual -Wno-cast-align -Wcast-qual -Wno-long-long -Wno-variadic-macros -Wno-unused-function" @@ -26,6 +34,8 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin)) endif ifdef IS_CLANG + CXX_STD := c++14 + ifdef USE_CLANGLLVM CLANG_VERSION = $(shell $(USE_CLANGLLVM) --version 2>/dev/null | head -1 | sed -e 's/.*\([0-9]\)\.\([0-9]\)\(\.[0-9]\).*/\1\2/g') else @@ -66,35 +76,34 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin)) GCC_VERSION = $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) -dumpversion | sed -e 's/\([0-9]\)\.\([0-9]\)\(\.[0-9]\)\?/\1\2/g') endif - ifeq ($(shell if test $(GCC_VERSION) -lt 42; then echo true; fi),true) - # In GCC 4.1.2 and older, numeric conversion warnings are not - # suppressable, so shut off -Wno-error. - CUDACC_FLAGS += -Xcompiler "-Wno-error" - endif - ifeq ($(shell if test $(GCC_VERSION) -eq 44; then echo true; fi),true) - # In GCC 4.4, the CUDA backend's kernel launch templates cause - # impossible-to-decipher "'' is used uninitialized in - # this function" warnings, so disable uninitialized variable - # warnings. - CUDACC_FLAGS += -Xcompiler "-Wno-uninitialized" - endif - ifeq ($(shell if test $(GCC_VERSION) -ge 45; then echo true; fi),true) - # This isn't available until GCC 4.3, and misfires on TMP code until - # GCC 4.5. - CUDACC_FLAGS += -Xcompiler "-Wlogical-op" + ifeq ($(shell if test $(GCC_VERSION) -ge 50; then echo true; fi),true) + CXX_STD := c++14 + else + CUDACC_FLAGS += -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT endif + ifeq ($(shell if test $(GCC_VERSION) -ge 73; then echo true; fi),true) # GCC 7.3 complains about name mangling changes due to `noexcept` # becoming part of the type system; we don't care. CUDACC_FLAGS += -Xcompiler "-Wno-noexcept-type" endif + ifeq ($(shell if test $(GCC_VERSION) -ge 80; then echo true; fi),true) + # GCC 8.x has a new warning that tries to diagnose technical misuses of + # memcpy and memmove. 
We need to resolve it better than this, but for the + # time being, we'll downgrade it from an error to a warning. + CUDACC_FLAGS += -Xcompiler "-Wno-error=class-memaccess" + endif else $(error CCBIN is not defined.) endif endif endif + else + CXX_STD := c++14 endif else ifeq ($(OS),win32) + CXX_STD := c++14 + # XXX Enable /Wall CUDACC_FLAGS += -Xcompiler "/WX" @@ -108,5 +117,44 @@ else ifeq ($(OS),win32) # Disable warning about applying unary - to unsigned type. CUDACC_FLAGS += -Xcompiler "/wd4146" + + # Warning about declspec(allocator) on inappropriate function types + CUDACC_FLAGS += -Xcompiler "/wd4494" + + # Allow tests to have lots and lots of sections in each translation unit: + CUDACC_FLAGS += -Xcompiler "/bigobj" endif +# Promote all NVCC warnings into errors +CUDACC_FLAGS += -Werror all-warnings + +# Print warning numbers with cudafe diagnostics +CUDACC_FLAGS += -Xcudafe --display_error_number + +VERSION_FLAG := +ifeq ($(OS),$(filter $(OS),Linux Darwin)) + ifdef USEPGCXX # PGI + VERSION_FLAG := -V + else + ifdef USEXLC # XLC + VERSION_FLAG := -qversion + else # GCC, ICC or Clang AKA the sane ones. + VERSION_FLAG := --version + endif + endif +else ifeq ($(OS),win32) # MSVC + # cl.exe run without any options will print its version info and exit. + VERSION_FLAG := +endif + +CCBIN_ENVIRONMENT := +ifeq ($(OS), QNX) + # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the + # environment. 
+ CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET) +endif + +$(info #### CCBIN : $(CCBIN)) +$(info #### CCBIN VERSION : $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) $(VERSION_FLAG))) +$(info #### CXX_STD : $(CXX_STD)) + diff --git a/internal/build/common_detect.mk b/internal/build/common_detect.mk index df755fe49..e4beb6b88 100644 --- a/internal/build/common_detect.mk +++ b/internal/build/common_detect.mk @@ -1,3 +1,5 @@ +CXX_STD = c++11 + ifeq ($(THRUST_TEST),1) include $(ROOTDIR)/build/getprofile.mk include $(ROOTDIR)/build/config/$(PROFILE).mk diff --git a/internal/build/generic_example.mk b/internal/build/generic_example.mk index 7441f8665..8fe562245 100644 --- a/internal/build/generic_example.mk +++ b/internal/build/generic_example.mk @@ -1,8 +1,6 @@ # Generic project mk that is included by examples mk -# EXAMPLE_NAME : the name of the example -# EXAMPLE_SRC : path to the source code relative to thrust -EXECUTABLE := $(EXAMPLE_NAME) -BUILD_SRC := $(ROOTDIR)/thrust/$(EXAMPLE_SRC) +EXECUTABLE := $(EXAMPLE_NAME) +BUILD_SRC := $(ROOTDIR)/thrust/$(EXAMPLE_SRC) include $(ROOTDIR)/thrust/internal/build/common_detect.mk diff --git a/internal/build/generic_test.mk b/internal/build/generic_test.mk index 937f903f7..1be548c93 100644 --- a/internal/build/generic_test.mk +++ b/internal/build/generic_test.mk @@ -1,8 +1,6 @@ # Generic project mk that is included by unit tests mk -# TEST_NAME : the name of the test -# TEST_SRC : path to the source code relative to thrust -EXECUTABLE := $(TEST_NAME) -BUILD_SRC := $(ROOTDIR)/thrust/$(TEST_SRC) +EXECUTABLE := $(TEST_NAME) +BUILD_SRC := $(ROOTDIR)/thrust/$(TEST_SRC) ifdef VULCAN INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/thrust/testing diff --git a/internal/build/warningstester.mk b/internal/build/warningstester.mk index 7db50f201..f2ceecd8e 100644 --- a/internal/build/warningstester.mk +++ b/internal/build/warningstester.mk @@ -1,4 +1,5 @@ USE_NEW_PROJECT_MK := 1 + EXECUTABLE := warningstester PROJ_DIR := 
internal/build #GENCODE := @@ -23,20 +24,22 @@ endif CU_FILES += ../test/warningstester.cu -# Thrust includes (thrust/) +# Thrust includes ifdef VULCAN -INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/include/ +INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/include INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/_internal/cudart +INCLUDES += $(VULCAN_TOOLKIT_BASE)/cub else -INCLUDES += ../../ +INCLUDES += ../.. INCLUDES += ../../../cuda/tools/cudart +INCLUDES += ../../../cub endif # Location of generated include file that includes all Thrust public headers GENERATED_SOURCES = $(BUILT_CWD) CUDACC_FLAGS += -I$(GENERATED_SOURCES) -include $(ROOTDIR)/thrust/internal/build/common_warnings.mk +include $(ROOTDIR)/thrust/internal/build/common_compiler.mk ifdef VULCAN_TOOLKIT_BASE include $(VULCAN_TOOLKIT_BASE)/build/common.mk diff --git a/internal/build/warningstester_create_uber_header.py b/internal/build/warningstester_create_uber_header.py index 29a333063..cef19a43d 100644 --- a/internal/build/warningstester_create_uber_header.py +++ b/internal/build/warningstester_create_uber_header.py @@ -46,6 +46,7 @@ def find_headers(base_dir, rel_dir, exclude = ['\B']): print('#error no include files found\n') print('#define THRUST_CPP11_REQUIRED_NO_ERROR') +print('#define THRUST_CPP14_REQUIRED_NO_ERROR') print('#define THRUST_MODERN_GCC_REQUIRED_NO_ERROR') for h in headers: print('#include <' + h + '>') diff --git a/internal/scripts/eris_perf.py b/internal/scripts/eris_perf.py index 7b50a8a85..580471101 100755 --- a/internal/scripts/eris_perf.py +++ b/internal/scripts/eris_perf.py @@ -169,6 +169,9 @@ def print_file(p): for record in reader: for variable, directionality in measured_variables: + # Don't monitor regressions for STL implementations, nvbug 28980890: + if "STL" in variable: + continue print "&&&& PERF {0}_{1}_{2}bit_{3}mib_{4} {5} {6}{7}".format( record["Algorithm"], record["Element Type"], diff --git a/internal/scripts/refresh_from_github2.sh b/internal/scripts/refresh_from_github2.sh index 
fb4a2aff1..6b977bcf3 100755 --- a/internal/scripts/refresh_from_github2.sh +++ b/internal/scripts/refresh_from_github2.sh @@ -1,4 +1,4 @@ -branch="master" +branch="main" while getopts "hb:c:" opt; do case $opt in @@ -37,7 +37,7 @@ set -e echo "Downloading thrust code from the $branch branch into /tmp/thrust-${branch}" rm -rf /tmp/thrust-${branch} -git clone -q git://github.com/thrust/thrust.git -b ${branch} /tmp/thrust-${branch} +git clone -q git://github.com/NVIDIA/thrust.git -b ${branch} /tmp/thrust-${branch} cd `dirname $0`/../.. echo "Changed current directory to `pwd`" diff --git a/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck new file mode 100644 index 000000000..8b81c77d3 --- /dev/null +++ b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck @@ -0,0 +1 @@ + CHECK: sum is 499500 diff --git a/internal/test/thrust.example.transform_input_output_iterator.filecheck b/internal/test/thrust.example.transform_input_output_iterator.filecheck new file mode 100644 index 000000000..caeca2de5 --- /dev/null +++ b/internal/test/thrust.example.transform_input_output_iterator.filecheck @@ -0,0 +1,2 @@ + CHECK: Expected [ 1050 2060 3070 4080 ] +CHECK-NEXT: Result [ 1050 2060 3070 4080 ] diff --git a/internal/test/thrust.sanity.filecheck b/internal/test/thrust.sanity.filecheck deleted file mode 100644 index 1770bc9f3..000000000 --- a/internal/test/thrust.sanity.filecheck +++ /dev/null @@ -1 +0,0 @@ - CHECK: SANITY diff --git a/internal/test/thrust.smoke.filecheck b/internal/test/thrust.smoke.filecheck new file mode 100644 index 000000000..6906f6d86 --- /dev/null +++ b/internal/test/thrust.smoke.filecheck @@ -0,0 +1 @@ + CHECK: SMOKE diff --git a/internal/test/thrust_nightly.pl b/internal/test/thrust_nightly.pl index 61e03bda4..ab5815111 100755 --- a/internal/test/thrust_nightly.pl +++ b/internal/test/thrust_nightly.pl @@ -182,12 +182,12 @@ sub process_return_code { my 
$have_filecheck = 1; -sub filecheck_sanity { - my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.sanity.filecheck"; +sub filecheck_smoke_test { + my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.smoke.filecheck"; my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1"); - print $filecheck_stdin "SANITY"; + print $filecheck_stdin "SMOKE"; my $filecheck_ret = 0; if (close($filecheck_stdin) == 0) @@ -196,21 +196,21 @@ sub filecheck_sanity { } if ($filecheck_ret == 0) { - printf("#### SANE FileCheck\n"); + printf("&&&& PASSED FileCheck\n"); } else { # Use a temporary file to send the output to # FileCheck so we can get the output this time, # because Perl and bidirectional pipes suck. my $tmp = File::Temp->new(); my $tmp_filename = $tmp->filename; - print $tmp "SANITY"; + print $tmp "SMOKE"; printf("********************************************************************************\n"); print `$filecheck_cmd -input-file $tmp_filename`; printf("********************************************************************************\n"); - process_return_code("FileCheck Sanity", $filecheck_ret, ""); - printf("#### INSANE FileCheck\n"); + process_return_code("FileCheck Test", $filecheck_ret, ""); + printf("&&&& FAILED FileCheck\n"); $have_filecheck = 0; } @@ -243,7 +243,7 @@ sub run_cmd { { $ret = $?; } - + alarm 0; }; my $elapsed = timestamp() - $start; @@ -286,7 +286,7 @@ sub run_examples { { my $test_exe = $test; - # Ignore FileCheck files. + # Ignore FileCheck files. if ($test =~ /[.]filecheck$/) { next; @@ -403,7 +403,7 @@ sub run_unit_tests { { my $test_exe = $test; - # Ignore FileCheck files. + # Ignore FileCheck files. if ($test =~ /[.]filecheck$/) { next; @@ -558,6 +558,7 @@ sub dvs_summary { printf("\n"); + # We can't remove "sanity" here yet because DVS looks for this exact string. 
printf("CUDA DVS BASIC SANITY SCORE : %.1f\n", $dvs_score); if ($failures + $errors > 0) { @@ -582,7 +583,7 @@ sub dvs_summary { printf("\n"); -filecheck_sanity(); +filecheck_smoke_test(); printf("\n"); diff --git a/scripts/gdb-pretty-printers.py b/scripts/gdb-pretty-printers.py new file mode 100644 index 000000000..15d790411 --- /dev/null +++ b/scripts/gdb-pretty-printers.py @@ -0,0 +1,153 @@ +import gdb +import sys + +if sys.version_info[0] > 2: + Iterator = object +else: + # "Polyfill" for Python2 Iterator interface + class Iterator: + def next(self): + return self.__next__() + + +class ThrustVectorPrinter(gdb.printing.PrettyPrinter): + "Print a thrust::*_vector" + + class _host_accessible_iterator(Iterator): + def __init__(self, start, size): + self.item = start + self.size = size + self.count = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.count >= self.size: + raise StopIteration + elt = self.item.dereference() + count = self.count + self.item = self.item + 1 + self.count = self.count + 1 + return ('[%d]' % count, elt) + + class _device_iterator(Iterator): + def __init__(self, start, size): + self.exec = exec + self.item = start + self.size = size + self.count = 0 + self.buffer = None + self.sizeof = self.item.dereference().type.sizeof + self.buffer_start = 0 + # At most 1 MB or size, at least 1 + self.buffer_size = min(size, max(1, 2 ** 20 // self.sizeof)) + self.buffer = gdb.parse_and_eval( + '(void*)malloc(%s)' % (self.buffer_size * self.sizeof)) + self.buffer.fetch_lazy() + self.buffer_count = self.buffer_size + self.update_buffer() + + def update_buffer(self): + if self.buffer_count >= self.buffer_size: + self.buffer_item = gdb.parse_and_eval( + hex(self.buffer)).cast(self.item.type) + self.buffer_count = 0 + self.buffer_start = self.count + device_addr = hex(self.item.dereference().address) + buffer_addr = hex(self.buffer) + size = min(self.buffer_size, self.size - + self.buffer_start) * self.sizeof + status = 
gdb.parse_and_eval( + '(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % (buffer_addr, device_addr, size)) + if status != 0: + raise gdb.MemoryError( + 'memcpy from device failed: %s' % status) + + def __del__(self): + gdb.parse_and_eval('(void)free(%s)' % + hex(self.buffer)).fetch_lazy() + + def __iter__(self): + return self + + def __next__(self): + if self.count >= self.size: + raise StopIteration + self.update_buffer() + elt = self.buffer_item.dereference() + self.buffer_item = self.buffer_item + 1 + self.buffer_count = self.buffer_count + 1 + count = self.count + self.item = self.item + 1 + self.count = self.count + 1 + return ('[%d]' % count, elt) + + def __init__(self, val): + self.val = val + self.pointer = val['m_storage']['m_begin']['m_iterator'] + self.size = int(val['m_size']) + self.capacity = int(val['m_storage']['m_size']) + self.is_device = False + if str(self.pointer.type).startswith("thrust::device_ptr"): + self.pointer = self.pointer['m_iterator'] + self.is_device = True + + def children(self): + if self.is_device: + return self._device_iterator(self.pointer, self.size) + else: + return self._host_accessible_iterator(self.pointer, self.size) + + def to_string(self): + typename = str(self.val.type) + return ('%s of length %d, capacity %d' % (typename, self.size, self.capacity)) + + def display_hint(self): + return 'array' + + +class ThrustReferencePrinter(gdb.printing.PrettyPrinter): + "Print a thrust::device_reference" + + def __init__(self, val): + self.val = val + self.pointer = val['ptr']['m_iterator'] + self.type = self.pointer.dereference().type + sizeof = self.type.sizeof + self.buffer = gdb.parse_and_eval('(void*)malloc(%s)' % sizeof) + device_addr = hex(self.pointer) + buffer_addr = hex(self.buffer) + status = gdb.parse_and_eval('(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % ( + buffer_addr, device_addr, sizeof)) + if status != 0: + raise gdb.MemoryError('memcpy from device failed: %s' % status) + self.buffer_val 
= gdb.parse_and_eval( + hex(self.buffer)).cast(self.pointer.type).dereference() + + def __del__(self): + gdb.parse_and_eval('(void)free(%s)' % hex(self.buffer)).fetch_lazy() + + def children(self): + return [] + + def to_string(self): + typename = str(self.val.type) + return ('(%s) @%s: %s' % (typename, self.pointer, self.buffer_val)) + + def display_hint(self): + return None + + +def lookup_thrust_type(val): + if not str(val.type.unqualified()).startswith('thrust::'): + return None + suffix = str(val.type.unqualified())[8:] + if suffix.startswith('host_vector') or suffix.startswith('device_vector'): + return ThrustVectorPrinter(val) + elif int(gdb.VERSION.split(".")[0]) >= 10 and suffix.startswith('device_reference'): + return ThrustReferencePrinter(val) + return None + + +gdb.pretty_printers.append(lookup_thrust_type) diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt new file mode 100644 index 000000000..0f0749c4e --- /dev/null +++ b/testing/CMakeLists.txt @@ -0,0 +1,169 @@ +# Create meta targets that build all tests for a single configuration: +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + set(config_meta_target ${config_prefix}.tests) + add_custom_target(${config_meta_target}) + add_dependencies(${config_prefix}.all ${config_meta_target}) +endforeach() + +# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake -- +# these flag variables behave unintuitively: +if (THRUST_ENABLE_TESTS_WITH_RDC) + set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}") +else() + set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}") +endif() + +# Generate testing framework libraries: +add_subdirectory(unittest) + +# Some tests only support certain host.device configurations. Use this macro to +# declare allowed configurations. If not specified, all host.device config are +# used. 
+set(restricted_tests) +macro(thrust_declare_test_restrictions test_name) + list(APPEND restricted_tests ${test_name}) + list(APPEND ${test_name}_host.device_allowed ${ARGN}) +endmacro() + +# Async/future/event tests only support the CUDA backend: +thrust_declare_test_restrictions(async_copy CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(async_for_each CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(async_reduce CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(async_reduce_into CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(async_sort CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(async_transform CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(event CPP.CUDA OMP.CUDA TBB.CUDA) +thrust_declare_test_restrictions(future CPP.CUDA OMP.CUDA TBB.CUDA) + +# This test is incompatible with TBB and OMP, since it requires special per-device +# handling to process exceptions in a device function, which is only implemented +# for CUDA. +thrust_declare_test_restrictions(unittest_static_assert CPP.CPP CPP.CUDA) + +# In the TBB backend, reduce_by_key does not currently work with transform_output_iterator +# https://github.com/NVIDIA/thrust/issues/1811 +thrust_declare_test_restrictions(transform_output_iterator_reduce_by_key CPP.CPP CPP.OMP CPP.CUDA) + +## thrust_add_test +# +# Add a test executable and register it with ctest. +# +# target_name_var: Variable name to overwrite with the name of the test +# target. Useful for post-processing target information per-backend. +# test_name: The name of the test minus ".test." For example, +# testing/vector.cu will be "vector", and testing/cuda/copy.cu will be +# "cuda.copy". +# test_src: The source file that implements the test. +# thrust_target: The reference thrust target with configuration information. 
+# +function(thrust_add_test target_name_var test_name test_src thrust_target) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Wrap the .cu file in .cpp for non-CUDA backends + if ("CUDA" STREQUAL "${config_device}") + set(real_test_src "${test_src}") + else() + thrust_wrap_cu_in_cpp(real_test_src "${test_src}" ${thrust_target}) + endif() + + # The actual name of the test's target: + set(test_target ${config_prefix}.test.${test_name}) + set(${target_name_var} ${test_target} PARENT_SCOPE) + + # Related target names: + set(config_framework_target ${config_prefix}.test.framework) + set(config_meta_target ${config_prefix}.tests) + set(test_meta_target thrust.all.test.${test_name}) + + add_executable(${test_target} "${real_test_src}") + target_link_libraries(${test_target} PRIVATE ${config_framework_target}) + target_include_directories(${test_target} PRIVATE "${Thrust_SOURCE_DIR}/testing") + thrust_clone_target_properties(${test_target} ${thrust_target}) + + if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + target_compile_definitions(${test_target} PRIVATE THRUST_TEST_DEVICE_SIDE) + endif() + + thrust_fix_clang_nvcc_build_for(${test_target}) + + # Add to the active configuration's meta target + add_dependencies(${config_meta_target} ${test_target}) + + # Meta target that builds tests with this name for all configurations: + if (NOT TARGET ${test_meta_target}) + add_custom_target(${test_meta_target}) + endif() + add_dependencies(${test_meta_target} ${test_target}) + + add_test(NAME ${test_target} + COMMAND "${CMAKE_COMMAND}" + "-DTHRUST_BINARY=$" + "-DTHRUST_SOURCE=${Thrust_SOURCE_DIR}" + -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunTest.cmake" + ) + + # Run OMP/TBB tests in serial. 
Multiple OMP processes will massively + # oversubscribe the machine with GCC's OMP, and we want to test these with + # the full CPU available to each unit test. + set(config_systems ${config_host} ${config_device}) + if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems)) + set_tests_properties(${test_target} PROPERTIES RUN_SERIAL ON) + endif() + + # Check for per-test script. Script will be included in the current scope + # to allow custom property modifications. + get_filename_component(test_cmake_script "${test_src}" NAME_WLE) + set(test_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${test_cmake_script}.cmake") + # Use a glob so we can detect if this changes: + file(GLOB test_cmake_script + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${test_cmake_script}" + ) + if (test_cmake_script) # Will be non-empty only if the script exists + include("${test_cmake_script}") + endif() +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +# Add common tests to all configs: +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + + # Is this test restricted to only certain host/device combinations? + if(${test_name} IN_LIST restricted_tests) + # Is the current host/device combination supported? 
+ if (NOT "${config_host}.${config_device}" IN_LIST + ${test_name}_host.device_allowed) + continue() + endif() + endif() + + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + + if (THRUST_ENABLE_TESTS_WITH_RDC AND ("CUDA" STREQUAL "${config_device}")) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + endforeach() +endforeach() + +# Add specialized tests: +add_subdirectory(async) +add_subdirectory(cmake) +add_subdirectory(cpp) +add_subdirectory(cuda) +add_subdirectory(omp) +add_subdirectory(regression) diff --git a/testing/adjacent_difference.cu b/testing/adjacent_difference.cu index 8e5cd3ff8..5f97ea350 100644 --- a/testing/adjacent_difference.cu +++ b/testing/adjacent_difference.cu @@ -2,6 +2,8 @@ #include #include #include +#include +#include template void TestAdjacentDifferenceSimple(void) @@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void) input[0] = 1; input[1] = 4; input[2] = 6; typename Vector::iterator result; - + result = thrust::adjacent_difference(input.begin(), input.end(), output.begin()); ASSERT_EQUAL(result - output.begin(), 3); ASSERT_EQUAL(output[0], T(1)); ASSERT_EQUAL(output[1], T(3)); ASSERT_EQUAL(output[2], T(2)); - + result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus()); - + ASSERT_EQUAL(result - output.begin(), 3); ASSERT_EQUAL(output[0], T( 1)); ASSERT_EQUAL(output[1], T( 5)); ASSERT_EQUAL(output[2], T(10)); - + // test in-place operation, result and first are permitted to be the same result = thrust::adjacent_difference(input.begin(), input.end(), input.begin()); @@ -57,14 +59,14 @@ void TestAdjacentDifference(const size_t n) ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n); ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n); ASSERT_EQUAL(h_output, d_output); - + h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus()); d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), 
d_output.begin(), thrust::plus()); ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n); ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n); ASSERT_EQUAL(h_output, d_output); - + // in-place operation h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus()); d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus()); @@ -90,7 +92,7 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n) h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus()); d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus()); - + // in-place operation with different iterator types h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus()); d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus()); @@ -159,4 +161,3 @@ void TestAdjacentDifferenceDispatchImplicit() ASSERT_EQUAL(13, d_input.front()); } DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit); - diff --git a/testing/alignment.cu b/testing/alignment.cu index 6ddf1c73c..e55df2e96 100644 --- a/testing/alignment.cu +++ b/testing/alignment.cu @@ -210,7 +210,7 @@ void test_aligned_type() DECLARE_UNITTEST(test_aligned_type); template -void test_aligned_storage_instantiation() +void test_aligned_storage_instantiation(thrust::detail::true_type /* Align is valid */) { typedef typename thrust::detail::aligned_storage::type type; ASSERT_GEQUAL(sizeof(type), Len); @@ -218,6 +218,21 @@ void test_aligned_storage_instantiation() ASSERT_EQUAL(thrust::detail::alignment_of::value, Align); } +template +void test_aligned_storage_instantiation(thrust::detail::false_type /* Align is invalid */) +{ + // no-op -- alignment is > max_align_t and MSVC complains loudly. 
+} + +template +void test_aligned_storage_instantiation() +{ + typedef thrust::detail::integral_constant< + bool, Align <= THRUST_ALIGNOF(thrust::detail::max_align_t)> + ValidAlign; + test_aligned_storage_instantiation(ValidAlign()); +} + template void test_aligned_storage_size() { diff --git a/testing/allocator.cu b/testing/allocator.cu index edc6f0d52..175685ed0 100644 --- a/testing/allocator.cu +++ b/testing/allocator.cu @@ -1,6 +1,10 @@ #include +#include #include #include + +#include + #include template @@ -59,9 +63,12 @@ DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct); template struct my_allocator_with_custom_destroy { - typedef T value_type; - typedef T & reference; - typedef const T & const_reference; + // This is only used with thrust::cpp::vector: + using system_type = thrust::cpp::tag; + + using value_type = T; + using reference = T &; + using const_reference = const T &; static bool g_state; @@ -79,9 +86,7 @@ struct my_allocator_with_custom_destroy __host__ __device__ void destroy(T *) { -#if !__CUDA_ARCH__ - g_state = true; -#endif + NV_IF_TARGET(NV_IS_HOST, (g_state = true;)); } value_type *allocate(std::ptrdiff_t n) @@ -118,12 +123,14 @@ bool my_allocator_with_custom_destroy::g_state = false; template void TestAllocatorCustomDestroy(size_t n) { + my_allocator_with_custom_destroy::g_state = false; + { thrust::cpp::vector > vec(n); } // destroy everything - if (0 < n) - ASSERT_EQUAL(true, my_allocator_with_custom_destroy::g_state); + // state should only be true when there are values to destroy: + ASSERT_EQUAL(n > 0, my_allocator_with_custom_destroy::g_state); } DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy); @@ -202,7 +209,6 @@ void TestAllocatorTraitsRebind() } DECLARE_UNITTEST(TestAllocatorTraitsRebind); -#if __cplusplus >= 201103L void TestAllocatorTraitsRebindCpp11() { ASSERT_EQUAL( @@ -250,5 +256,3 @@ void TestAllocatorTraitsRebindCpp11() ); } DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11); -#endif - diff --git 
a/testing/allocator_aware_policies.cu b/testing/allocator_aware_policies.cu index a1b7b911a..0a737c3ce 100644 --- a/testing/allocator_aware_policies.cu +++ b/testing/allocator_aware_policies.cu @@ -17,15 +17,16 @@ struct test_allocator_t test_allocator_t test_allocator = test_allocator_t(); const test_allocator_t const_test_allocator = test_allocator_t(); -struct test_memory_resource_t THRUST_FINAL : thrust::mr::memory_resource<> +struct test_memory_resource_t final : thrust::mr::memory_resource<> { - void * do_allocate(std::size_t, std::size_t) THRUST_OVERRIDE + void * do_allocate(std::size_t size, std::size_t) override { - return NULL; + return reinterpret_cast(size); } - void do_deallocate(void *, std::size_t, std::size_t) THRUST_OVERRIDE + void do_deallocate(void * ptr, std::size_t size, std::size_t) override { + ASSERT_EQUAL(ptr, reinterpret_cast(size)); } } test_memory_resource; @@ -83,7 +84,8 @@ struct TestAllocatorAttachment get_temporary_buffer( policy, 123 - ).first + ).first, + 123 ); } @@ -106,8 +108,9 @@ struct TestAllocatorAttachment test_temporary_allocation_valid(policy(std::allocator())); test_temporary_allocation_valid(policy(alloc)); test_temporary_allocation_valid(policy(const_alloc)); + test_temporary_allocation_valid(policy(&test_memory_resource)); - #if THRUST_CPP_DIALECT >= 2011 + #if THRUST_CPP_DIALECT >= 2011 test_temporary_allocation_valid(policy(std::allocator()).after(1)); test_temporary_allocation_valid(policy(alloc).after(1)); test_temporary_allocation_valid(policy(const_alloc).after(1)); diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt new file mode 100644 index 000000000..00d50f097 --- /dev/null +++ b/testing/async/CMakeLists.txt @@ -0,0 +1,80 @@ +# The async tests perform a large amount of codegen, making them expensive to +# build and test. To keep compilation and runtimes manageable, the tests are +# broken up into many files per algorithm to enable parallelism during +# compilation and testing. 
The structure of these test directories are: +# +# thrust/testing/async//.cu +# +# These generate executables and CTest tests named +# ${config_prefix}.test.async... + +# The async tests only support CUDA enabled configs. Create a list of valid +# thrust targets: +set(cuda_configs) +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (config_device STREQUAL CUDA) + list(APPEND cuda_configs ${thrust_target}) + endif() +endforeach() + +list(LENGTH cuda_configs num_cuda_configs) +if (num_cuda_configs EQUAL 0) + return() # No valid configs found, nothing to do. +endif() + +# Process a single algorithm directory, adding all .cu/cpp files as tests for +# each valid backend. algo_name is the name of the subdir ( +# above) and is used for naming the executable/targets. +function(thrust_add_async_test_dir algo_name) + file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${algo_name}/*.cu" + "${algo_name}/*.cpp" + ) + + # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all + set(algo_meta_target thrust.all.test.async.${algo_name}.all) + add_custom_target(${algo_meta_target}) + + foreach(thrust_target IN LISTS cuda_configs) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all + set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all) + add_custom_target(${algo_config_meta_target}) + add_dependencies(${algo_meta_target} ${algo_config_meta_target}) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name async.${algo_name}.) 
+ + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + if(THRUST_ENABLE_TESTS_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + + add_dependencies(${algo_config_meta_target} ${test_target}) + endforeach() + endforeach() +endfunction() + +# Grab all algorithm subdirectories: +set(test_dirs) +file(GLOB contents + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_LIST_DIR}/*" +) + +foreach(test_dir IN LISTS contents) + if(IS_DIRECTORY "${test_dir}") + list(APPEND test_dirs "${test_dir}") + endif() +endforeach() + +# Process all test dirs: +foreach(test_dir IN LISTS test_dirs) + get_filename_component(algo_name "${test_dir}" NAME_WLE) + thrust_add_async_test_dir(${algo_name}) +endforeach() diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu new file mode 100644 index 000000000..7771299dd --- /dev/null +++ b/testing/async/exclusive_scan/counting_iterator.cu @@ -0,0 +1,46 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator_from_0 + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct test_counting_iterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. 
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu new file mode 100644 index 000000000..ec7ca5f47 --- /dev/null +++ b/testing/async/exclusive_scan/discard_output.cu @@ -0,0 +1,38 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. + +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct test_discard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu new file mode 100644 index 000000000..4d1c51df0 --- /dev/null +++ b/testing/async/exclusive_scan/large_indices.cu @@ -0,0 +1,244 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. 
+ +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +// +// This allows us to test very large problem sizes without actually allocating +// large amounts of memory that would exceed most devices' capacity. +struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type*; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = + typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. This needs to be + // done explicitly from the host. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. This needs to be + // done explicitly from the host. 
+ void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ + assert_sequence_iterator clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. +// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. 
+struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator&& it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() + { + iter.free_shared_state(); + } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType&) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{0, + // minus one bc exclusive scan: + static_cast(num_values - 1), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType& e, + output_t const&, + output_t const& test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - initial_value + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, std::tuple{0}}; + } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // anon namespace 
+ +void test_large_indices_default_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_default_scan_op); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - initial_value, binop + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(0, thrust::maximum<>{})}; + } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_from_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // namespace + +void test_large_indices_custom_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_custom_scan_op); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu new file mode 100644 index 000000000..571d39262 --- /dev/null +++ b/testing/async/exclusive_scan/large_types.cu @@ -0,0 +1,58 @@ +#include + +#if THRUST_CPP_DIALECT >= 
2014 + +#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct test_large_types +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + } +}; +DECLARE_UNITTEST(test_large_types); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu new file mode 100644 index 000000000..f69af1794 --- /dev/null +++ b/testing/async/exclusive_scan/mixed_types.cu @@ -0,0 +1,120 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - initial_value_type | (int, float, ) +// - thrust::plus T-type | (int, float, void>) +// +// The initial_value_type and thrust::plus types are covered by the +// mixin::postfix_args::scan_mixed_types_overloads component. 
+// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +template +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + std::tuple<>{}, + std::make_tuple(static_cast(5.5)), + std::make_tuple(static_cast(5.5), thrust::plus<>{}), + std::make_tuple(static_cast(5.5), thrust::plus{}), + std::make_tuple(static_cast(5.5), thrust::plus{})}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + // Use almost_equal instead of almost_equal_if_fp because floating point + // addition may be hidden in the scan_op (thrust::plus is always + // tested). 
+ , testing::async::mixin::compare_outputs::assert_almost_equal +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +} // namespace + +void test_scan_mixed_types(size_t num_values) +{ + // Since fp addition is non-associative, the results may be slightly off + // from the reference. + // This is primarily handled by using `compare_almost_equal` to do a fuzzy + // comparison. But for large enough test sizes, eventually the scan results + // will wrap for integral value_types. If a float accumulator is used, the + // small errors from non-associative addition may cause the wrap to happen in + // a different location, resulting in an error too large for almost_equal to + // ignore. + // This wrap seems to happen around 2^16 values, so skip when num_values is + // close to that. + if (num_values > ((1ll << 16) - 10)) + { + return; + } + + // invoker template params are input_value_type, output_vt, initial_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + // We all float down here + test_policy_overloads>::run(num_values); +} +DECLARE_SIZED_UNITTEST(test_scan_mixed_types); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h new file mode 100644 index 000000000..02ac9908f --- /dev/null +++ b/testing/async/exclusive_scan/mixin.h @@ -0,0 +1,119 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace exclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct 
all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, + std::make_tuple(value_type{42}), + std::make_tuple(value_type{42}, + alternate_binary_op{})}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. 
+ output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct simple +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace exclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/exclusive_scan/simple.cu b/testing/async/exclusive_scan/simple.cu new file mode 100644 index 000000000..8c55052d7 --- /dev/null +++ b/testing/async/exclusive_scan/simple.cu @@ -0,0 +1,72 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct simple_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors"; + } +}; + +template +struct test_simple +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. 
+template > +struct simple_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple in-place invocation with device vectors"; + } +}; + +template +struct test_simple_in_place +{ + void operator()(std::size_t num_values) const + { + using invoker = simple_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu new file mode 100644 index 000000000..411ffbd99 --- /dev/null +++ b/testing/async/exclusive_scan/stateful_operator.cu @@ -0,0 +1,62 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // init_val, bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + std::make_tuple(value_type{42}, + stateful_operator{value_type{2}})}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , 
testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct test_stateful_operator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu new file mode 100644 index 000000000..34a80bd79 --- /dev/null +++ b/testing/async/exclusive_scan/using_vs_adl.cu @@ -0,0 +1,171 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. 
+ OutIter result = + exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. 
+ using thrust::async::exclusive_scan; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void test_using_namespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_namespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::exclusive_scan`"; + } +}; + +void test_using_cpo() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_cpo); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu new file mode 100644 index 000000000..fe9fdeb80 --- /dev/null +++ b/testing/async/inclusive_scan/counting_iterator.cu @@ -0,0 +1,45 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : 
testing::async::mixin::input::counting_iterator_from_0 + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct test_counting_iterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu new file mode 100644 index 000000000..c202de7f0 --- /dev/null +++ b/testing/async/inclusive_scan/discard_output.cu @@ -0,0 +1,37 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. 
+ +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct test_discard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu new file mode 100644 index 000000000..4124cf96d --- /dev/null +++ b/testing/async/inclusive_scan/large_indices.cu @@ -0,0 +1,239 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. + +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +// +// This allows us to test very large problem sizes without actually allocating +// large amounts of memory that would exceed most devices' capacity. 
+struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type *; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. This needs to be done + // explicitly from the host. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. This needs to be + // done explicitly from the host. + void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ assert_sequence_iterator + clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. 
+// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. +struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator &&it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() { iter.free_shared_state(); } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType &) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{1, + static_cast(num_values), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType &e, + output_t const &, + output_t const &test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<> // - no extra args + >; + + static postfix_args_type generate_postfix_args() + { + return std::tuple>{}; + } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // end anon namespace + +void 
test_large_indices_default_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_default_scan_op); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - custom binary op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::make_tuple(thrust::maximum<>{})}; + } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_from_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::simple + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // end anon namespace + +void test_large_indices_custom_scan_op() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(test_large_indices_custom_scan_op); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu new file mode 100644 index 000000000..00bb8b461 --- /dev/null +++ b/testing/async/inclusive_scan/large_types.cu @@ -0,0 +1,58 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + 
+#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct test_large_types +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + } +}; +DECLARE_UNITTEST(test_large_types); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu new file mode 100644 index 000000000..57931c8d0 --- /dev/null +++ b/testing/async/inclusive_scan/mixed_types.cu @@ -0,0 +1,109 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - thrust::plus T-type | (int, float, void>) +// +// The thrust::plus types are covered by the +// scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. 
+ +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple>, // - plus<> + std::tuple>, // - plus + std::tuple> // - plus + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, + std::make_tuple(thrust::plus<>{}), + std::make_tuple(thrust::plus{}), + std::make_tuple(thrust::plus{})}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + // Use almost_equal instead of almost_equal_if_fp because floating point + // addition may be hidden in the scan_op (thrust::plus is always + // tested). + , testing::async::mixin::compare_outputs::assert_almost_equal +{ + static std::string description() + { + return "mixed input/output/functor value_type tests"; + } +}; + +} // namespace + +void test_scan_mixed_types(size_t num_values) +{ + // Since fp addition is non-associative, the results may be slightly off + // from the reference. + // This is primarily handled by using `compare_almost_equal` to do a fuzzy + // comparison. But for large enough test sizes, eventually the scan results + // will wrap for integral value_types. 
If a float accumulator is used, the + // small errors from non-associative addition may cause the wrap to happen in + // a different location, resulting in an error too large for almost_equal to + // ignore. + // This wrap seems to happen around 2^16 values, so skip when num_values is + // close to that. + if (num_values > ((1ll << 16) - 10)) + { + return; + } + + // invoker template params are input_value_type, output_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); +} +DECLARE_SIZED_UNITTEST(test_scan_mixed_types); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h new file mode 100644 index 000000000..82ecd59b8 --- /dev/null +++ b/testing/async/inclusive_scan/mixin.h @@ -0,0 +1,115 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace inclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{std::tuple<>{}, std::make_tuple(alternate_binary_op{})}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + 
input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct simple +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::inclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace inclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/inclusive_scan/simple.cu b/testing/async/inclusive_scan/simple.cu new file mode 100644 index 000000000..1256f009b --- /dev/null +++ b/testing/async/inclusive_scan/simple.cu @@ -0,0 +1,70 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct simple_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple invocation with device vectors"; + } +}; + +template +struct test_simple +{ + void operator()(std::size_t num_values) const + { + 
testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. +template > +struct simple_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "simple in-place invocation with device vectors"; + } +}; + +template +struct test_simple_in_place +{ + void operator()(std::size_t num_values) const + { + using invoker = simple_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu new file mode 100644 index 000000000..224c29303 --- /dev/null +++ b/testing/async/inclusive_scan/stateful_operator.cu @@ -0,0 +1,61 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return postfix_args_type{ + 
std::make_tuple(stateful_operator{value_type{2}})}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::simple + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct test_stateful_operator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu new file mode 100644 index 000000000..9789ce5c9 --- /dev/null +++ b/testing/async/inclusive_scan/using_vs_adl.cu @@ -0,0 +1,169 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. 
+ // This is checked by ensuring that the call returns an output iterator. + OutIter result = + inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. 
+ using thrust::async::inclusive_scan; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void test_using_namespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_namespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::inclusive_scan`"; + } +}; + +void test_using_cpo() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(test_using_cpo); + +#endif // C++14 diff --git a/testing/async/mixin.h b/testing/async/mixin.h new file mode 100644 index 000000000..6d1c06ed7 --- /dev/null +++ b/testing/async/mixin.h @@ -0,0 +1,663 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include +#include + +// clang-format off + +// This file contains a 
set of mix-in classes that define an algorithm +// definition for use with test_policy_overloads. The algorithm +// definition describes the details of a thrust::async algorithm invocation: +// +// - Input type and initialization +// - Output type and initialization (supports in-place, too) +// - Postfix arguments that define the algorithm's overload set +// - Abstracted invocation of the async algorithm +// - Abstracted invocation of a reference algorithm +// - Validation of async vs. reference output +// - A description string. +// +// This definition is used by test_policy_overloads to test each overload +// against a reference while injecting a variety of execution policies. This +// validates that each overload behaves correctly according to some reference. +// +// Since much of the algorithm definition is generic and may be reused in +// multiple tests with slight changes, a mix-in system is used to simplify +// the creation of algorithm definitions. The following namespace hierarchy is +// used to organize these generic components: +// +// * testing::async::mixin:: +// ** ::input - Input types/values (device vectors, counting iterators, etc) +// ** ::output - Output types/values (device vectors, inplace device vectors, +// discard iterators, etc) +// ** ::postfix_args - Algorithm specific overload sets +// ** ::invoke_reference - Algorithm specific reference invocation +// ** ::invoke_async - Algorithm specific async algo invocation +// ** ::compare_outputs - Compare output values. +// +// Each algorithm should define its own `mixin.h` header to declare algorithm +// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async) +// in a testing::async::<algorithm_name>::mixin namespace structure.
+// +// For example, the test.async.exclusive_scan.basic test uses the following +// algorithm definition from mix-ins: +// +// ``` +// #include +// #include +// #include +// template > +// struct basic_invoker +// : testing::async::mixin::input::device_vector +// , testing::async::mixin::output::device_vector +// , testing::async::exclusive_scan::mixin::postfix_args:: +// all_overloads +// , testing::async::exclusive_scan::mixin::invoke_reference:: +// host_synchronous +// , testing::async::exclusive_scan::mixin::invoke_async::basic +// , testing::async::mixin::compare_outputs::assert_equal_quiet +// { +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// }; +// +// ... +// +// testing::async::test_policy_overloads>::run(num_values); +// ``` +// +// The basic_invoker class expands to something similar to the following: +// +// ``` +// template > +// struct basic_invoker +// { +// public: +// +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::input::device_vector +// // +// // input_type must provide idiomatic definitions of: +// // - `using iterator = ...;` +// // - `iterator begin() const { ... }` +// // - `iterator end() const { ... }` +// // - `size_t size() const { ... }` +// using input_type = thrust::device_vector; +// +// // Generate an instance of the input: +// static input_type generate_input(std::size_t num_values) +// { +// input_type input(num_values); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::output::device_vector +// // +// // output_type must provide idiomatic definitions of: +// // - `using iterator = ...;` +// // - `iterator begin() { ... 
}` +// using output_type = thrust::device_vector; +// +// // Generate an instance of the output: +// // Might be more complicated, e.g. fancy iterators, etc +// static output_type generate_output(std::size_t num_values) +// { +// return output_type(num_values); +// } +// +// //------------------------------------------------------------------------- +// // testing::async::exclusive_scan::mixin::postfix_args::all_overloads +// using postfix_args_type = std::tuple< // List any extra arg overloads: +// std::tuple<>, // - no extra args +// std::tuple<initial_value_type>, // - initial_value +// std::tuple<initial_value_type, alternate_binary_op> // - initial_value, binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return postfix_args_type{ +// std::tuple<>{}, // no extra args +// std::make_tuple(initial_value_type{42}), // initial_value +// // initial_value, binary_op: +// std::make_tuple(initial_value_type{57}, alternate_binary_op{}) +// }; +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous +// // +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any trailing arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support a "no extra args" overload, since the parameter pack expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input.cbegin(), +// input.cend()); +// thrust::host_vector host_output(host_input.size()); +// +// // Run host synchronous algorithm to generate reference.
+// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic +// // +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow +// // the test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::compare_outputs::assert_equal_quiet +// // +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// // matches the testing output. 
+// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +// }; +// ``` +// +// Similar invokers with slight tweaks are used in other +// async/exclusive_scan/*.cu tests. + +// clang-format on + +namespace testing +{ +namespace async +{ +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace input +{ + +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + static_cast(1), + static_cast(1)); + return input; + } +}; + +template +struct counting_iterator_from_0 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(0)}; } + iterator cbegin() const { return iterator{static_cast(0)}; } + + iterator end() const { return iterator{static_cast(num_values)}; } + iterator cend() const { return iterator{static_cast(num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct counting_iterator_from_1 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() const { return iterator{static_cast(1 + num_values)}; } + iterator cend() const { return iterator{static_cast(1 + num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct constant_iterator_1 +{ + struct input_type + { + using 
iterator = thrust::constant_iterator; + + std::size_t num_values; + + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() const + { + return iterator{static_cast(1)} + num_values; + } + iterator cend() const + { + return iterator{static_cast(1)} + num_values; + } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +} // namespace input + +//------------------------------------------------------------------------------ +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + template + static output_type generate_output(std::size_t num_values, + InputType& /* unused */) + { + return output_type(num_values); + } +}; + +template +struct device_vector_reuse_input +{ + using output_type = thrust::device_vector&; + + template + static output_type generate_output(std::size_t /*num_values*/, + InputType& input) + { + return input; + } +}; + +struct discard_iterator +{ + struct output_type + { + using iterator = thrust::discard_iterator<>; + + iterator begin() const { return thrust::make_discard_iterator(); } + iterator cbegin() const { return thrust::make_discard_iterator(); } + }; + + template + static output_type generate_output(std::size_t /* num_values */, + InputType& /* input */) + { + return output_type{}; + } +}; + +} // namespace output + +//------------------------------------------------------------------------------ +namespace postfix_args +{ +/* Defined per algorithm. 
Example: + * + * // Defines several overloads: + * // algorithm([policy,] input, output) // no postfix args + * // algorithm([policy,] input, output, initial_value) + * // algorithm([policy,] input, output, initial_value, binary_op) + * template > + * struct all_overloads + * { + * using postfix_args_type = std::tuple< // List any extra arg overloads: + * std::tuple<>, // - no extra args + * std::tuple<initial_value_type>, // - initial_value + * std::tuple<initial_value_type, alternate_binary_op> // - initial_value, binary_op + * >; + * + * static postfix_args_type generate_postfix_args() + * { + * return postfix_args_type{ + * std::tuple<>{}, // no extra args + * std::make_tuple(initial_value_type{42}), // initial_value + * // initial_value, binary_op: + * std::make_tuple(initial_value_type{57}, alternate_binary_op{}) + * }; + * } + * }; + */ +} + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +/* Defined per algorithm. Example: + * + * template + * struct host_synchronous + * { + * template + * static void invoke_reference(InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * // Create host versions of the input/output: + * thrust::host_vector host_input(input.cbegin(), + * input.cend()); + * thrust::host_vector host_output(host_input.size()); + * + * // Run host synchronous algorithm to generate reference. + * // Be sure to call a backend that doesn't use the same underlying + * // implementation. + * thrust::exclusive_scan(host_input.cbegin(), + * host_input.cend(), + * host_output.begin(), + * std::get( + * THRUST_FWD(postfix_tuple))...); + * + * // Copy back to device. + * output = host_output; + * } + * }; + * + */ + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void invoke_reference(Ts&&...)
+ {} +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +/* Defined per algorithm. Example: + * + * struct basic + * { + * template + * static auto invoke_async(PrefixArgTuple&& prefix_tuple, + * std::index_sequence, + * InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * auto e = thrust::async::exclusive_scan( + * std::get(THRUST_FWD(prefix_tuple))..., + * input.cbegin(), + * input.cend(), + * output.begin(), + * std::get(THRUST_FWD(postfix_tuple))...); + * return e; + * } + * }; + */ + +} // namespace invoke_async + +//------------------------------------------------------------------------------ +namespace compare_outputs +{ + +namespace detail +{ + +void basic_event_validation(thrust::device_event& e) +{ + TEST_EVENT_WAIT(e); +} + +template +void basic_event_validation(thrust::device_future& f) +{ + TEST_FUTURE_VALUE_RETRIEVAL(f); +} + +} // namespace detail + +struct assert_equal +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } +}; + +struct assert_almost_equal +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types. Since fp +// addition is non-associative, this is sometimes necessary. 
+struct assert_almost_equal_if_fp +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_almost_equal_if_fp_quiet +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +// Used to save time when testing unverifiable invocations (discard_iterators). +// Just does basic validation of the future/event. 
+struct noop +{ + template + static void compare_outputs(EventType &e, Ts&&...) + { + detail::basic_event_validation(e); + } +}; + +} // namespace compare_outputs + +} // namespace mixin +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h new file mode 100644 index 000000000..b7bf1ab94 --- /dev/null +++ b/testing/async/test_policy_overloads.h @@ -0,0 +1,410 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include + +// TODO Cover these cases from testing/async_reduce.cu: +// - [x] test_async_reduce_after ("after_future" in test_policy_overloads) +// - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195) +// - [ ] all the child variants (e.g. with allocator) too +// - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in) +// - [ ] test_async_reduce_caching (only useful when returning future) + +namespace testing +{ + +namespace async +{ + +// Tests that policies are handled correctly for all overloads of an async +// algorithm. +// +// The AlgoDef parameter type defines an async algorithm, its overloads, and +// abstracts its invocation. See async/mixin.h for a documented example of +// this interface and some convenience mixins that can be used to construct a +// definition quickly. +// +// The AlgoDef interface is used to run several tests of the algorithm, +// exhaustively testing all overloads for algorithm correctness and proper +// policy handling.
+// +// ## Basic tests +// +// In the basic tests, each overload is called repeatedly with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// The output of the async algorithm is compared against a reference output, +// and the returned event/future is tested to make sure it holds a reference to +// the expected stream. +// +// ## After Future tests +// +// The after_future tests check that the future/event returned from an algorithm +// behaves properly when consumed by a policy's `.after` method. +template +struct test_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + static constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Main entry point; call this from a unit test function. + static void run(std::size_t num_values) + { + test_postfix_overloads(num_values); + } + +private: + template + using size_const = std::integral_constant; + + //---------------------------------------------------------------------------- + // Recursively call sub tests for each overload set in postfix_args: + template + static void test_postfix_overloads(std::size_t const num_values, + size_const = {}) + { + static_assert(PostfixIdx < num_postfix_arg_sets, "Internal error."); + + run_basic_policy_tests(num_values); + run_after_future_tests(num_values); + + // Recurse to test next round of overloads: + test_postfix_overloads(num_values, size_const{}); + } + + static void test_postfix_overloads(std::size_t const, + size_const) + { + // terminal case, no-op + } + + //---------------------------------------------------------------------------- + // For the specified postfix overload set, test the algorithm with several + // different policy configurations. 
+ template + static void run_basic_policy_tests(std::size_t const num_values) + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event: + auto using_default_stream = [](auto& e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // When a policy uses a non-default stream, the implementation should pass + // the stream through to the output: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto& e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + basic_policy_test("(no policy)", + std::make_tuple(), + using_default_stream, + num_values); + + basic_policy_test("thrust::device", + std::make_tuple(thrust::device), + using_default_stream, + num_values); + + basic_policy_test( + "thrust::device(thrust::device_allocator{})", + std::make_tuple(thrust::device(thrust::device_allocator{})), + using_default_stream, + num_values); + + basic_policy_test("thrust::device.on(test_stream.get())", + std::make_tuple( + thrust::device.on(test_stream.get())), + using_test_stream, + num_values); + + basic_policy_test( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream, + num_values); + } + + // Invoke the algorithm multiple times with the provided policy and validate + // the results. + template + static void basic_policy_test(std::string const &policy_desc, + PrefixArgTuple &&prefix_tuple_ref, + ValidateEvent const &validate, + std::size_t num_values) + try + { + // Sink the prefix tuple into a const local so it can be safely passed to + // multiple invocations without worrying about potential modifications. 
+ using prefix_tuple_type = thrust::remove_cvref_t; + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_d = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_d = algo_def::generate_output(num_values, input_d); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_d, + output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + 
algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + // These wait on the e_X events: + algo_def::compare_outputs(e_a, output_ref, output_a); + algo_def::compare_outputs(e_b, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + algo_def::compare_outputs(e_d, output_ref, output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using overload_t = std::tuple_element_t; + + std::string const overload_desc = + unittest::demangle(typeid(overload_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = basic_policy\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << overload_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + //---------------------------------------------------------------------------- + // Test .after(event/future) handling: + template + static void run_after_future_tests(std::size_t const num_values) + try + { + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples. Prefix size always = 1 here, + // since the async algorithms are always invoked with a single prefix + // arg (the execution policy) here. 
+ constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence<1>; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_tmp = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_tmp = algo_def::generate_output(num_values, input_tmp); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device), + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_a.valid_stream()); + auto const stream_a = e_a.stream().native_handle(); + + // Execution on default stream should create a new stream in the result: + ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a); + + //-------------------------------------------------------------------------- + // Test event consumption when the event is an rvalue. + //-------------------------------------------------------------------------- + // Using `forward_as_tuple` instead of `make_tuple` to explicitly control + // value categories. 
+ // Explicitly order this invocation after e_a: + auto e_b = + algo_def::invoke_async(std::forward_as_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_b.valid_stream()); + auto const stream_b = e_b.stream().native_handle(); + + // Second invocation should use same stream as before: + ASSERT_EQUAL_QUIET(stream_a, stream_b); + + // Verify that double consumption of e_a produces an exception: + ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async( + std::forward_as_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + //-------------------------------------------------------------------------- + // Test event consumption when the event is an lvalue + //-------------------------------------------------------------------------- + // Explicitly order this invocation after e_b: + auto policy_after_e_b = thrust::device.after(e_b); + auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b); + auto e_c = + algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_c.valid_stream()); + auto const stream_c = e_c.stream().native_handle(); + + // Should use same stream as e_b: + ASSERT_EQUAL_QUIET(stream_b, stream_c); + + // Verify that double consumption of e_b produces an exception: + ASSERT_THROWS_EQUAL( + auto x = algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + // 
Validate results + // Use e_c for all three checks -- e_a and e_b will not pass the event + // checks since their streams were stolen by dependencies. + algo_def::compare_outputs(e_c, output_ref, output_a); + algo_def::compare_outputs(e_c, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using postfix_t = std::tuple_element_t; + + std::string const postfix_desc = + unittest::demangle(typeid(postfix_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = after_future\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << postfix_desc << "\n" + << " - num_values = " << num_values; + throw; + } + + //---------------------------------------------------------------------------- + // Various helper functions: + template + static auto get_postfix_tuple() + { + return std::get(algo_def::generate_postfix_args()); + } +}; + +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async_copy.cu b/testing/async_copy.cu index 338b94e1a..2666a6c38 100644 --- a/testing/async_copy.cu +++ b/testing/async_copy.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -18,7 +18,7 @@ auto operator()( \ ForwardIt&& first, Sentinel&& last, OutputIt&& output \ ) const \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::async::copy( \ __VA_ARGS__ \ THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__)) \ @@ -101,7 +101,7 @@ struct test_async_copy_device_to_host void operator()(std::size_t n) { 
thrust::host_vector h0(unittest::random_integers(n)); - thrust::device_vector h1(n); + thrust::host_vector h1(n); thrust::device_vector d0(n); thrust::copy(h0.begin(), h0.end(), d0.begin()); @@ -267,6 +267,11 @@ struct test_async_copy_counting_iterator_input_to_host_vector f0.wait(); ASSERT_EQUAL(d0, d1); + + #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL) + // ICC fails this for some unknown reason - see #1468. + KNOWN_FAILURE; + #endif } }; }; @@ -319,6 +324,84 @@ DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME( /////////////////////////////////////////////////////////////////////////////// +template +struct test_async_copy_after +{ + __host__ + void operator()(std::size_t n) + { + thrust::host_vector h0(unittest::random_integers(n)); + thrust::host_vector h1(n); + thrust::device_vector d0(n); + thrust::device_vector d1(n); + thrust::device_vector d2(n); + + auto e0 = thrust::async::copy( + h0.begin(), h0.end(), d0.begin() + ); + + ASSERT_EQUAL(true, e0.valid_stream()); + + auto const e0_stream = e0.stream().native_handle(); + + auto e1 = thrust::async::copy( + thrust::device.after(e0), d0.begin(), d0.end(), d1.begin() + ); + + // Verify that double consumption of a future produces an exception. + ASSERT_THROWS_EQUAL( + auto x = thrust::async::copy( + thrust::device.after(e0), d0.begin(), d0.end(), d1.begin() + ); + THRUST_UNUSED_VAR(x) + , thrust::event_error + , thrust::event_error(thrust::event_errc::no_state) + ); + + ASSERT_EQUAL_QUIET(e0_stream, e1.stream().native_handle()); + + auto after_policy2 = thrust::device.after(e1); + + auto e2 = thrust::async::copy( + thrust::host, after_policy2 + , h0.begin(), h0.end(), d2.begin() + ); + + // Verify that double consumption of a policy produces an exception. 
+ ASSERT_THROWS_EQUAL( + auto x = thrust::async::copy( + thrust::host, after_policy2 + , h0.begin(), h0.end(), d2.begin() + ); + THRUST_UNUSED_VAR(x) + , thrust::event_error + , thrust::event_error(thrust::event_errc::no_state) + ); + + ASSERT_EQUAL_QUIET(e0_stream, e2.stream().native_handle()); + + auto e3 = thrust::async::copy( + thrust::device.after(e2), thrust::host + , d1.begin(), d1.end(), h1.begin() + ); + + ASSERT_EQUAL_QUIET(e0_stream, e3.stream().native_handle()); + + TEST_EVENT_WAIT(e3); + + ASSERT_EQUAL(h0, h1); + ASSERT_EQUAL(h0, d0); + ASSERT_EQUAL(h0, d1); + ASSERT_EQUAL(h0, d2); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES( + test_async_copy_after +, BuiltinNumericTypes +); + +/////////////////////////////////////////////////////////////////////////////// + // TODO: device_to_device NonContiguousIterator output (discard_iterator). // TODO: host_to_device non trivially relocatable. diff --git a/testing/async_for_each.cu b/testing/async_for_each.cu index 7ed033e9e..a09adf255 100644 --- a/testing/async_for_each.cu +++ b/testing/async_for_each.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include @@ -16,7 +16,7 @@ auto operator()( \ ForwardIt&& first, Sentinel&& last, UnaryFunction&& f \ ) const \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::async::for_each( \ __VA_ARGS__ \ THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__)) \ diff --git a/testing/async_reduce.cmake b/testing/async_reduce.cmake new file mode 100644 index 000000000..44c0fbda1 --- /dev/null +++ b/testing/async_reduce.cmake @@ -0,0 +1,4 @@ +# Disable unreachable code warnings. +# This test unconditionally throws in some places, the compiler will detect that +# control flow will never reach some instructions. This is intentional. 
+target_link_libraries(${test_target} PRIVATE thrust.silence_unreachable_code_warnings) diff --git a/testing/async_reduce.cu b/testing/async_reduce.cu index 5987fe6ae..c033c2311 100644 --- a/testing/async_reduce.cu +++ b/testing/async_reduce.cu @@ -2,7 +2,7 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -76,7 +76,7 @@ struct custom_plus auto operator()( \ ForwardIt&& first, Sentinel&& last \ ) \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::reduce( \ __VA_ARGS__ \ ) \ @@ -975,6 +975,8 @@ struct test_async_reduce_allocator_on_then_after KNOWN_FAILURE; // FIXME: The below fails because you can't combine allocator attachment, // `.on`, and `.after`. + // The `#if 0` can be removed once the KNOWN_FAILURE is resolved. +#if 0 ASSERT_EQUAL_QUIET(stream1, f2.stream().native_handle()); // This potentially runs concurrently with the copies. @@ -986,6 +988,7 @@ struct test_async_reduce_allocator_on_then_after thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream0)); thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream1)); +#endif } }; DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES( diff --git a/testing/async_reduce_into.cu b/testing/async_reduce_into.cu index 0800a1a50..a4a2be99e 100644 --- a/testing/async_reduce_into.cu +++ b/testing/async_reduce_into.cu @@ -2,7 +2,7 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -77,7 +77,7 @@ struct custom_plus auto operator()( \ ForwardIt&& first, Sentinel&& last \ ) \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::reduce( \ __VA_ARGS__ \ ) \ diff --git a/testing/async_sort.cu b/testing/async_sort.cu index 626e21c3c..c5cfeae23 100644 --- a/testing/async_sort.cu +++ b/testing/async_sort.cu @@ -1,6 +1,13 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +// Disabled on MSVC && NVCC < 11.1 for GH issue #1098. 
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && defined(__CUDACC__) +#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1) +#define THRUST_BUG_1098_ACTIVE +#endif // NVCC version check +#endif // MSVC + NVCC check + +#if THRUST_CPP_DIALECT >= 2014 && !defined(THRUST_BUG_1098_ACTIVE) #include @@ -48,7 +55,7 @@ struct custom_greater static auto async( \ ForwardIt&& first, Sentinel&& last \ ) \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::async::sort( \ __VA_ARGS__ \ THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__)) \ @@ -89,7 +96,7 @@ DEFINE_SORT_INVOKER( static auto async( \ ForwardIt&& first, Sentinel&& last \ ) \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::async::sort( \ __VA_ARGS__ \ THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__)) \ @@ -147,7 +154,7 @@ struct test_async_sort d0_data.begin(), d0_data.end() ); - if (wait_for_futures == WaitPolicy) + THRUST_IF_CONSTEXPR(wait_for_futures == WaitPolicy) { f0.wait(); diff --git a/testing/async_transform.cu b/testing/async_transform.cu index 328a4e563..efaa885f0 100644 --- a/testing/async_transform.cu +++ b/testing/async_transform.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -78,7 +78,7 @@ struct divide_by_2 ForwardIt&& first, Sentinel&& last, OutputIt&& output \ , UnaryOperation&& op \ ) \ - THRUST_DECLTYPE_RETURNS( \ + THRUST_RETURNS( \ ::thrust::transform( \ __VA_ARGS__ \ ) \ diff --git a/testing/binary_search.cu b/testing/binary_search.cu index d83e6acbc..2aceb8645 100644 --- a/testing/binary_search.cu +++ b/testing/binary_search.cu @@ -291,3 +291,57 @@ void TestScalarEqualRangeDispatchImplicit() DECLARE_UNITTEST(TestScalarEqualRangeDispatchImplicit); THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + +void TestBoundsWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + 
(1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::detail::intmax_t distance_low_value = thrust::distance( + begin, + thrust::lower_bound( + thrust::device, + begin, + end, + 17)); + + thrust::detail::intmax_t distance_high_value = thrust::distance( + begin, + thrust::lower_bound( + thrust::device, + begin, + end, + (1ll << magnitude) - 17)); + + ASSERT_EQUAL(distance_low_value, 16); + ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18); + + distance_low_value = thrust::distance( + begin, + thrust::upper_bound( + thrust::device, + begin, + end, + 17)); + + distance_high_value = thrust::distance( + begin, + thrust::upper_bound( + thrust::device, + begin, + end, + (1ll << magnitude) - 17)); + + ASSERT_EQUAL(distance_low_value, 17); + ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 17); +} + +void TestBoundsWithBigIndexes() +{ + TestBoundsWithBigIndexesHelper(30); + TestBoundsWithBigIndexesHelper(31); + TestBoundsWithBigIndexesHelper(32); + TestBoundsWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestBoundsWithBigIndexes); diff --git a/testing/binary_search_descending.cu b/testing/binary_search_descending.cu index 5228c4567..08294c044 100644 --- a/testing/binary_search_descending.cu +++ b/testing/binary_search_descending.cu @@ -22,16 +22,16 @@ void TestScalarLowerBoundDescendingSimple(void) vec[3] = 2; vec[4] = 0; - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 0, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 1, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 2, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 3, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 4, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), 
vec.end(), 5, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 6, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), 7, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 8, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 9, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{0}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{1}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{2}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{3}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{4}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{5}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{6}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), T{7}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{8}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{9}, thrust::greater())); } DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundDescendingSimple); @@ -49,16 +49,16 @@ void TestScalarUpperBoundDescendingSimple(void) vec[3] = 2; vec[4] = 0; - ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), 0, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 1, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 
2, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 3, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 4, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 5, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 6, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 7, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), 8, thrust::greater())); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), 9, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), T{0}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{1}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{2}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{3}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{4}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{5}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{6}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{7}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), T{8}, thrust::greater())); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), T{9}, thrust::greater())); } DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundDescendingSimple); @@ -76,16 +76,16 @@ void TestScalarBinarySearchDescendingSimple(void) vec[3] = 
2; vec[4] = 0; - ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), 0, thrust::greater())); - ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 1, thrust::greater())); - ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), 2, thrust::greater())); - ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 3, thrust::greater())); - ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 4, thrust::greater())); - ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), 5, thrust::greater())); - ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 6, thrust::greater())); - ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), 7, thrust::greater())); - ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), 8, thrust::greater())); - ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 9, thrust::greater())); + ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), T{0}, thrust::greater())); + ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{1}, thrust::greater())); + ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), T{2}, thrust::greater())); + ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{3}, thrust::greater())); + ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{4}, thrust::greater())); + ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), T{5}, thrust::greater())); + ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{6}, thrust::greater())); + ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), T{7}, thrust::greater())); + ASSERT_EQUAL(true, thrust::binary_search(vec.begin(), vec.end(), T{8}, thrust::greater())); + ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{9}, thrust::greater())); } DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchDescendingSimple); @@ -103,27 +103,27 @@ void 
TestScalarEqualRangeDescendingSimple(void) vec[3] = 2; vec[4] = 0; - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater()).first); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater()).first); - - ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater()).second); - 
ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater()).second); - ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater()).first); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater()).first); + + ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 3, 
thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater()).second); + ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater()).second); } DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeDescendingSimple); diff --git a/testing/binary_search_vector.cu b/testing/binary_search_vector.cu index d9a261c45..5e8f8358e 100644 --- a/testing/binary_search_vector.cu +++ b/testing/binary_search_vector.cu @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -16,7 +17,8 @@ template struct vector_like { typedef typename ExampleVector::allocator_type alloc; - typedef typename alloc::template rebind::other new_alloc; + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::template rebind_alloc new_alloc; typedef thrust::detail::vector_base type; }; diff --git a/testing/binary_search_vector_descending.cu b/testing/binary_search_vector_descending.cu index 88ec5a3e3..edc70663a 100644 --- a/testing/binary_search_vector_descending.cu +++ b/testing/binary_search_vector_descending.cu @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -14,7 +15,8 @@ template struct vector_like { typedef typename ExampleVector::allocator_type alloc; - typedef typename alloc::template rebind::other new_alloc; + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::template rebind_alloc new_alloc; typedef thrust::detail::vector_base type; }; diff --git a/testing/caching_allocator.cu 
b/testing/caching_allocator.cu new file mode 100644 index 000000000..f98ea336b --- /dev/null +++ b/testing/caching_allocator.cu @@ -0,0 +1,23 @@ +#include + +#include +#include + +template +void test_implementation(Allocator alloc) +{ + typedef typename thrust::detail::allocator_traits Traits; + typedef typename Allocator::pointer Ptr; + + Ptr p = Traits::allocate(alloc, 123); + Traits::deallocate(alloc, p, 123); + + Ptr p2 = Traits::allocate(alloc, 123); + ASSERT_EQUAL(p, p2); +} + +void TestSingleDeviceTLSCachingAllocator() +{ + test_implementation(thrust::detail::single_device_tls_caching_allocator()); +}; +DECLARE_UNITTEST(TestSingleDeviceTLSCachingAllocator); diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt new file mode 100644 index 000000000..71798de75 --- /dev/null +++ b/testing/cmake/CMakeLists.txt @@ -0,0 +1,37 @@ +thrust_update_system_found_flags() + +set(extra_cmake_flags) + +# Need to pass these when testing NVC++. +if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + set(extra_cmake_flags + -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}" + -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}" + ) +endif() + +if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND) + # Test that we can use `find_package` on an installed Thrust: + add_test( + NAME thrust.test.cmake.test_install + COMMAND "${CMAKE_COMMAND}" + --log-level=VERBOSE + -G "${CMAKE_GENERATOR}" + -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install" + -B "${CMAKE_CURRENT_BINARY_DIR}/test_install" + -D "THRUST_BINARY_DIR=${Thrust_BINARY_DIR}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" + -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + ${extra_cmake_flags} + ) +endif() + +# Check source code for issues that can be found by pattern matching: +add_test( + NAME thrust.test.cmake.check_source_files + COMMAND + "${CMAKE_COMMAND}" + -D "Thrust_SOURCE_DIR=${Thrust_SOURCE_DIR}" + -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake" +) 
diff --git a/testing/cmake/check_source_files.cmake b/testing/cmake/check_source_files.cmake new file mode 100644 index 000000000..900300c67 --- /dev/null +++ b/testing/cmake/check_source_files.cmake @@ -0,0 +1,185 @@ +# Check all source files for various issues that can be detected using pattern +# matching. +# +# This is run as a ctest test named `thrust.test.cmake.check_source_files`, or +# manually with: +# cmake -D "Thrust_SOURCE_DIR=" -P check_source_files.cmake + +cmake_minimum_required(VERSION 3.15) + +function(count_substrings input search_regex output_var) + string(REGEX MATCHALL "${search_regex}" matches "${input}") + list(LENGTH matches num_matches) + set(${output_var} ${num_matches} PARENT_SCOPE) +endfunction() + +set(found_errors 0) +file(GLOB_RECURSE thrust_srcs + RELATIVE "${Thrust_SOURCE_DIR}" + "${Thrust_SOURCE_DIR}/thrust/*.h" + "${Thrust_SOURCE_DIR}/thrust/*.inl" +) + +################################################################################ +# Namespace checks. +# Check all files in thrust to make sure that they use +# THRUST_NAMESPACE_BEGIN/END instead of bare `namespace thrust {}` declarations. +set(namespace_exclusions + # This defines the macros and must have bare namespace declarations: + thrust/detail/config/namespace.h +) + +set(bare_ns_regex "namespace[ \n\r\t]+thrust[ \n\r\t]*\\{") + +# Validation check for the above regex: +count_substrings([=[ +namespace thrust{ +namespace thrust { +namespace thrust { + namespace thrust { +namespace thrust +{ +namespace +thrust +{ +]=] + ${bare_ns_regex} valid_count) +if (NOT valid_count EQUAL 6) + message(FATAL_ERROR "Validation of bare namespace regex failed: " + "Matched ${valid_count} times, expected 6.") +endif() + +################################################################################ +# stdpar header checks. +# Check all files in Thrust to make sure that they aren't including +# or , both of which will cause circular dependencies in nvc++'s +# stdpar library. 
+# +# The headers following headers should be used instead: +# -> +# -> +# +set(stdpar_header_exclusions + # The wrappers are allowed to include the unwrapped headers + thrust/detail/algorithm_wrapper.h + thrust/detail/memory_wrapper.h + thrust/detail/numeric_wrapper.h +) + +set(algorithm_regex "#[ \t]*include[ \t]+") +set(memory_regex "#[ \t]*include[ \t]+") +set(numeric_regex "#[ \t]*include[ \t]+") + +# Validation check for the above regex pattern: +count_substrings([=[ +#include +# include +#include +# include +# include // ... +]=] + ${algorithm_regex} valid_count) +if (NOT valid_count EQUAL 5) + message(FATAL_ERROR "Validation of stdpar header regex failed: " + "Matched ${valid_count} times, expected 5.") +endif() + +################################################################################ +# Legacy macro checks. +# Check all files in Thrust to make sure that they aren't using the legacy +# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. +# +# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. +# They are provided for legacy purposes and should be replaced with +# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. 
+# +# +set(legacy_macro_header_exclusions + # This header defines a legacy CUDART macro: + thrust/system/cuda/config.h +) + +set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") +set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") + +################################################################################ +# Read source files: +foreach(src ${thrust_srcs}) + file(READ "${Thrust_SOURCE_DIR}/${src}" src_contents) + + if (NOT ${src} IN_LIST namespace_exclusions) + count_substrings("${src_contents}" "${bare_ns_regex}" bare_ns_count) + count_substrings("${src_contents}" THRUST_NS_PREFIX prefix_count) + count_substrings("${src_contents}" THRUST_NS_POSTFIX postfix_count) + count_substrings("${src_contents}" THRUST_NAMESPACE_BEGIN begin_count) + count_substrings("${src_contents}" THRUST_NAMESPACE_END end_count) + count_substrings("${src_contents}" "#include " header_count) + + if (NOT bare_ns_count EQUAL 0) + message("'${src}' contains 'namespace thrust {...}'. Replace with THRUST_NAMESPACE macros.") + set(found_errors 1) + endif() + + if (NOT prefix_count EQUAL 0) + message("'${src}' contains 'THRUST_NS_PREFIX'. Replace with THRUST_NAMESPACE macros.") + set(found_errors 1) + endif() + + if (NOT postfix_count EQUAL 0) + message("'${src}' contains 'THRUST_NS_POSTFIX'. 
Replace with THRUST_NAMESPACE macros.") + set(found_errors 1) + endif() + + if (NOT begin_count EQUAL end_count) + message("'${src}' namespace macros are unbalanced:") + message(" - THRUST_NAMESPACE_BEGIN occurs ${begin_count} times.") + message(" - THRUST_NAMESPACE_END occurs ${end_count} times.") + set(found_errors 1) + endif() + + if (begin_count GREATER 0 AND header_count EQUAL 0) + message("'${src}' uses Thrust namespace macros, but does not (directly) `#include `.") + set(found_errors 1) + endif() + endif() + + if (NOT ${src} IN_LIST stdpar_header_exclusions) + count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count) + count_substrings("${src_contents}" "${memory_regex}" memory_count) + count_substrings("${src_contents}" "${numeric_regex}" numeric_count) + + if (NOT algorithm_count EQUAL 0) + message("'${src}' includes the header. Replace with .") + set(found_errors 1) + endif() + + if (NOT memory_count EQUAL 0) + message("'${src}' includes the header. Replace with .") + set(found_errors 1) + endif() + + if (NOT numeric_count EQUAL 0) + message("'${src}' includes the header. Replace with .") + set(found_errors 1) + endif() + endif() + + if (NOT ${src} IN_LIST legacy_macro_header_exclusions) + count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) + count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) + + if (NOT thrust_count EQUAL 0) + message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + + if (NOT cub_count EQUAL 0) + message("'${src}' uses CUB_RUNTIME_ENABLED. 
Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + endif() +endforeach() + +if (NOT found_errors EQUAL 0) + message(FATAL_ERROR "Errors detected.") +endif() diff --git a/testing/cmake/test_install/CMakeLists.txt b/testing/cmake/test_install/CMakeLists.txt new file mode 100644 index 000000000..30cf8405c --- /dev/null +++ b/testing/cmake/test_install/CMakeLists.txt @@ -0,0 +1,110 @@ +# Test that an installation of the project can be located by find_package() call +# with appropriate prefix settings. +# +# Expects THRUST_BINARY_DIR to be set to an existing thrust build directory. + +cmake_minimum_required(VERSION 3.15) + +project(ThrustTestInstall CXX CUDA) + +# This will eventually get deleted recursively -- keep that in mind if modifying +set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/") + +function(do_manual_install) + # Inspired by the VTK-m install tests, we can just glob up all of the + # cmake_install.cmake, include (ie. run) them, and they'll effectively + # install the project into the current value of CMAKE_INSTALL_PREFIX. 
+ + # Gather all of the install files from Thrust's root: + file(GLOB_RECURSE install_files + LIST_DIRECTORIES False + "${THRUST_BINARY_DIR}/cmake_install.cmake" + ) + + message(STATUS "Locating install files...") + foreach (install_file IN LISTS install_files) + message(STATUS " * ${install_file}") + endforeach() + + message(STATUS "Building install tree...") + foreach(install_file IN LISTS install_files) + include("${install_file}") + endforeach() +endfunction() + +function(do_cleanup) + message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}") + file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}") +endfunction() + +function(assert_boolean var_name expect) + if (expect) + if (NOT ${var_name}) + message(FATAL_ERROR "'${var_name}' is false, expected true.") + endif() + else() + if (${var_name}) + message(FATAL_ERROR "'${var_name}' is true, expected false.") + endif() + endif() +endfunction() + +function(assert_target target_name) + if (NOT TARGET "${target_name}") + message(FATAL_ERROR "Target '${target_name}' not defined.") + endif() +endfunction() + +function(find_installed_project) + set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}") + find_package(Thrust CONFIG COMPONENTS CPP CUDA) + + if (NOT Thrust_FOUND) + message(FATAL_ERROR + "find_package(Thrust) failed. 
" + "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" + ) + endif() + + # Test some internal config vars to check that this is the expected install: + # TODO The cmake_path (3.19) command will provide more robust ways to do this + + # Escape regex special characters in the install prefix, see + # https://gitlab.kitware.com/cmake/cmake/-/issues/18580 + string(REGEX REPLACE "([][+.*()^])" "\\\\\\1" + prefix_regex + "${CMAKE_INSTALL_PREFIX}" + ) + if (NOT _THRUST_INCLUDE_DIR MATCHES "^${prefix_regex}") + message(FATAL_ERROR + "Found Thrust in unexpected location: " + " * _THRUST_INCLUDE_DIR=${_THRUST_INCLUDE_DIR} " + " * ExpectedPrefix=${CMAKE_INSTALL_DIR}" + ) + endif() + if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}") + message(FATAL_ERROR + "Found CUB in unexpected location: " + " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} " + " * ExpectedPrefix=${CMAKE_INSTALL_DIR}" + ) + endif() + + thrust_create_target(Thrust) + assert_target(Thrust) + assert_target(CUB::CUB) + assert_target(Thrust::CPP::Host) + assert_target(Thrust::CUDA::Device) + + thrust_update_system_found_flags() + assert_boolean(THRUST_CPP_FOUND TRUE) + assert_boolean(THRUST_CUDA_FOUND TRUE) + assert_boolean(THRUST_OMP_FOUND FALSE) + assert_boolean(THRUST_TBB_FOUND FALSE) + +endfunction() + +do_cleanup() # Prepare for new installation +do_manual_install() +find_installed_project() +do_cleanup() # Clean up if successful diff --git a/testing/complex.cu b/testing/complex.cu index e69f2e7cd..cf980962a 100644 --- a/testing/complex.cu +++ b/testing/complex.cu @@ -1,6 +1,8 @@ #include #include +#include + #include #include #include @@ -273,7 +275,7 @@ struct TestComplexTrigonometricFunctions ASSERT_ALMOST_EQUAL(sinh(a),sinh(c)); ASSERT_ALMOST_EQUAL(tanh(a),tanh(c)); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 ASSERT_ALMOST_EQUAL(acos(a),acos(c)); ASSERT_ALMOST_EQUAL(asin(a),asin(c)); diff --git a/testing/complex_transform.cu b/testing/complex_transform.cu index c4496aad6..439597a0d 100644 --- 
a/testing/complex_transform.cu +++ b/testing/complex_transform.cu @@ -235,15 +235,6 @@ struct TestComplexArithmeticTransform thrust::device_vector d_p1 = h_p1; thrust::device_vector d_p2 = h_p2; thrust::device_vector d_result(n); -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), basic_arithmetic_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), basic_arithmetic_functor()); @@ -264,16 +255,6 @@ struct TestComplexPlaneTransform thrust::device_vector d_p1 = h_p1; thrust::device_vector d_result(n); -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif - thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), complex_plane_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), complex_plane_functor()); ASSERT_ALMOST_EQUAL(h_result, d_result); @@ -296,16 +277,6 @@ struct TestComplexPowerTransform thrust::device_vector d_p2 = h_p2; thrust::device_vector d_result(n); -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif - thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), pow_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), pow_functor()); // pow can be very innacurate there's no point trying to check for equality @@ 
-331,16 +302,6 @@ struct TestComplexExponentialTransform thrust::device_vector d_p1 = h_p1; thrust::device_vector d_result(n); -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif - thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), exp_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), exp_functor()); ASSERT_ALMOST_EQUAL(h_result, d_result); @@ -368,15 +329,6 @@ struct TestComplexTrigonometricTransform thrust::device_vector d_p1 = h_p1; thrust::device_vector d_result(n); -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sin_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sin_functor()); @@ -404,7 +356,6 @@ struct TestComplexTrigonometricTransform ASSERT_ALMOST_EQUAL(h_result, d_result); - thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), asin_functor()); thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), asin_functor()); ASSERT_ALMOST_EQUAL(h_result, d_result); diff --git a/testing/constant_iterator.cu b/testing/constant_iterator.cu index cbf771c9a..e42cfea8d 100644 --- a/testing/constant_iterator.cu +++ b/testing/constant_iterator.cu @@ -109,11 +109,12 @@ void TestConstantIteratorCopy(void) { using namespace thrust; - typedef constant_iterator ConstIter; + using ValueType = typename Vector::value_type; + using ConstIter = constant_iterator; Vector result(4); - ConstIter first = make_constant_iterator(7); + ConstIter first = make_constant_iterator(7); ConstIter last = first + 
result.size(); thrust::copy(first, last, result.begin()); diff --git a/testing/copy.cu b/testing/copy.cu index 342788acf..661e379a2 100644 --- a/testing/copy.cu +++ b/testing/copy.cu @@ -1,14 +1,19 @@ #include #include +#include +#include #include #include +#include #include #include #include #include #include #include +#include +#include void TestCopyFromConstIterator(void) { @@ -336,9 +341,6 @@ void TestCopyIfSequence(const size_t n) thrust::host_vector h_data(n); thrust::sequence(h_data.begin(), h_data.end()); thrust::device_vector d_data(n); thrust::sequence(d_data.begin(), d_data.end()); - thrust::host_vector h_result(n); - thrust::device_vector d_result(n); - typename thrust::host_vector::iterator h_new_end; typename thrust::device_vector::iterator d_new_end; @@ -405,9 +407,6 @@ void TestCopyIfStencil(const size_t n) thrust::host_vector h_stencil = unittest::random_integers(n); thrust::device_vector d_stencil = unittest::random_integers(n); - thrust::host_vector h_result(n); - thrust::device_vector d_result(n); - typename thrust::host_vector::iterator h_new_end; typename thrust::device_vector::iterator d_new_end; @@ -427,6 +426,100 @@ void TestCopyIfStencil(const size_t n) } DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfStencil); +namespace +{ + +struct object_with_non_trivial_ctor +{ + // This struct will only properly assign if its `magic` member is + // set to this certain number. 
+ static constexpr int MAGIC = 923390; + + int field; + int magic; + + __host__ __device__ object_with_non_trivial_ctor() + { + magic = MAGIC; + field = 0; + } + __host__ __device__ object_with_non_trivial_ctor(int f) + { + magic = MAGIC; + field = f; + } + + object_with_non_trivial_ctor(const object_with_non_trivial_ctor& x) = default; + + // This non-trivial assignment requires that `this` points to initialized + // memory + __host__ __device__ object_with_non_trivial_ctor& + operator=(const object_with_non_trivial_ctor& x) + { + // To really copy over x's field value, require we have magic value set. + // If copy_if copies to uninitialized bits, the field will rarely be 923390. + if (magic == MAGIC) + { + field = x.field; + } + return *this; + } +}; + +struct always_true +{ + __host__ __device__ + bool operator()(const object_with_non_trivial_ctor&) + { + return true; + }; +}; + +} // end anon namespace + +void TestCopyIfNonTrivial() +{ + // Attempting to copy an object_with_non_trivial_ctor into uninitialized + // memory will fail: + { + static constexpr size_t BufferAlign = alignof(object_with_non_trivial_ctor); + static constexpr size_t BufferSize = sizeof(object_with_non_trivial_ctor); + alignas(BufferAlign) std::array buffer; + + // Fill buffer with 0s to prevent warnings about uninitialized reads while + // ensure that the 'magic number' mechanism works as intended: + std::fill(buffer.begin(), buffer.end(), 0); + + object_with_non_trivial_ctor initialized; + object_with_non_trivial_ctor *uninitialized = + reinterpret_cast(buffer.data()); + + object_with_non_trivial_ctor source(42); + initialized = source; + *uninitialized = source; + + ASSERT_EQUAL(42, initialized.field); + ASSERT_NOT_EQUAL(42, uninitialized->field); + } + + // This test ensures that we use placement new instead of assigning + // to uninitialized memory. See Thrust Github issue #1153. 
+ thrust::device_vector a(10, object_with_non_trivial_ctor(99)); + thrust::device_vector b(10); + + thrust::copy_if(a.begin(), a.end(), b.begin(), always_true()); + + for (int i = 0; i < 10; i++) + { + object_with_non_trivial_ctor ha(a[i]); + object_with_non_trivial_ctor hb(b[i]); + int ia = ha.field; + int ib = hb.field; + + ASSERT_EQUAL(ia, ib); + } +} +DECLARE_UNITTEST(TestCopyIfNonTrivial); template void TestCopyCountingIterator(void) @@ -617,3 +710,72 @@ void TestCopyIfStencilDispatchImplicit() } DECLARE_UNITTEST(TestCopyIfStencilDispatchImplicit); +struct only_set_when_expected_it +{ + long long expected; + bool * flag; + + __host__ __device__ only_set_when_expected_it operator++() const { return *this; } + __host__ __device__ only_set_when_expected_it operator*() const { return *this; } + template + __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; } + template + __host__ __device__ only_set_when_expected_it operator+=(Difference) const { return *this; } + template + __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; } + + __device__ + void operator=(long long value) const + { + if (value == expected) + { + *flag = true; + } + } +}; + +THRUST_NAMESPACE_BEGIN +namespace detail +{ +// We need this type to pass as a non-const ref for unary_transform_functor +// to compile: +template <> +struct is_non_const_reference : thrust::true_type {}; +} // end namespace detail + +template<> +struct iterator_traits +{ + typedef long long value_type; + typedef only_set_when_expected_it reference; + typedef thrust::random_access_device_iterator_tag iterator_category; +}; +THRUST_NAMESPACE_END + +void TestCopyWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(0); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr has_executed = thrust::device_malloc(1); + *has_executed = false; + + 
only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) }; + + thrust::copy(thrust::device, begin, end, out); + + bool has_executed_h = *has_executed; + thrust::device_free(has_executed); + + ASSERT_EQUAL(has_executed_h, true); +} + +void TestCopyWithBigIndexes() +{ + TestCopyWithBigIndexesHelper(30); + TestCopyWithBigIndexesHelper(31); + TestCopyWithBigIndexesHelper(32); + TestCopyWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestCopyWithBigIndexes); diff --git a/testing/count.cu b/testing/count.cu index 10c951c47..a6021da79 100644 --- a/testing/count.cu +++ b/testing/count.cu @@ -116,3 +116,22 @@ void TestCountDispatchImplicit() } DECLARE_UNITTEST(TestCountDispatchImplicit); +void TestCountWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + long long result = thrust::count(thrust::device, begin, end, (1ll << magnitude) - 17); + + ASSERT_EQUAL(result, 1); +} + +void TestCountWithBigIndexes() +{ + TestCountWithBigIndexesHelper(30); + TestCountWithBigIndexesHelper(31); + TestCountWithBigIndexesHelper(32); + TestCountWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestCountWithBigIndexes); diff --git a/testing/counting_iterator.cu b/testing/counting_iterator.cu index eede510fc..ebefe4d64 100644 --- a/testing/counting_iterator.cu +++ b/testing/counting_iterator.cu @@ -8,6 +8,14 @@ THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN +template +void TestCountingDefaultConstructor(void) +{ + thrust::counting_iterator iter0; + ASSERT_EQUAL(*iter0, T{}); +} +DECLARE_GENERIC_UNITTEST(TestCountingDefaultConstructor); + void TestCountingIteratorCopyConstructor(void) { thrust::counting_iterator iter0(100); diff --git a/testing/cpp/CMakeLists.txt b/testing/cpp/CMakeLists.txt new file mode 100644 index 000000000..215b81ee4 --- /dev/null +++ b/testing/cpp/CMakeLists.txt @@ -0,0 +1,18 
@@ +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (NOT config_device STREQUAL "CPP") + continue() + endif() + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name "cpp.") + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + endforeach() +endforeach() diff --git a/testing/cpp/adjacent_difference.cu b/testing/cpp/adjacent_difference.cu new file mode 100644 index 000000000..584899bec --- /dev/null +++ b/testing/cpp/adjacent_difference.cu @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + +struct detect_wrong_difference +{ + bool * flag; + + __host__ __device__ detect_wrong_difference operator++() const { return *this; } + __host__ __device__ detect_wrong_difference operator*() const { return *this; } + template + __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; } + template + __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; } + + __device__ + void operator=(long long difference) const + { + if (difference != 1) + { + *flag = false; + } + } +}; + +void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr all_differences_correct = thrust::device_malloc(1); + *all_differences_correct = true; + + detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) }; + + thrust::adjacent_difference(thrust::device, begin, end, out); + + bool all_differences_correct_h = *all_differences_correct; + thrust::device_free(all_differences_correct); + + ASSERT_EQUAL(all_differences_correct_h, true); +} + +void 
TestAdjacentDifferenceWithBigIndexes() +{ + TestAdjacentDifferenceWithBigIndexesHelper(30); + TestAdjacentDifferenceWithBigIndexesHelper(31); + TestAdjacentDifferenceWithBigIndexesHelper(32); + TestAdjacentDifferenceWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes); diff --git a/testing/cuda/CMakeLists.txt b/testing/cuda/CMakeLists.txt new file mode 100644 index 000000000..8fe4a4be7 --- /dev/null +++ b/testing/cuda/CMakeLists.txt @@ -0,0 +1,35 @@ +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +# These tests always build with RDC, so make sure that the sm_XX flags are +# compatible. See note in ThrustCudaConfig.cmake. +# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback +# tests to build for non-rdc arches. But for now, all files in a given directory +# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around +# how CUDA_FLAGS works. +set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}") + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (NOT config_device STREQUAL "CUDA") + continue() + endif() + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name "cuda.") + + # Create two targets, one with RDC enabled, the other without. This tests + # both device-side behaviors -- the CDP kernel launch with RDC, and the + # serial fallback path without RDC. 
+ thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target}) + + if (THRUST_ENABLE_TESTS_WITH_RDC) + thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target}) + thrust_enable_rdc_for_cuda_target(${cdp_test_target}) + endif() + endforeach() +endforeach() diff --git a/testing/cuda/adjacent_difference.cu b/testing/cuda/adjacent_difference.cu index 1e0b5a784..9b101ea2e 100644 --- a/testing/cuda/adjacent_difference.cu +++ b/testing/cuda/adjacent_difference.cu @@ -1,8 +1,11 @@ #include #include #include +#include +#include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void adjacent_difference_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) { @@ -22,28 +25,28 @@ void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n) { thrust::host_vector h_input = unittest::random_samples(n); thrust::device_vector d_input = h_input; - + thrust::host_vector h_output(n); thrust::device_vector d_output(n); - + thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin()); adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin()); { cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); } - + ASSERT_EQUAL(h_output, d_output); - + thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus()); adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), thrust::plus()); { cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); } - + ASSERT_EQUAL(h_output, d_output); - + // in-place operation thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus()); adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_input.begin(), thrust::plus()); @@ -51,7 +54,7 @@ void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n) cudaError_t const err = cudaDeviceSynchronize(); 
ASSERT_EQUAL(cudaSuccess, err); } - + ASSERT_EQUAL(h_input, h_output); //computed previously ASSERT_EQUAL(d_input, d_output); //computed previously } @@ -71,21 +74,22 @@ void TestAdjacentDifferenceDeviceDevice(const size_t n) TestAdjacentDifferenceDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDeviceDevice); +#endif void TestAdjacentDifferenceCudaStreams() { cudaStream_t s; cudaStreamCreate(&s); - + thrust::device_vector input(3); thrust::device_vector output(3); input[0] = 1; input[1] = 4; input[2] = 6; - + thrust::adjacent_difference(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin()); cudaStreamSynchronize(s); - + ASSERT_EQUAL(output[0], 1); ASSERT_EQUAL(output[1], 3); ASSERT_EQUAL(output[2], 2); @@ -94,3 +98,57 @@ void TestAdjacentDifferenceCudaStreams() } DECLARE_UNITTEST(TestAdjacentDifferenceCudaStreams); +struct detect_wrong_difference +{ + using difference_type = void; + using value_type = void; + using pointer = void; + using reference = void; + using iterator_category = std::output_iterator_tag; + + bool * flag; + + __host__ __device__ detect_wrong_difference operator++() const { return *this; } + __host__ __device__ detect_wrong_difference operator*() const { return *this; } + template + __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; } + template + __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; } + + __device__ + void operator=(long long difference) const + { + if (difference != 1) + { + *flag = false; + } + } +}; + +void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr all_differences_correct = thrust::device_malloc(1); + *all_differences_correct = true; + + detect_wrong_difference out = { 
thrust::raw_pointer_cast(all_differences_correct) }; + + thrust::adjacent_difference(thrust::device, begin, end, out); + + bool all_differences_correct_h = *all_differences_correct; + thrust::device_free(all_differences_correct); + + ASSERT_EQUAL(all_differences_correct_h, true); +} + +void TestAdjacentDifferenceWithBigIndexes() +{ + TestAdjacentDifferenceWithBigIndexesHelper(30); + TestAdjacentDifferenceWithBigIndexesHelper(31); + TestAdjacentDifferenceWithBigIndexesHelper(32); + TestAdjacentDifferenceWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes); diff --git a/testing/cuda/adjacent_difference.mk b/testing/cuda/adjacent_difference.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/adjacent_difference.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/binary_search.cu b/testing/cuda/binary_search.cu new file mode 100644 index 000000000..58a83f61c --- /dev/null +++ b/testing/cuda/binary_search.cu @@ -0,0 +1,25 @@ +#include + +#include +#include +#include +#include +#include + +void TestEqualRangeOnStream() +{ // Regression test for GH issue #921 (nvbug 2173437) + typedef typename thrust::device_vector vector_t; + typedef typename vector_t::iterator iterator_t; + typedef thrust::pair result_t; + + vector_t input(10); + thrust::sequence(thrust::device, input.begin(), input.end(), 0); + cudaStream_t stream = 0; + result_t result = thrust::equal_range(thrust::cuda::par.on(stream), + input.begin(), input.end(), + 5); + + ASSERT_EQUAL(5, thrust::distance(input.begin(), result.first)); + ASSERT_EQUAL(6, thrust::distance(input.begin(), result.second)); +} +DECLARE_UNITTEST(TestEqualRangeOnStream); diff --git a/testing/cuda/binary_search.mk b/testing/cuda/binary_search.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/binary_search.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/complex.mk b/testing/cuda/complex.mk new file mode 
100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/complex.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/copy.cu b/testing/cuda/copy.cu index 1ad6e2626..6fe91853d 100644 --- a/testing/cuda/copy.cu +++ b/testing/cuda/copy.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -89,4 +90,5 @@ void TestCopyNDeviceDevice(size_t n) TestCopyNDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestCopyNDeviceDevice); +#endif diff --git a/testing/cuda/copy.mk b/testing/cuda/copy.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/copy.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/copy_if.cu b/testing/cuda/copy_if.cu index dcec12fde..bb879b671 100644 --- a/testing/cuda/copy_if.cu +++ b/testing/cuda/copy_if.cu @@ -3,7 +3,6 @@ #include #include - template struct is_even { @@ -20,6 +19,7 @@ struct mod_3 }; +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Predicate pred, Iterator3 result2) { @@ -95,7 +95,15 @@ void TestCopyIfDeviceDevice() DECLARE_UNITTEST(TestCopyIfDeviceDevice); -void TestCopyIfCudaStreams() +void TestCopyIfDeviceNoSync() +{ + TestCopyIfDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfDeviceNoSync); +#endif + +template +void TestCopyIfCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -111,7 +119,7 @@ void TestCopyIfCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), result.begin(), @@ -124,9 +132,19 @@ void TestCopyIfCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestCopyIfCudaStreams); +void TestCopyIfCudaStreamsSync(){ + 
TestCopyIfCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsSync); +void TestCopyIfCudaStreamsNoSync(){ + TestCopyIfCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsNoSync); + + +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result1, Predicate pred, Iterator4 result2) { @@ -144,9 +162,6 @@ void TestCopyIfStencilDevice(ExecutionPolicy exec) thrust::host_vector h_stencil = unittest::random_integers(n); thrust::device_vector d_stencil = unittest::random_integers(n); - thrust::host_vector h_result(n); - thrust::device_vector d_result(n); - typename thrust::host_vector::iterator h_new_end; typename thrust::device_vector::iterator d_new_end; @@ -208,7 +223,16 @@ void TestCopyIfStencilDeviceDevice() DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice); -void TestCopyIfStencilCudaStreams() +void TestCopyIfStencilDeviceNoSync() +{ + TestCopyIfStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilDeviceNoSync); +#endif + + +template +void TestCopyIfStencilCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -232,7 +256,7 @@ void TestCopyIfStencilCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), stencil.begin(), @@ -246,5 +270,17 @@ void TestCopyIfStencilCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestCopyIfStencilCudaStreams); + +void TestCopyIfStencilCudaStreamsSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsSync); + + +void TestCopyIfStencilCudaStreamsNoSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsNoSync); diff --git 
a/testing/cuda/copy_if.mk b/testing/cuda/copy_if.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/copy_if.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/count.cu b/testing/cuda/count.cu index 32835f5c4..e2b9b5f5a 100644 --- a/testing/cuda/count.cu +++ b/testing/cuda/count.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void count_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result) @@ -91,6 +92,7 @@ void TestCountIfDeviceDevice(const size_t n) TestCountIfDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestCountIfDeviceDevice); +#endif void TestCountCudaStreams() diff --git a/testing/cuda/count.mk b/testing/cuda/count.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/count.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/cudart.mk b/testing/cuda/cudart.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/cudart.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/device_side_universal_vector.cu b/testing/cuda/device_side_universal_vector.cu new file mode 100644 index 000000000..a31919cfc --- /dev/null +++ b/testing/cuda/device_side_universal_vector.cu @@ -0,0 +1,84 @@ +#include + +#include + +template +__host__ __device__ void universal_vector_access(VecT &in, thrust::universal_vector &out) +{ + const int expected_front = 4; + const int expected_back = 2; + + out[0] = in.size() == 2 && // + in[0] == expected_front && // + in.front() == expected_front && // + *in.data() == expected_front && // + in[1] == expected_back && // + in.back() == expected_back; +} + +#if defined(THRUST_TEST_DEVICE_SIDE) +template +__global__ void universal_vector_device_access_kernel(VecT &vec, + thrust::universal_vector &out) +{ + universal_vector_access(vec, out); +} + +template +void test_universal_vector_access(VecT &vec, thrust::universal_vector &out) +{ + 
universal_vector_device_access_kernel<<<1, 1>>>(vec, out); + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + ASSERT_EQUAL(out[0], true); +} +#else +template +void test_universal_vector_access(VecT &vec, thrust::universal_vector &out) +{ + universal_vector_access(vec, out); + ASSERT_EQUAL(out[0], true); +} +#endif + +void TestUniversalVectorDeviceAccess() +{ + thrust::universal_vector> in_storage(1); + thrust::universal_vector &in = *thrust::raw_pointer_cast(in_storage.data()); + + in.resize(2); + in[0] = 4; + in[1] = 2; + + thrust::universal_vector> out_storage(1); + thrust::universal_vector &out = *thrust::raw_pointer_cast(out_storage.data()); + out.resize(1); + out[0] = false; + + test_universal_vector_access(in, out); +} +DECLARE_UNITTEST(TestUniversalVectorDeviceAccess); + +void TestConstUniversalVectorDeviceAccess() +{ + thrust::universal_vector> in_storage(1); + + { + thrust::universal_vector &in = *thrust::raw_pointer_cast(in_storage.data()); + + in.resize(2); + in[0] = 4; + in[1] = 2; + } + + const thrust::universal_vector &in = *thrust::raw_pointer_cast(in_storage.data()); + + thrust::universal_vector> out_storage(1); + thrust::universal_vector &out = *thrust::raw_pointer_cast(out_storage.data()); + + out.resize(1); + out[0] = false; + + test_universal_vector_access(in, out); +} +DECLARE_UNITTEST(TestConstUniversalVectorDeviceAccess); diff --git a/testing/cuda/equal.cu b/testing/cuda/equal.cu index 84eb7254d..c5e794ed5 100644 --- a/testing/cuda/equal.cu +++ b/testing/cuda/equal.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void equal_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result) @@ -92,6 +93,7 @@ void TestEqualDeviceDevice(const size_t n) TestEqualDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestEqualDeviceDevice); +#endif void TestEqualCudaStreams() diff --git a/testing/cuda/equal.mk b/testing/cuda/equal.mk new file mode 
100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/equal.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/fill.cu b/testing/cuda/fill.cu index 17cf58c54..ee0a51776 100644 --- a/testing/cuda/fill.cu +++ b/testing/cuda/fill.cu @@ -3,6 +3,7 @@ #include #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value) @@ -169,6 +170,7 @@ void TestFillNDeviceDevice(size_t n) TestFillNDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestFillNDeviceDevice); +#endif void TestFillCudaStreams() { diff --git a/testing/cuda/fill.mk b/testing/cuda/fill.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/fill.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/find.cu b/testing/cuda/find.cu index 4fe6f4dca..fbd86f5a0 100644 --- a/testing/cuda/find.cu +++ b/testing/cuda/find.cu @@ -39,6 +39,7 @@ struct less_than_value_pred }; +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void find_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result) { @@ -219,6 +220,7 @@ void TestFindIfNotDeviceDevice() TestFindIfNotDevice(thrust::device); }; DECLARE_UNITTEST(TestFindIfNotDeviceDevice); +#endif void TestFindCudaStreams() diff --git a/testing/cuda/find.mk b/testing/cuda/find.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/find.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/for_each.cu b/testing/cuda/for_each.cu index be6a7738c..afd54c621 100644 --- a/testing/cuda/for_each.cu +++ b/testing/cuda/for_each.cu @@ -59,6 +59,7 @@ struct mark_present_for_each }; +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f) { @@ -202,6 +203,7 @@ void TestForEachNDeviceDevice(const size_t n) ASSERT_EQUAL(h_output, d_output); } 
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice); +#endif void TestForEachCudaStreams() diff --git a/testing/cuda/for_each.mk b/testing/cuda/for_each.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/for_each.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/gather.cu b/testing/cuda/gather.cu index a9a8c9333..6af4d4727 100644 --- a/testing/cuda/gather.cu +++ b/testing/cuda/gather.cu @@ -3,6 +3,7 @@ #include #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void gather_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 elements_first, Iterator3 result) @@ -56,6 +57,7 @@ void TestGatherDeviceDevice(const size_t n) TestGatherDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestGatherDeviceDevice); +#endif void TestGatherCudaStreams() @@ -85,6 +87,7 @@ void TestGatherCudaStreams() DECLARE_UNITTEST(TestGatherCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void gather_if_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 stencil_first, Iterator3 elements_first, Iterator4 result, Predicate pred) @@ -157,6 +160,7 @@ void TestGatherIfDeviceDevice(const size_t n) TestGatherIfDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestGatherIfDeviceDevice); +#endif void TestGatherIfCudaStreams(void) { diff --git a/testing/cuda/gather.mk b/testing/cuda/gather.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/gather.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/generate.cu b/testing/cuda/generate.cu index c495e5563..407da920c 100644 --- a/testing/cuda/generate.cu +++ b/testing/cuda/generate.cu @@ -3,14 +3,6 @@ #include -template -__global__ -void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f) -{ - thrust::generate(exec, first, last, f); -} - - template struct return_value { @@ -24,6 +16,15 @@ struct return_value }; +#ifdef 
THRUST_TEST_DEVICE_SIDE +template +__global__ +void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f) +{ + thrust::generate(exec, first, last, f); +} + + template void TestGenerateDevice(ExecutionPolicy exec, const size_t n) { @@ -59,6 +60,7 @@ void TestGenerateDeviceDevice(const size_t n) TestGenerateDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestGenerateDeviceDevice); +#endif void TestGenerateCudaStreams() @@ -86,6 +88,7 @@ void TestGenerateCudaStreams() DECLARE_UNITTEST(TestGenerateCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void generate_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f) @@ -129,6 +132,7 @@ void TestGenerateNDeviceDevice(const size_t n) TestGenerateNDevice(thrust::device, n); } DECLARE_VARIABLE_UNITTEST(TestGenerateNDeviceDevice); +#endif void TestGenerateNCudaStreams() diff --git a/testing/cuda/generate.mk b/testing/cuda/generate.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/generate.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/inner_product.cu b/testing/cuda/inner_product.cu index 3dbb1150c..0c2276942 100644 --- a/testing/cuda/inner_product.cu +++ b/testing/cuda/inner_product.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void inner_product_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, T init, Iterator3 result) @@ -50,6 +51,7 @@ void TestInnerProductDeviceDevice() TestInnerProductDevice(thrust::device); }; DECLARE_UNITTEST(TestInnerProductDeviceDevice); +#endif void TestInnerProductCudaStreams() diff --git a/testing/cuda/inner_product.mk b/testing/cuda/inner_product.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/inner_product.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/is_partitioned.cu b/testing/cuda/is_partitioned.cu index 70379793b..468e17746 100644 --- 
a/testing/cuda/is_partitioned.cu +++ b/testing/cuda/is_partitioned.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void is_partitioned_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result) @@ -66,6 +67,7 @@ void TestIsPartitionedDeviceDevice() TestIsPartitionedDevice(thrust::device); } DECLARE_UNITTEST(TestIsPartitionedDeviceDevice); +#endif void TestIsPartitionedCudaStreams() diff --git a/testing/cuda/is_partitioned.mk b/testing/cuda/is_partitioned.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/is_partitioned.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/is_sorted.cu b/testing/cuda/is_sorted.cu index c6e11f6fc..1e9ef16ae 100644 --- a/testing/cuda/is_sorted.cu +++ b/testing/cuda/is_sorted.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void is_sorted_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result) @@ -55,6 +56,7 @@ void TestIsSortedDeviceDevice() TestIsSortedDevice(thrust::device); } DECLARE_UNITTEST(TestIsSortedDeviceDevice); +#endif void TestIsSortedCudaStreams() diff --git a/testing/cuda/is_sorted.mk b/testing/cuda/is_sorted.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/is_sorted.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/is_sorted_until.cu b/testing/cuda/is_sorted_until.cu index d84f09fca..9e6d5ac76 100644 --- a/testing/cuda/is_sorted_until.cu +++ b/testing/cuda/is_sorted_until.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void is_sorted_until_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -57,6 +58,7 @@ void TestIsSortedUntilDeviceDevice() TestIsSortedUntilDevice(thrust::device); } DECLARE_UNITTEST(TestIsSortedUntilDeviceDevice); +#endif void TestIsSortedUntilCudaStreams() diff --git a/testing/cuda/is_sorted_until.mk 
b/testing/cuda/is_sorted_until.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/is_sorted_until.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/logical.cu b/testing/cuda/logical.cu index 61e7dc49a..a08f041b7 100644 --- a/testing/cuda/logical.cu +++ b/testing/cuda/logical.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void all_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result) @@ -83,6 +84,7 @@ void TestAllOfDeviceDevice() TestAllOfDevice(thrust::device); } DECLARE_UNITTEST(TestAllOfDeviceDevice); +#endif void TestAllOfCudaStreams() @@ -111,6 +113,7 @@ void TestAllOfCudaStreams() DECLARE_UNITTEST(TestAllOfCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void any_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result) @@ -191,6 +194,7 @@ void TestAnyOfDeviceDevice() TestAnyOfDevice(thrust::device); } DECLARE_UNITTEST(TestAnyOfDeviceDevice); +#endif void TestAnyOfCudaStreams() @@ -219,6 +223,7 @@ void TestAnyOfCudaStreams() DECLARE_UNITTEST(TestAnyOfCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void none_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result) @@ -299,6 +304,7 @@ void TestNoneOfDeviceDevice() TestNoneOfDevice(thrust::device); } DECLARE_UNITTEST(TestNoneOfDeviceDevice); +#endif void TestNoneOfCudaStreams() diff --git a/testing/cuda/logical.mk b/testing/cuda/logical.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/logical.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/managed_memory_pointer.mk b/testing/cuda/managed_memory_pointer.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/managed_memory_pointer.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/max_element.cu b/testing/cuda/max_element.cu index 
a18d9656a..defc314d1 100644 --- a/testing/cuda/max_element.cu +++ b/testing/cuda/max_element.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void max_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result) @@ -67,7 +68,16 @@ void TestMaxElementDeviceDevice() DECLARE_UNITTEST(TestMaxElementDeviceDevice); -void TestMaxElementCudaStreams() +void TestMaxElementDeviceNoSync() +{ + TestMaxElementDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestMaxElementDeviceNoSync); +#endif + + +template +void TestMaxElementCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -83,15 +93,28 @@ void TestMaxElementCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1); + auto streampolicy = policy.on(s); + + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end()), 5); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end()) - data.begin(), 1); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()), 1); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()) - data.begin(), 2); + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()), 1); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()) - data.begin(), 2); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestMaxElementCudaStreams); + +void TestMaxElementCudaStreamsSync(){ + TestMaxElementCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestMaxElementCudaStreamsSync); + + +void TestMaxElementCudaStreamsNoSync(){ + TestMaxElementCudaStreams(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestMaxElementCudaStreamsNoSync); + void TestMaxElementDevicePointer() { diff --git a/testing/cuda/max_element.mk b/testing/cuda/max_element.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/max_element.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/memory.cu b/testing/cuda/memory.cu index ed9acec55..eda432ca8 100644 --- a/testing/cuda/memory.cu +++ b/testing/cuda/memory.cu @@ -35,6 +35,7 @@ void TestSelectSystemCudaToCpp() DECLARE_UNITTEST(TestSelectSystemCudaToCpp); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void get_temporary_buffer_kernel(size_t n, Iterator result) { @@ -43,9 +44,9 @@ __global__ void get_temporary_buffer_kernel(size_t n, Iterator result) template -__global__ void return_temporary_buffer_kernel(Pointer ptr) +__global__ void return_temporary_buffer_kernel(Pointer ptr, std::ptrdiff_t n) { - thrust::return_temporary_buffer(thrust::seq, ptr); + thrust::return_temporary_buffer(thrust::seq, ptr, n); } @@ -58,8 +59,10 @@ void TestGetTemporaryBufferDeviceSeq() thrust::device_vector d_result(1); get_temporary_buffer_kernel<<<1,1>>>(n, d_result.begin()); - cudaError_t const err = cudaDeviceSynchronize(); - ASSERT_EQUAL(cudaSuccess, err); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } ptr_and_sz_type ptr_and_sz = d_result[0]; @@ -74,9 +77,11 @@ void TestGetTemporaryBufferDeviceSeq() ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val)); - return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first); - cudaError_t const err = cudaDeviceSynchronize(); - ASSERT_EQUAL(cudaSuccess, err); + return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first, ptr_and_sz.second); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } } } DECLARE_UNITTEST(TestGetTemporaryBufferDeviceSeq); @@ -104,8 +109,10 @@ void TestMallocDeviceSeq() 
thrust::device_vector d_result(1); malloc_kernel<<<1,1>>>(n, d_result.begin()); - cudaError_t const err = cudaDeviceSynchronize(); - ASSERT_EQUAL(cudaSuccess, err); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } pointer ptr = d_result[0]; @@ -119,9 +126,12 @@ void TestMallocDeviceSeq() ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr, ptr + n, thrust::placeholders::_1 == ref_val)); free_kernel<<<1,1>>>(ptr); - cudaError_t const err = cudaDeviceSynchronize(); - ASSERT_EQUAL(cudaSuccess, err); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } } } DECLARE_UNITTEST(TestMallocDeviceSeq); +#endif diff --git a/testing/cuda/memory.mk b/testing/cuda/memory.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/memory.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/merge.cu b/testing/cuda/merge.cu index 5e13b9d3a..1a96e8774 100644 --- a/testing/cuda/merge.cu +++ b/testing/cuda/merge.cu @@ -6,6 +6,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void merge_kernel(ExecutionPolicy exec, @@ -80,6 +81,7 @@ void TestMergeDeviceDevice() TestMergeDevice(thrust::device); } DECLARE_UNITTEST(TestMergeDeviceDevice); +#endif void TestMergeCudaStreams() diff --git a/testing/cuda/merge.mk b/testing/cuda/merge.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/merge.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/merge_by_key.cu b/testing/cuda/merge_by_key.cu index 84b80e007..40ea542df 100644 --- a/testing/cuda/merge_by_key.cu +++ b/testing/cuda/merge_by_key.cu @@ -5,6 +5,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void min_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result) @@ -64,6 +65,7 @@ void TestMinElementDeviceDevice() TestMinElementDevice(thrust::device); } 
DECLARE_UNITTEST(TestMinElementDeviceDevice); +#endif void TestMinElementCudaStreams() diff --git a/testing/cuda/min_element.mk b/testing/cuda/min_element.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/min_element.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/minmax_element.cu b/testing/cuda/minmax_element.cu index e3cae07a2..6376bc28b 100644 --- a/testing/cuda/minmax_element.cu +++ b/testing/cuda/minmax_element.cu @@ -2,6 +2,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void minmax_element_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -85,6 +86,7 @@ void TestMinMaxElementDeviceDevice() TestMinMaxElementDevice(thrust::device); } DECLARE_UNITTEST(TestMinMaxElementDeviceDevice); +#endif void TestMinMaxElementCudaStreams() diff --git a/testing/cuda/minmax_element.mk b/testing/cuda/minmax_element.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/minmax_element.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/mismatch.cu b/testing/cuda/mismatch.cu index 5b08f4307..aac89352a 100644 --- a/testing/cuda/mismatch.cu +++ b/testing/cuda/mismatch.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void mismatch_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result) { @@ -72,6 +73,7 @@ void TestMismatchDeviceDevice() TestMismatchDevice(thrust::device); } DECLARE_UNITTEST(TestMismatchDeviceDevice); +#endif void TestMismatchCudaStreams() diff --git a/testing/cuda/mismatch.mk b/testing/cuda/mismatch.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/mismatch.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/pair_sort.cu b/testing/cuda/pair_sort.cu index 87838e429..da23e4cb2 100644 --- a/testing/cuda/pair_sort.cu +++ b/testing/cuda/pair_sort.cu @@ -4,16 +4,12 @@ #include -template +#ifdef 
THRUST_TEST_DEVICE_SIDE +template __global__ -void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported) +void stable_sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last) { -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; thrust::stable_sort(exec, first, last); -#else - *is_supported = false; -#endif } @@ -43,19 +39,14 @@ void TestPairStableSortDevice(ExecutionPolicy exec) thrust::device_vector

d_pairs = h_pairs; - thrust::device_vector is_supported(1); - - stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin()); + stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end()); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - // sort on the host - thrust::stable_sort(h_pairs.begin(), h_pairs.end()); + // sort on the host + thrust::stable_sort(h_pairs.begin(), h_pairs.end()); - ASSERT_EQUAL_QUIET(h_pairs, d_pairs); - } + ASSERT_EQUAL_QUIET(h_pairs, d_pairs); }; @@ -71,4 +62,5 @@ void TestPairStableSortDeviceDevice() TestPairStableSortDevice(thrust::device); } DECLARE_UNITTEST(TestPairStableSortDeviceDevice); +#endif diff --git a/testing/cuda/pair_sort.mk b/testing/cuda/pair_sort.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/pair_sort.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/pair_sort_by_key.cu b/testing/cuda/pair_sort_by_key.cu index 19996e5a2..fa229b8a6 100644 --- a/testing/cuda/pair_sort_by_key.cu +++ b/testing/cuda/pair_sort_by_key.cu @@ -6,16 +6,12 @@ #include -template +#ifdef THRUST_TEST_DEVICE_SIDE +template __global__ -void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported) +void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first) { -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first); -#else - *is_supported = false; -#endif } @@ -51,21 +47,16 @@ void TestPairStableSortByKeyDevice(ExecutionPolicy exec) thrust::device_vector

d_pairs = h_pairs; thrust::device_vector d_values = h_values; - thrust::device_vector is_supported(1); - // sort on the device - stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin()); + stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin()); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - // sort on the host - thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin()); + // sort on the host + thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin()); - ASSERT_EQUAL_QUIET(h_pairs, d_pairs); - ASSERT_EQUAL(h_values, d_values); - } + ASSERT_EQUAL_QUIET(h_pairs, d_pairs); + ASSERT_EQUAL(h_values, d_values); }; @@ -81,4 +72,5 @@ void TestPairStableSortByKeyDeviceDevice() TestPairStableSortByKeyDevice(thrust::device); } DECLARE_UNITTEST(TestPairStableSortByKeyDeviceDevice); +#endif diff --git a/testing/cuda/pair_sort_by_key.mk b/testing/cuda/pair_sort_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/pair_sort_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/partition.cu b/testing/cuda/partition.cu index a70ac0732..f8701db6f 100644 --- a/testing/cuda/partition.cu +++ b/testing/cuda/partition.cu @@ -4,14 +4,6 @@ #include -template -__global__ -void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result) -{ - *result = thrust::partition(exec, first, last, pred); -} - - template struct is_even { @@ -20,6 +12,15 @@ struct is_even }; +#ifdef THRUST_TEST_DEVICE_SIDE +template +__global__ +void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result) +{ + *result = thrust::partition(exec, first, last, pred); +} + + template void TestPartitionDevice(ExecutionPolicy exec) { @@ -65,6 +66,13 @@ void TestPartitionDeviceDevice() 
DECLARE_UNITTEST(TestPartitionDeviceDevice); +void TestPartitionDeviceNoSync() +{ + TestPartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionDeviceNoSync); + + template __global__ void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result) @@ -125,6 +133,13 @@ void TestPartitionStencilDeviceDevice() DECLARE_UNITTEST(TestPartitionStencilDeviceDevice); +void TestPartitionStencilDeviceNoSync() +{ + TestPartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionStencilDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -188,6 +203,13 @@ void TestPartitionCopyDeviceDevice() DECLARE_UNITTEST(TestPartitionCopyDeviceDevice); +void TestPartitionCopyDeviceNoSync() +{ + TestPartitionCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCopyDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -258,16 +280,18 @@ void TestPartitionCopyStencilDeviceDevice() DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice); -template +void TestPartitionCopyStencilDeviceNoSync() +{ + TestPartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync); + + +template __global__ -void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported) +void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result) { -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; *result = thrust::stable_partition(exec, first, last, pred); -#else - *is_supported = false; 
-#endif } @@ -285,24 +309,20 @@ void TestStablePartitionDevice(ExecutionPolicy exec) data[4] = 2; thrust::device_vector result(1); - thrust::device_vector is_supported(1); - - stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even(), result.begin(), is_supported.begin()); + + stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even(), result.begin()); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - thrust::device_vector ref(5); - ref[0] = 2; - ref[1] = 2; - ref[2] = 1; - ref[3] = 1; - ref[4] = 1; + thrust::device_vector ref(5); + ref[0] = 2; + ref[1] = 2; + ref[2] = 1; + ref[3] = 1; + ref[4] = 1; - ASSERT_EQUAL(2, (iterator)result[0] - data.begin()); - ASSERT_EQUAL(ref, data); - } + ASSERT_EQUAL(2, (iterator)result[0] - data.begin()); + ASSERT_EQUAL(ref, data); } @@ -320,16 +340,18 @@ void TestStablePartitionDeviceDevice() DECLARE_UNITTEST(TestStablePartitionDeviceDevice); -template +void TestStablePartitionDeviceNoSync() +{ + TestStablePartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionDeviceNoSync); + + +template __global__ -void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported) +void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result) { -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; *result = thrust::stable_partition(exec, first, last, stencil_first, pred); -#else - *is_supported = false; -#endif } @@ -354,24 +376,20 @@ void TestStablePartitionStencilDevice(ExecutionPolicy exec) stencil[4] = 2; thrust::device_vector result(1); - thrust::device_vector is_supported(1); - - stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even(), result.begin(), is_supported.begin()); + + 
stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even(), result.begin()); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - thrust::device_vector ref(5); - ref[0] = 1; - ref[1] = 1; - ref[2] = 0; - ref[3] = 0; - ref[4] = 0; + thrust::device_vector ref(5); + ref[0] = 1; + ref[1] = 1; + ref[2] = 0; + ref[3] = 0; + ref[4] = 0; - ASSERT_EQUAL(2, (iterator)result[0] - data.begin()); - ASSERT_EQUAL(ref, data); - } + ASSERT_EQUAL(2, (iterator)result[0] - data.begin()); + ASSERT_EQUAL(ref, data); } @@ -389,6 +407,13 @@ void TestStablePartitionStencilDeviceDevice() DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice); +void TestStablePartitionStencilDeviceNoSync() +{ + TestStablePartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionStencilDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -452,6 +477,13 @@ void TestStablePartitionCopyDeviceDevice() DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice); +void TestStablePartitionCopyDeviceNoSync() +{ + TestStablePartitionCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -522,7 +554,16 @@ void TestStablePartitionCopyStencilDeviceDevice() DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice); -void TestPartitionCudaStreams() +void TestStablePartitionCopyStencilDeviceNoSync() +{ + TestStablePartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceNoSync); +#endif + + +template +void 
TestPartitionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -537,8 +578,10 @@ void TestPartitionCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even()); + Iterator iter = thrust::partition(streampolicy, data.begin(), data.end(), is_even()); Vector ref(5); ref[0] = 2; @@ -552,5 +595,17 @@ void TestPartitionCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestPartitionCudaStreams); + +void TestPartitionCudaStreamsSync() +{ + TestPartitionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsSync); + + +void TestPartitionCudaStreamsNoSync() +{ + TestPartitionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsNoSync); diff --git a/testing/cuda/partition.mk b/testing/cuda/partition.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/partition.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/partition_point.cu b/testing/cuda/partition_point.cu index 0b95fcb02..57e4344ee 100644 --- a/testing/cuda/partition_point.cu +++ b/testing/cuda/partition_point.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void partition_point_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result) @@ -50,6 +51,7 @@ void TestPartitionPointDeviceDevice() TestPartitionPointDevice(thrust::device); } DECLARE_UNITTEST(TestPartitionPointDeviceDevice); +#endif void TestPartitionPointCudaStreams() diff --git a/testing/cuda/partition_point.mk b/testing/cuda/partition_point.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/partition_point.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/pinned_allocator.cu b/testing/cuda/pinned_allocator.cu deleted file mode 100644 index 
23ccc7d40..000000000 --- a/testing/cuda/pinned_allocator.cu +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include -#include -#include - -template -void TestPinnedAllocatorSimple(const size_t n) -{ - typedef thrust::host_vector > Vector; - - Vector h_input = unittest::random_integers(n); - Vector h_output(n); - - thrust::copy(h_input.begin(), h_input.end(), h_output.begin()); - - ASSERT_EQUAL(h_input, h_output); -} -DECLARE_VARIABLE_UNITTEST(TestPinnedAllocatorSimple); - diff --git a/testing/cuda/reduce.cu b/testing/cuda/reduce.cu index 9cefcc0ed..865d31c22 100644 --- a/testing/cuda/reduce.cu +++ b/testing/cuda/reduce.cu @@ -1,6 +1,7 @@ #include #include #include +#include template @@ -11,6 +12,7 @@ void reduce_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init, } +#ifdef THRUST_TEST_DEVICE_SIDE template void TestReduceDevice(ExecutionPolicy exec, const size_t n) { @@ -53,7 +55,20 @@ struct TestReduceDeviceDevice VariableUnitTest TestReduceDeviceDeviceInstance; -void TestReduceCudaStreams() +template +struct TestReduceDeviceNoSync +{ + void operator()(const size_t n) + { + TestReduceDevice(thrust::cuda::par_nosync, n); + } +}; +VariableUnitTest TestReduceDeviceNoSyncInstance; +#endif + + +template +void TestReduceCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -63,13 +78,46 @@ void TestReduceCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + // no initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end()), 2); // with initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end(), 10), 12); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceCudaStreams); + +void TestReduceCudaStreamsSync() +{ + TestReduceCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceCudaStreamsSync); + + +void 
TestReduceCudaStreamsNoSync() +{ + TestReduceCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceCudaStreamsNoSync); + +#if defined(THRUST_RDC_ENABLED) +void TestReduceLargeInput() +{ + using T = unsigned long long; + using OffsetT = std::size_t; + const OffsetT num_items = 1ull << 32; + + thrust::constant_iterator d_data(T{1}); + thrust::device_vector d_result(1); + + reduce_kernel<<<1,1>>>(thrust::device, d_data, d_data + num_items, T{}, d_result.begin()); + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + + ASSERT_EQUAL(num_items, d_result[0]); +} +DECLARE_UNITTEST(TestReduceLargeInput); +#endif diff --git a/testing/cuda/reduce.mk b/testing/cuda/reduce.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/reduce.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/reduce_by_key.cu b/testing/cuda/reduce_by_key.cu index 993a39bd4..20f44fb42 100644 --- a/testing/cuda/reduce_by_key.cu +++ b/testing/cuda/reduce_by_key.cu @@ -1,8 +1,14 @@ -#include -#include +#include #include +#include +#include +#include +#include +#include + +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void reduce_by_key_kernel(ExecutionPolicy exec, @@ -43,6 +49,7 @@ void reduce_by_key_kernel(ExecutionPolicy exec, { *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, pred, binary_op); } +#endif template @@ -85,6 +92,7 @@ void initialize_values(Vector& values) } +#ifdef THRUST_TEST_DEVICE_SIDE template void TestReduceByKeyDevice(ExecutionPolicy exec) { @@ -191,7 +199,16 @@ void TestReduceByKeyDeviceDevice() DECLARE_UNITTEST(TestReduceByKeyDeviceDevice); -void TestReduceByKeyCudaStreams() +void TestReduceByKeyDeviceNoSync() +{ + TestReduceByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyDeviceNoSync); +#endif + + +template +void TestReduceByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; 
typedef Vector::value_type T; @@ -210,7 +227,9 @@ void TestReduceByKeyCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -229,7 +248,7 @@ void TestReduceByKeyCudaStreams() // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); ASSERT_EQUAL(new_last.second - output_values.begin(), 3); @@ -244,7 +263,7 @@ void TestReduceByKeyCudaStreams() // test BinaryFunction initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -262,5 +281,120 @@ void TestReduceByKeyCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceByKeyCudaStreams); +void TestReduceByKeyCudaStreamsSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsSync); 
+ + +void TestReduceByKeyCudaStreamsNoSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync); + + +// Maps indices to key ids +class div_op : public thrust::unary_function +{ + std::int64_t m_divisor; + +public: + __host__ div_op(std::int64_t divisor) + : m_divisor(divisor) + {} + + __host__ __device__ + std::int64_t operator()(std::int64_t x) const + { + return x / m_divisor; + } +}; + +// Produces unique sequence for key +class mod_op : public thrust::unary_function +{ + std::int64_t m_divisor; + +public: + __host__ mod_op(std::int64_t divisor) + : m_divisor(divisor) + {} + + __host__ __device__ + std::int64_t operator()(std::int64_t x) const + { + // div: 2 + // idx: 0 1 2 3 4 5 + // key: 0 0 | 1 1 | 2 2 + // mod: 0 1 | 0 1 | 0 1 + // ret: 0 1 1 2 2 3 + return (x % m_divisor) + (x / m_divisor); + } +}; + + +void TestReduceByKeyWithBigIndexesHelper(int magnitude) +{ + const std::int64_t key_size_magnitude = 8; + ASSERT_EQUAL(true, key_size_magnitude < magnitude); + + const std::int64_t num_items = 1ll << magnitude; + const std::int64_t num_unique_keys = 1ll << key_size_magnitude; + + // Size of each key group + const std::int64_t key_size = num_items / num_unique_keys; + + using counting_it = thrust::counting_iterator; + using transform_key_it = thrust::transform_iterator; + using transform_val_it = thrust::transform_iterator; + + counting_it count_begin(0ll); + counting_it count_end = count_begin + num_items; + ASSERT_EQUAL(static_cast(thrust::distance(count_begin, count_end)), + num_items); + + transform_key_it keys_begin(count_begin, div_op{key_size}); + transform_key_it keys_end(count_end, div_op{key_size}); + + transform_val_it values_begin(count_begin, mod_op{key_size}); + + thrust::device_vector output_keys(num_unique_keys); + thrust::device_vector output_values(num_unique_keys); + + // example: + // items: 6 + // unique_keys: 2 + // key_size: 3 + // keys: 0 0 0 | 1 1 1 + // values: 0 1 2 | 
1 2 3 + // result: 3 6 = sum(range(key_size)) + key_size * key_id + thrust::reduce_by_key(keys_begin, + keys_end, + values_begin, + output_keys.begin(), + output_values.begin()); + + ASSERT_EQUAL( + true, + thrust::equal(output_keys.begin(), output_keys.end(), count_begin)); + + thrust::host_vector result = output_values; + + const std::int64_t sum = (key_size - 1) * key_size / 2; + for (std::int64_t key_id = 0; key_id < num_unique_keys; key_id++) + { + ASSERT_EQUAL(result[key_id], sum + key_id * key_size); + } +} + +void TestReduceByKeyWithBigIndexes() +{ + TestReduceByKeyWithBigIndexesHelper(30); + TestReduceByKeyWithBigIndexesHelper(31); + TestReduceByKeyWithBigIndexesHelper(32); + TestReduceByKeyWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestReduceByKeyWithBigIndexes); diff --git a/testing/cuda/reduce_by_key.mk b/testing/cuda/reduce_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/reduce_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/remove.cu b/testing/cuda/remove.cu index 3509cd31b..0331c24b8 100644 --- a/testing/cuda/remove.cu +++ b/testing/cuda/remove.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void remove_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val, Iterator2 result) @@ -49,6 +50,7 @@ void remove_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last { *result_end = thrust::remove_copy_if(exec, first, last, stencil_first, result, pred); } +#endif template @@ -69,6 +71,7 @@ struct is_true }; +#ifdef THRUST_TEST_DEVICE_SIDE template void TestRemoveDevice(ExecutionPolicy exec) { @@ -328,6 +331,7 @@ void TestRemoveCopyIfStencilDeviceDevice() TestRemoveCopyIfStencilDevice(thrust::device); } DECLARE_UNITTEST(TestRemoveCopyIfStencilDeviceDevice); +#endif void TestRemoveCudaStreams() diff --git a/testing/cuda/remove.mk b/testing/cuda/remove.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ 
b/testing/cuda/remove.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/replace.cu b/testing/cuda/replace.cu index 24a03b2d5..bb8b7faa9 100644 --- a/testing/cuda/replace.cu +++ b/testing/cuda/replace.cu @@ -10,6 +10,7 @@ struct less_than_five }; +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void replace_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T1 old_value, T2 new_value) @@ -258,6 +259,7 @@ void TestReplaceCopyIfStencilDeviceDevice() TestReplaceCopyIfStencilDevice(thrust::device); } DECLARE_UNITTEST(TestReplaceCopyIfStencilDeviceDevice); +#endif void TestReplaceCudaStreams() diff --git a/testing/cuda/replace.mk b/testing/cuda/replace.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/replace.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/reverse.cu b/testing/cuda/reverse.cu index 4f6dfab08..f6599ed61 100644 --- a/testing/cuda/reverse.cu +++ b/testing/cuda/reverse.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void reverse_kernel(ExecutionPolicy exec, Iterator first, Iterator last) @@ -82,6 +83,7 @@ void TestReverseCopyDeviceDevice() TestReverseCopyDevice(thrust::device); } DECLARE_UNITTEST(TestReverseCopyDeviceDevice); +#endif void TestReverseCudaStreams() diff --git a/testing/cuda/reverse.mk b/testing/cuda/reverse.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/reverse.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/scan.cu b/testing/cuda/scan.cu index e67470cab..5a19798cd 100644 --- a/testing/cuda/scan.cu +++ b/testing/cuda/scan.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -116,6 +117,7 @@ struct TestScanDeviceDevice } }; VariableUnitTest TestScanDeviceDeviceInstance; +#endif void TestScanCudaStreams() @@ -212,3 +214,48 @@ void 
TestScanCudaStreams() } DECLARE_UNITTEST(TestScanCudaStreams); +template +struct const_ref_plus_mod3 +{ + T * table; + + const_ref_plus_mod3(T * table) : table(table) {} + + __host__ __device__ + const T& operator()(T a, T b) + { + return table[(int) (a + b)]; + } +}; + +static void TestInclusiveScanWithConstAccumulator(void) +{ + // add numbers modulo 3 with external lookup table + thrust::device_vector data(7); + data[0] = 0; + data[1] = 1; + data[2] = 2; + data[3] = 1; + data[4] = 2; + data[5] = 0; + data[6] = 1; + + thrust::device_vector table(6); + table[0] = 0; + table[1] = 1; + table[2] = 2; + table[3] = 0; + table[4] = 1; + table[5] = 2; + + thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3(thrust::raw_pointer_cast(&table[0]))); + + ASSERT_EQUAL(data[0], 0); + ASSERT_EQUAL(data[1], 1); + ASSERT_EQUAL(data[2], 0); + ASSERT_EQUAL(data[3], 1); + ASSERT_EQUAL(data[4], 0); + ASSERT_EQUAL(data[5], 0); + ASSERT_EQUAL(data[6], 1); +} +DECLARE_UNITTEST(TestInclusiveScanWithConstAccumulator); diff --git a/testing/cuda/scan.mk b/testing/cuda/scan.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/scan.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/scan_by_key.cu b/testing/cuda/scan_by_key.cu index e65560edf..0fea161d7 100644 --- a/testing/cuda/scan_by_key.cu +++ b/testing/cuda/scan_by_key.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void inclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result) @@ -78,7 +79,7 @@ void TestScanByKeyDevice(ExecutionPolicy exec) } ASSERT_EQUAL(d_output, h_output); - // in-place scans + // in-place scans: in/out values aliasing h_output = h_vals; d_output = d_vals; thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin()); @@ -98,6 +99,24 @@ void TestScanByKeyDevice(ExecutionPolicy exec) 
ASSERT_EQUAL(cudaSuccess, err); } ASSERT_EQUAL(d_output, h_output); + + // in-place scans: keys/values aliasing + thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin()); + inclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin()); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } + ASSERT_EQUAL(d_keys, h_output); + + d_keys = h_keys; + thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), 11); + exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin(), 11); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } + ASSERT_EQUAL(d_keys, h_output); } @@ -113,6 +132,7 @@ void TestScanByKeyDeviceDevice() TestScanByKeyDevice(thrust::device); } DECLARE_UNITTEST(TestScanByKeyDeviceDevice); +#endif void TestInclusiveScanByKeyCudaStreams() diff --git a/testing/cuda/scan_by_key.mk b/testing/cuda/scan_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/scan_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/scatter.cu b/testing/cuda/scatter.cu index 52bd9755f..92e7f342a 100644 --- a/testing/cuda/scatter.cu +++ b/testing/cuda/scatter.cu @@ -3,6 +3,7 @@ #include #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void scatter_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 map_first, Iterator3 result) @@ -112,6 +113,7 @@ void TestScatterIfDeviceDevice() TestScatterIfDevice(thrust::device); } DECLARE_UNITTEST(TestScatterIfDeviceDevice); +#endif void TestScatterCudaStreams() diff --git a/testing/cuda/scatter.mk b/testing/cuda/scatter.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/scatter.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/sequence.cu b/testing/cuda/sequence.cu 
index acbe09848..16b2d799b 100644 --- a/testing/cuda/sequence.cu +++ b/testing/cuda/sequence.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last) @@ -80,6 +81,7 @@ void TestSequenceDeviceDevice() TestSequenceDevice(thrust::device); } DECLARE_UNITTEST(TestSequenceDeviceDevice); +#endif void TestSequenceCudaStreams() { diff --git a/testing/cuda/sequence.mk b/testing/cuda/sequence.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/sequence.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_difference.cu b/testing/cuda/set_difference.cu index d87db42d9..bd9da131f 100644 --- a/testing/cuda/set_difference.cu +++ b/testing/cuda/set_difference.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_difference_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator2 last2, Iterator3 result1, Iterator4 result2) @@ -52,6 +53,7 @@ void TestSetDifferenceDeviceDevice() TestSetDifferenceDevice(thrust::device); } DECLARE_UNITTEST(TestSetDifferenceDeviceDevice); +#endif void TestSetDifferenceCudaStreams() diff --git a/testing/cuda/set_difference.mk b/testing/cuda/set_difference.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_difference.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_difference_by_key.cu b/testing/cuda/set_difference_by_key.cu index 31d2860b0..2c32466f1 100644 --- a/testing/cuda/set_difference_by_key.cu +++ b/testing/cuda/set_difference_by_key.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_difference_by_key_kernel(ExecutionPolicy exec, @@ -82,6 +83,7 @@ void TestSetDifferenceByKeyDeviceDevice() TestSetDifferenceByKeyDevice(thrust::device); } DECLARE_UNITTEST(TestSetDifferenceByKeyDeviceDevice); +#endif void 
TestSetDifferenceByKeyCudaStreams() diff --git a/testing/cuda/set_difference_by_key.mk b/testing/cuda/set_difference_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_difference_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_intersection.cu b/testing/cuda/set_intersection.cu index a57bc1b2a..2bb30ea87 100644 --- a/testing/cuda/set_intersection.cu +++ b/testing/cuda/set_intersection.cu @@ -6,6 +6,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_intersection_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, @@ -59,7 +60,16 @@ void TestSetIntersectionDeviceDevice() DECLARE_UNITTEST(TestSetIntersectionDeviceDevice); -void TestSetIntersectionCudaStreams() +void TestSetIntersectionDeviceNoSync() +{ + TestSetIntersectionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionDeviceNoSync); +#endif + + +template +void TestSetIntersectionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -77,7 +87,9 @@ void TestSetIntersectionCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - Iterator end = thrust::set_intersection(thrust::cuda::par.on(s), + auto streampolicy = policy.on(s); + + Iterator end = thrust::set_intersection(streampolicy, a.begin(), a.end(), b.begin(), b.end(), result.begin()); @@ -88,5 +100,17 @@ void TestSetIntersectionCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionCudaStreams); + +void TestSetIntersectionCudaStreamsSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsSync); + + +void TestSetIntersectionCudaStreamsNoSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsNoSync); diff --git a/testing/cuda/set_intersection.mk b/testing/cuda/set_intersection.mk new file mode 100644 index 000000000..7d930481e --- 
/dev/null +++ b/testing/cuda/set_intersection.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_intersection_by_key.cu b/testing/cuda/set_intersection_by_key.cu index a19f82221..fed6cb6f6 100644 --- a/testing/cuda/set_intersection_by_key.cu +++ b/testing/cuda/set_intersection_by_key.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_intersection_by_key_kernel(ExecutionPolicy exec, @@ -73,7 +74,16 @@ void TestSetIntersectionByKeyDeviceDevice() DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice); -void TestSetIntersectionByKeyCudaStreams() +void TestSetIntersectionByKeyDeviceNoSync() +{ + TestSetIntersectionByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceNoSync); +#endif + + +template +void TestSetIntersectionByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -95,8 +105,10 @@ void TestSetIntersectionByKeyCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + thrust::pair end = - thrust::set_intersection_by_key(thrust::cuda::par.on(s), + thrust::set_intersection_by_key(streampolicy, a_key.begin(), a_key.end(), b_key.begin(), b_key.end(), a_val.begin(), @@ -111,5 +123,17 @@ void TestSetIntersectionByKeyCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams); + +void TestSetIntersectionByKeyCudaStreamsSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsSync); + + +void TestSetIntersectionByKeyCudaStreamsNoSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsNoSync); diff --git a/testing/cuda/set_intersection_by_key.mk b/testing/cuda/set_intersection_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_intersection_by_key.mk @@ -0,0 +1 @@ 
+CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_symmetric_difference.cu b/testing/cuda/set_symmetric_difference.cu index 34969886e..43fc0e993 100644 --- a/testing/cuda/set_symmetric_difference.cu +++ b/testing/cuda/set_symmetric_difference.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_symmetric_difference_kernel(ExecutionPolicy exec, @@ -59,6 +60,7 @@ void TestSetSymmetricDifferenceDeviceDevice() TestSetSymmetricDifferenceDevice(thrust::device); } DECLARE_UNITTEST(TestSetSymmetricDifferenceDeviceDevice); +#endif void TestSetSymmetricDifferenceCudaStreams() diff --git a/testing/cuda/set_symmetric_difference.mk b/testing/cuda/set_symmetric_difference.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_symmetric_difference.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_symmetric_difference_by_key.cu b/testing/cuda/set_symmetric_difference_by_key.cu index 3a6c68ce9..7e7adba5e 100644 --- a/testing/cuda/set_symmetric_difference_by_key.cu +++ b/testing/cuda/set_symmetric_difference_by_key.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_symmetric_difference_by_key_kernel(ExecutionPolicy exec, @@ -74,6 +75,7 @@ void TestSetSymmetricDifferenceByKeyDeviceDevice() TestSetSymmetricDifferenceByKeyDevice(thrust::device); } DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDeviceDevice); +#endif void TestSetSymmetricDifferenceByKeyCudaStreams() diff --git a/testing/cuda/set_symmetric_difference_by_key.mk b/testing/cuda/set_symmetric_difference_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_symmetric_difference_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_union.cu b/testing/cuda/set_union.cu index fb5b543e1..058f0e700 100644 --- a/testing/cuda/set_union.cu +++ b/testing/cuda/set_union.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE 
template __global__ void set_union_kernel(ExecutionPolicy exec, @@ -59,6 +60,7 @@ void TestSetUnionDeviceDevice() TestSetUnionDevice(thrust::device); } DECLARE_UNITTEST(TestSetUnionDeviceDevice); +#endif void TestSetUnionCudaStreams() diff --git a/testing/cuda/set_union.mk b/testing/cuda/set_union.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_union.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/set_union_by_key.cu b/testing/cuda/set_union_by_key.cu index 1be3d9302..013ebe11b 100644 --- a/testing/cuda/set_union_by_key.cu +++ b/testing/cuda/set_union_by_key.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void set_union_by_key_kernel(ExecutionPolicy exec, @@ -73,6 +74,7 @@ void TestSetUnionByKeyDeviceDevice() TestSetUnionByKeyDevice(thrust::device); } DECLARE_UNITTEST(TestSetUnionByKeyDeviceDevice); +#endif void TestSetUnionByKeyCudaStreams() diff --git a/testing/cuda/set_union_by_key.mk b/testing/cuda/set_union_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/set_union_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/sort.cu b/testing/cuda/sort.cu index 7f3d6413c..c3d5ff2bc 100644 --- a/testing/cuda/sort.cu +++ b/testing/cuda/sort.cu @@ -4,19 +4,6 @@ #include -template -__global__ -void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported) -{ -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; - thrust::sort(exec, first, last, comp); -#else - *is_supported = false; -#endif -} - - template struct my_less { @@ -28,25 +15,29 @@ struct my_less }; +#ifdef THRUST_TEST_DEVICE_SIDE +template +__global__ +void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp) +{ + thrust::sort(exec, first, last, comp); +} + + template void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp) { thrust::host_vector h_data = 
unittest::random_integers(n); thrust::device_vector d_data = h_data; - thrust::device_vector is_supported(1); - - sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin()); + sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - thrust::sort(h_data.begin(), h_data.end(), comp); - - ASSERT_EQUAL(h_data, d_data); - } + thrust::sort(h_data.begin(), h_data.end(), comp); + + ASSERT_EQUAL(h_data, d_data); }; @@ -111,6 +102,7 @@ VariableUnitTest< TestSortDeviceDevice, unittest::type_list > TestSortDeviceDeviceInstance; +#endif void TestSortCudaStreams() @@ -163,7 +155,7 @@ void TestComparisonSortCudaStreams() cudaStreamSynchronize(s); ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less())); - + cudaStreamDestroy(s); } DECLARE_UNITTEST(TestComparisonSortCudaStreams); diff --git a/testing/cuda/sort.mk b/testing/cuda/sort.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/sort.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/sort_by_key.cu b/testing/cuda/sort_by_key.cu index 1e848879b..ee2b44ea0 100644 --- a/testing/cuda/sort_by_key.cu +++ b/testing/cuda/sort_by_key.cu @@ -4,19 +4,6 @@ #include -template -__global__ -void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp, Iterator3 is_supported) -{ -#if (__CUDA_ARCH__ >= 200) - *is_supported = true; - thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp); -#else - *is_supported = false; -#endif -} - - template struct my_less { @@ -28,6 +15,15 @@ struct my_less }; +#ifdef THRUST_TEST_DEVICE_SIDE +template +__global__ +void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp) +{ + thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp); +} + + 
template void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare comp) { @@ -36,19 +32,15 @@ void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare thrust::host_vector h_values = h_keys; thrust::device_vector d_values = d_keys; - - thrust::device_vector is_supported(1); - sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp, is_supported.begin()); + + sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); - if(is_supported[0]) - { - thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp); - - ASSERT_EQUAL(h_keys, d_keys); - ASSERT_EQUAL(h_values, d_values); - } + thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp); + + ASSERT_EQUAL(h_keys, d_keys); + ASSERT_EQUAL(h_values, d_values); }; @@ -113,6 +105,7 @@ VariableUnitTest< TestSortByKeyDeviceDevice, unittest::type_list > TestSortByKeyDeviceDeviceInstance; +#endif void TestComparisonSortByKeyCudaStreams() @@ -139,7 +132,7 @@ void TestComparisonSortByKeyCudaStreams() ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end())); ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end())); - + cudaStreamDestroy(s); } DECLARE_UNITTEST(TestComparisonSortByKeyCudaStreams); @@ -169,7 +162,7 @@ void TestSortByKeyCudaStreams() ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end())); ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end())); - + cudaStreamDestroy(s); } DECLARE_UNITTEST(TestSortByKeyCudaStreams); diff --git a/testing/cuda/sort_by_key.mk b/testing/cuda/sort_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/sort_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/stream_legacy.cu b/testing/cuda/stream_legacy.cu new file mode 100644 index 000000000..51c82a096 --- /dev/null +++ 
b/testing/cuda/stream_legacy.cu @@ -0,0 +1,21 @@ +#include +#include +#include + +#include + +void verify_stream() +{ + auto exec = thrust::device; + auto stream = thrust::cuda_cub::stream(exec); + ASSERT_EQUAL(stream, cudaStreamLegacy); +} + +void TestLegacyDefaultStream() +{ + verify_stream(); + + std::thread t(verify_stream); + t.join(); +} +DECLARE_UNITTEST(TestLegacyDefaultStream); diff --git a/testing/cuda/stream_per_thread.cmake b/testing/cuda/stream_per_thread.cmake new file mode 100644 index 000000000..2cea2f938 --- /dev/null +++ b/testing/cuda/stream_per_thread.cmake @@ -0,0 +1,13 @@ +# This test should always use per-thread streams on NVCC. +set_target_properties(${test_target} PROPERTIES + COMPILE_OPTIONS + $<$,$>:--default-stream=per-thread> +) + +thrust_fix_clang_nvcc_build_for(${test_target}) + +# NVC++ does not have an equivalent option, and will always +# use the global stream by default. +if (CMAKE_CUDA_COMPILER_ID STREQUAL "Feta") + set_tests_properties(${test_target} PROPERTIES WILL_FAIL ON) +endif() diff --git a/testing/cuda/stream_per_thread.cu b/testing/cuda/stream_per_thread.cu new file mode 100644 index 000000000..ef126e78a --- /dev/null +++ b/testing/cuda/stream_per_thread.cu @@ -0,0 +1,21 @@ +#include +#include +#include + +#include + +void verify_stream() +{ + auto exec = thrust::device; + auto stream = thrust::cuda_cub::stream(exec); + ASSERT_EQUAL(stream, cudaStreamPerThread); +} + +void TestPerThreadDefaultStream() +{ + verify_stream(); + + std::thread t(verify_stream); + t.join(); +} +DECLARE_UNITTEST(TestPerThreadDefaultStream); diff --git a/testing/cuda/stream_per_thread.mk b/testing/cuda/stream_per_thread.mk new file mode 100644 index 000000000..da9adfe1b --- /dev/null +++ b/testing/cuda/stream_per_thread.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += --default-stream per-thread diff --git a/testing/cuda/swap_ranges.cu b/testing/cuda/swap_ranges.cu index e2392bbe2..ebc396e83 100644 --- a/testing/cuda/swap_ranges.cu +++ 
b/testing/cuda/swap_ranges.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void swap_ranges_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2) @@ -50,6 +51,7 @@ void TestSwapRangesDeviceDevice() TestSwapRangesDevice(thrust::device); } DECLARE_UNITTEST(TestSwapRangesDeviceDevice); +#endif void TestSwapRangesCudaStreams() { diff --git a/testing/cuda/swap_ranges.mk b/testing/cuda/swap_ranges.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/swap_ranges.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/tabulate.cu b/testing/cuda/tabulate.cu index 564d85e7e..b449fb7cc 100644 --- a/testing/cuda/tabulate.cu +++ b/testing/cuda/tabulate.cu @@ -4,6 +4,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void tabulate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f) @@ -69,6 +70,7 @@ void TestTabulateDeviceDevice() TestTabulateDevice(thrust::device); } DECLARE_UNITTEST(TestTabulateDeviceDevice); +#endif void TestTabulateCudaStreams() { diff --git a/testing/cuda/tabulate.mk b/testing/cuda/tabulate.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/tabulate.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/transform.cu b/testing/cuda/transform.cu index fa0358e57..7739089e6 100644 --- a/testing/cuda/transform.cu +++ b/testing/cuda/transform.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void transform_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function f, Iterator3 result2) @@ -270,6 +271,7 @@ void TestTransformIfBinaryDeviceDevice() TestTransformIfBinaryDevice(thrust::device); } DECLARE_UNITTEST(TestTransformIfBinaryDeviceDevice); +#endif void TestTransformUnaryCudaStreams() { diff --git a/testing/cuda/transform.mk b/testing/cuda/transform.mk new file mode 100644 index 000000000..7d930481e --- /dev/null 
+++ b/testing/cuda/transform.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/transform_reduce.cu b/testing/cuda/transform_reduce.cu index dcc8f646b..c55aa66e7 100644 --- a/testing/cuda/transform_reduce.cu +++ b/testing/cuda/transform_reduce.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void transform_reduce_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Function1 f1, T init, Function2 f2, Iterator2 result) @@ -44,6 +45,7 @@ void TestTransformReduceDeviceDevice() TestTransformReduceDevice(thrust::device); } DECLARE_UNITTEST(TestTransformReduceDeviceDevice); +#endif void TestTransformReduceCudaStreams() diff --git a/testing/cuda/transform_reduce.mk b/testing/cuda/transform_reduce.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/transform_reduce.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/transform_scan.cu b/testing/cuda/transform_scan.cu index e629fcdff..de0d1524f 100644 --- a/testing/cuda/transform_scan.cu +++ b/testing/cuda/transform_scan.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void transform_inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function1 f1, Function2 f2, Iterator3 result2) @@ -115,6 +116,7 @@ void TestTransformScanDeviceDevice() TestTransformScanDevice(thrust::device); } DECLARE_UNITTEST(TestTransformScanDeviceDevice); +#endif void TestTransformScanCudaStreams() @@ -184,3 +186,30 @@ void TestTransformScanCudaStreams() } DECLARE_UNITTEST(TestTransformScanCudaStreams); +void TestTransformScanConstAccumulator() +{ + typedef thrust::device_vector Vector; + typedef Vector::value_type T; + + Vector::iterator iter; + + Vector input(5); + Vector reference(5); + Vector output(5); + + input[0] = 1; + input[1] = 3; + input[2] = -2; + input[3] = 4; + input[4] = -5; + + thrust::transform_inclusive_scan(input.begin(), + input.end(), + output.begin(), 
+ thrust::identity(), + thrust::plus()); + thrust::inclusive_scan(input.begin(), input.end(), reference.begin(), thrust::plus()); + + ASSERT_EQUAL(output, reference); +} +DECLARE_UNITTEST(TestTransformScanConstAccumulator); diff --git a/testing/cuda/transform_scan.mk b/testing/cuda/transform_scan.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/transform_scan.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/uninitialized_copy.cu b/testing/cuda/uninitialized_copy.cu index 31feb0716..735e2dac3 100644 --- a/testing/cuda/uninitialized_copy.cu +++ b/testing/cuda/uninitialized_copy.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void uninitialized_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -45,6 +46,7 @@ void TestUninitializedCopyDeviceDevice() TestUninitializedCopyDevice(thrust::device); } DECLARE_UNITTEST(TestUninitializedCopyDeviceDevice); +#endif void TestUninitializedCopyCudaStreams() @@ -74,6 +76,7 @@ void TestUninitializedCopyCudaStreams() DECLARE_UNITTEST(TestUninitializedCopyCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void uninitialized_copy_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, Iterator2 result) @@ -116,6 +119,7 @@ void TestUninitializedCopyNDeviceDevice() TestUninitializedCopyNDevice(thrust::device); } DECLARE_UNITTEST(TestUninitializedCopyNDeviceDevice); +#endif void TestUninitializedCopyNCudaStreams() diff --git a/testing/cuda/uninitialized_copy.mk b/testing/cuda/uninitialized_copy.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/uninitialized_copy.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/uninitialized_fill.cu b/testing/cuda/uninitialized_fill.cu index fd7477347..bb222cf02 100644 --- a/testing/cuda/uninitialized_fill.cu +++ b/testing/cuda/uninitialized_fill.cu @@ -3,6 +3,7 @@ #include +#ifdef THRUST_TEST_DEVICE_SIDE template 
__global__ void uninitialized_fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val) @@ -90,6 +91,7 @@ void TestUninitializedFillDeviceDevice() TestUninitializedFillDevice(thrust::device); } DECLARE_UNITTEST(TestUninitializedFillDeviceDevice); +#endif void TestUninitializedFillCudaStreams() @@ -119,6 +121,7 @@ void TestUninitializedFillCudaStreams() DECLARE_UNITTEST(TestUninitializedFillCudaStreams); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void uninitialized_fill_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, T val, Iterator2 result) @@ -163,9 +166,6 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec) ASSERT_EQUAL(cudaSuccess, err); } - cudaError_t const err = cudaDeviceSynchronize(); - ASSERT_EQUAL(cudaSuccess, err); - iter = iter_vec[0]; ASSERT_EQUAL(v[0], exemplar); @@ -223,6 +223,7 @@ void TestUninitializedFillNDeviceDevice() TestUninitializedFillNDevice(thrust::device); } DECLARE_UNITTEST(TestUninitializedFillNDeviceDevice); +#endif void TestUninitializedFillNCudaStreams() diff --git a/testing/cuda/uninitialized_fill.mk b/testing/cuda/uninitialized_fill.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/uninitialized_fill.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/unique.cu b/testing/cuda/unique.cu index c0dc7973d..136ba76fd 100644 --- a/testing/cuda/unique.cu +++ b/testing/cuda/unique.cu @@ -3,6 +3,15 @@ #include +template +struct is_equal_div_10_unique +{ + __host__ __device__ + bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); } +}; + + +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) @@ -19,14 +28,6 @@ void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Binary } -template -struct is_equal_div_10_unique -{ - __host__ __device__ - bool operator()(const T x, const T& y) const { return ((int) x / 10) == 
((int) y / 10); } -}; - - template void TestUniqueDevice(ExecutionPolicy exec) { @@ -94,7 +95,16 @@ void TestUniqueDeviceDevice() DECLARE_UNITTEST(TestUniqueDeviceDevice); -void TestUniqueCudaStreams() +void TestUniqueDeviceNoSync() +{ + TestUniqueDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueDeviceNoSync); +#endif + + +template +void TestUniqueCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -116,8 +126,10 @@ void TestUniqueCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end()); + new_last = thrust::unique(streampolicy, data.begin(), data.end()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 7); @@ -129,7 +141,7 @@ void TestUniqueCudaStreams() ASSERT_EQUAL(data[5], 31); ASSERT_EQUAL(data[6], 37); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique()); + new_last = thrust::unique(streampolicy, data.begin(), new_last, is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -139,9 +151,22 @@ void TestUniqueCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCudaStreams); +void TestUniqueCudaStreamsSync() +{ + TestUniqueCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsSync); + +void TestUniqueCudaStreamsNoSync() +{ + TestUniqueCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsNoSync); + + +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void unique_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Iterator3 result2) @@ -227,7 +252,16 @@ void TestUniqueCopyDeviceDevice() DECLARE_UNITTEST(TestUniqueCopyDeviceDevice); -void TestUniqueCopyCudaStreams() +void TestUniqueCopyDeviceNoSync() +{ + TestUniqueCopyDevice(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestUniqueCopyDeviceNoSync); +#endif + + +template +void TestUniqueCopyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -251,8 +285,10 @@ void TestUniqueCopyCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin()); + new_last = thrust::unique_copy(streampolicy, data.begin(), data.end(), output.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - output.begin(), 7); @@ -264,7 +300,7 @@ void TestUniqueCopyCudaStreams() ASSERT_EQUAL(output[5], 31); ASSERT_EQUAL(output[6], 37); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_copy(streampolicy, output.begin(), new_last, data.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -274,5 +310,144 @@ void TestUniqueCopyCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyCudaStreams); + +void TestUniqueCopyCudaStreamsSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsSync); + + +void TestUniqueCopyCudaStreamsNoSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync); + + +#ifdef THRUST_TEST_DEVICE_SIDE +template +__global__ +void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result) +{ + *result = thrust::unique_count(exec, first, last); +} + + +template +__global__ +void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result) +{ + *result = thrust::unique_count(exec, first, last, pred); +} + + +template +void TestUniqueCountDevice(ExecutionPolicy exec) +{ + typedef thrust::device_vector Vector; + typedef Vector::value_type T; + 
+ Vector data(10); + data[0] = 11; + data[1] = 11; + data[2] = 12; + data[3] = 20; + data[4] = 29; + data[5] = 21; + data[6] = 21; + data[7] = 31; + data[8] = 31; + data[9] = 37; + + Vector output(1, -1); + + unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin()); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } + + ASSERT_EQUAL(output[0], 7); + + unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_equal_div_10_unique(), output.begin()); + { + cudaError_t const err = cudaDeviceSynchronize(); + ASSERT_EQUAL(cudaSuccess, err); + } + + ASSERT_EQUAL(output[0], 3); +} + + +void TestUniqueCountDeviceSeq() +{ + TestUniqueCountDevice(thrust::seq); +} +DECLARE_UNITTEST(TestUniqueCountDeviceSeq); + + +void TestUniqueCountDeviceDevice() +{ + TestUniqueCountDevice(thrust::device); +} +DECLARE_UNITTEST(TestUniqueCountDeviceDevice); + + +void TestUniqueCountDeviceNoSync() +{ + TestUniqueCountDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCountDeviceNoSync); +#endif + + +template +void TestUniqueCountCudaStreams(ExecutionPolicy policy) +{ + typedef thrust::device_vector Vector; + typedef Vector::value_type T; + + Vector data(10); + data[0] = 11; + data[1] = 11; + data[2] = 12; + data[3] = 20; + data[4] = 29; + data[5] = 21; + data[6] = 21; + data[7] = 31; + data[8] = 31; + data[9] = 37; + + cudaStream_t s; + cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); + + int result = thrust::unique_count(streampolicy, data.begin(), data.end()); + cudaStreamSynchronize(s); + + ASSERT_EQUAL(result, 7); + + result = thrust::unique_count(streampolicy, data.begin(), data.end(), is_equal_div_10_unique()); + cudaStreamSynchronize(s); + + ASSERT_EQUAL(result, 3); + + cudaStreamDestroy(s); +} + +void TestUniqueCountCudaStreamsSync() +{ + TestUniqueCountCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCountCudaStreamsSync); + + +void TestUniqueCountCudaStreamsNoSync() +{ + 
TestUniqueCountCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCountCudaStreamsNoSync); diff --git a/testing/cuda/unique.mk b/testing/cuda/unique.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/unique.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/cuda/unique_by_key.cu b/testing/cuda/unique_by_key.cu index c58a64d51..d96cbdc6c 100644 --- a/testing/cuda/unique_by_key.cu +++ b/testing/cuda/unique_by_key.cu @@ -44,6 +44,7 @@ void initialize_values(Vector& values) } +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void unique_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result) @@ -134,7 +135,16 @@ void TestUniqueByKeyDeviceDevice() DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice); -void TestUniqueByKeyCudaStreams() +void TestUniqueByKeyDeviceNoSync() +{ + TestUniqueByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueByKeyDeviceNoSync); +#endif + + +template +void TestUniqueByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -150,8 +160,10 @@ void TestUniqueByKeyCudaStreams() cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - keys.begin(), 5); @@ -171,7 +183,7 @@ void TestUniqueByKeyCudaStreams() // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); ASSERT_EQUAL(new_last.first - keys.begin(), 3); 
ASSERT_EQUAL(new_last.second - values.begin(), 3); @@ -185,9 +197,22 @@ void TestUniqueByKeyCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueByKeyCudaStreams); + +void TestUniqueByKeyCudaStreamsSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsSync); + + +void TestUniqueByKeyCudaStreamsNoSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsNoSync); +#ifdef THRUST_TEST_DEVICE_SIDE template __global__ void unique_by_key_copy_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, Iterator5 result) @@ -282,7 +307,16 @@ void TestUniqueCopyByKeyDeviceDevice() DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice); -void TestUniqueCopyByKeyCudaStreams() +void TestUniqueCopyByKeyDeviceNoSync() +{ + TestUniqueCopyByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceNoSync); +#endif + + +template +void TestUniqueCopyByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -302,7 +336,9 @@ void TestUniqueCopyByKeyCudaStreams() cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); @@ -322,7 +358,7 @@ void TestUniqueCopyByKeyCudaStreams() // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); + 
new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); @@ -337,5 +373,17 @@ void TestUniqueCopyByKeyCudaStreams() cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams); + +void TestUniqueCopyByKeyCudaStreamsSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsSync); + + +void TestUniqueCopyByKeyCudaStreamsNoSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsNoSync); diff --git a/testing/cuda/unique_by_key.mk b/testing/cuda/unique_by_key.mk new file mode 100644 index 000000000..7d930481e --- /dev/null +++ b/testing/cuda/unique_by_key.mk @@ -0,0 +1 @@ +CUDACC_FLAGS += -rdc=true diff --git a/testing/dependencies_aware_policies.cu b/testing/dependencies_aware_policies.cu index 5f48bf4f2..531339215 100644 --- a/testing/dependencies_aware_policies.cu +++ b/testing/dependencies_aware_policies.cu @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -9,7 +10,7 @@ # include #endif -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 template struct test_allocator_t @@ -178,11 +179,11 @@ SimpleUnitTest< > > TestDependencyAttachmentInstance; -#else +#else // C++11 void TestDummy() { } DECLARE_UNITTEST(TestDummy); -#endif +#endif // C++11 diff --git a/testing/device_delete.cu b/testing/device_delete.cu index 6684cb2b5..12f757fa4 100644 --- a/testing/device_delete.cu +++ b/testing/device_delete.cu @@ -4,21 +4,23 @@ #include #include +#include + struct Foo { __host__ __device__ Foo(void) - :set_me_upon_destruction(0) + : set_me_upon_destruction{nullptr} {} __host__ __device__ ~Foo(void) { -#ifdef __CUDA_ARCH__ - // __device__ overload - if(set_me_upon_destruction != 0) - *set_me_upon_destruction = true; -#endif + 
NV_IF_TARGET(NV_IS_DEVICE, ( + if (set_me_upon_destruction != nullptr) + { + *set_me_upon_destruction = true; + })); } bool *set_me_upon_destruction; diff --git a/testing/docs/doxybook_test.h b/testing/docs/doxybook_test.h new file mode 100644 index 000000000..244648ee1 --- /dev/null +++ b/testing/docs/doxybook_test.h @@ -0,0 +1,222 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + * \brief Test case for Doxybook rendering. + */ + +#pragma once + +namespace thrust +{ + +/*! \addtogroup test Test + * \{ + */ + +/*! \brief \c test_predefined_friend_struct is a class intended to exercise and + * test Doxybook rendering. + */ +template +struct test_predefined_friend_struct {}; + +/*! \brief \c test_predefined_friend_function is a function intended to + * exercise and test Doxybook rendering. + */ +template +void test_predefined_friend_function(); + +/*! \brief \c test_class is a class intended to exercise and test Doxybook + * rendering. + * + * It does many things. + * + * \tparam T A template parameter. + * \tparam U Another template parameter. + * + * \see test_function + */ +template +class test_class +{ +public: + template + struct test_nested_struct {}; + + int test_member_variable = 0; ///< A test member variable. + + [[deprecated]] static constexpr int test_member_constant = 42; ///< A test member constant. 
+ + template + using test_type_alias = test_class; + + enum class test_enum_class { + A = 15, ///< An enumerator. It is equal to 15. + B, + C + }; + + /*! \brief Construct an empty test class. + */ + test_class() = default; + + /*! \brief Construct a test class. + */ + __host__ __device__ constexpr + test_class(int); + + /*! \brief \c test_member_function is a function intended to exercise + * and test Doxybook rendering. + */ + __host__ __device__ constexpr + int test_member_function() = 0; + + /*! \brief \c test_virtual_member_function is a function intended to exercise + * and test Doxybook rendering. + */ + __host__ __device__ + virtual int test_virtual_member_function() = 0; + + /*! \brief \c test_parameter_overflow_member_function is a function intended + * to test Doxybook's rendering of function and template parameters that exceed + * the length of a line. + */ + template , + typename B = test_predefined_friend_struct, + typename C = test_predefined_friend_struct> + test_predefined_friend_struct + test_parameter_overflow_member_function(test_predefined_friend_struct a, + test_predefined_friend_struct b, + test_predefined_friend_struct c); + + template + friend void test_friend_function() {} + + template + friend void test_predefined_friend_function(); + + template + friend struct thrust::test_predefined_friend_struct; + +protected: + + template + class test_protected_nested_class {}; + + /*! \brief \c test_protected_member_function is a function intended to + * exercise and test Doxybook rendering. + */ + __device__ + auto test_protected_member_function(); +}; + +/*! \brief \c test_derived_class is a derived class intended to exercise and + * test Doxybook rendering. + */ +class test_derived_class : test_class +{ + template + struct test_derived_nested_struct {}; + + double test_derived_member_variable = 3.14; ///< A test member variable. + + typedef double test_typedef; + + /*! 
\brief \c test_derived_member_function is a function intended to exercise + * and test Doxybook rendering. + */ + __host__ __device__ constexpr + double test_derived_member_function(int, int); +}; + +/*! \brief \c test_function is a function intended to exercise and test Doxybook + * rendering. + * + * \tparam T A template parameter. + * + * \param a A function parameter. + * \param b A function parameter. + */ +template +void test_function(T const& a, test_class&& b); + +/*! \brief \c test_parameter_overflow_function is a function intended to test + * Doxybook's rendering of function and template parameters that exceed the + * length of a line. + */ +template , + typename U = test_predefined_friend_struct, + typename V = test_predefined_friend_struct +> +test_predefined_friend_struct +test_parameter_overflow_function(test_predefined_friend_struct t, + test_predefined_friend_struct u, + test_predefined_friend_struct v); + +/*! \brief \c test_enum is an enum namespace intended to exercise and test + * Doxybook rendering. + */ +enum class test_enum { + X = 1, ///< An enumerator. It is equal to 1. + Y = X, + Z = 2 +}; + +/*! \brief \c test_alias is a type alias intended to exercise and test Doxybook + * rendering. + */ +using test_alias = test_class; + +/*! \brief \c test_namespace is a namespace intended to exercise and test + * Doxybook rendering. + */ +namespace test_namespace { + +inline constexpr int test_constant = 12; + +/*! \brief \c nested_function is a function intended to exercise and test + * Doxybook rendering. + */ +template +auto test_nested_function(T t, U u) noexcept(noexcept(t + u)) -> decltype(t + u) +{ return t + u; } + +/*! \brief \c test_struct is a struct intended to exercise and test Doxybook + * rendering. + */ +template +struct test_struct +{ + test_struct& operator=(test_struct const&) = default; + + /*! \brief \c operator< is a function intended to exercise and test Doxybook + * rendering. 
+ */ + bool operator<(test_struct const& t); +}; + +} // namespace test_namespace + +/*! \brief \c THRUST_TEST_MACRO is a macro intended to exercise and test + * Doxybook rendering. + */ +#define THRUST_TEST_MACRO(x, y) thrust::test_namespace::nested_function(x, y) + +/*! \} // test + */ + +} // namespace thrust + diff --git a/testing/equal.cu b/testing/equal.cu index 932f3ccfd..ca9f7eb69 100644 --- a/testing/equal.cu +++ b/testing/equal.cu @@ -2,6 +2,8 @@ #include #include #include +#include +#include template void TestEqualSimple(void) @@ -102,3 +104,48 @@ void TestEqualDispatchImplicit() } DECLARE_UNITTEST(TestEqualDispatchImplicit); +struct only_set_when_both_expected +{ + long long expected; + bool * flag; + + __device__ + bool operator()(long long x, long long y) + { + if (x == expected && y == expected) + { + *flag = true; + } + + return x == y; + } +}; + +void TestEqualWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr has_executed = thrust::device_malloc(1); + *has_executed = false; + + only_set_when_both_expected fn = { (1ll << magnitude) - 1, + thrust::raw_pointer_cast(has_executed) }; + + ASSERT_EQUAL(thrust::equal(thrust::device, begin, end, begin, fn), true); + + bool has_executed_h = *has_executed; + thrust::device_free(has_executed); + + ASSERT_EQUAL(has_executed_h, true); +} + +void TestEqualWithBigIndexes() +{ + TestEqualWithBigIndexesHelper(30); + TestEqualWithBigIndexesHelper(31); + TestEqualWithBigIndexesHelper(32); + TestEqualWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestEqualWithBigIndexes); diff --git a/testing/event.cu b/testing/event.cu index a02f15fd7..581426919 100644 --- a/testing/event.cu +++ b/testing/event.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -58,8 
+58,6 @@ void test_event_new_stream() { auto e0 = thrust::device_event(thrust::new_stream); - auto e0_stream = e0.stream().native_handle(); - ASSERT_EQUAL(true, e0.valid_stream()); ASSERT_NOT_EQUAL_QUIET(nullptr, e0.stream().native_handle()); diff --git a/testing/fill.cu b/testing/fill.cu index ec32dcd30..7154b4118 100644 --- a/testing/fill.cu +++ b/testing/fill.cu @@ -22,17 +22,17 @@ void TestFillSimple(void) ASSERT_EQUAL(v[2], 7); ASSERT_EQUAL(v[3], 7); ASSERT_EQUAL(v[4], 4); - + thrust::fill(v.begin() + 0, v.begin() + 3, (T) 8); - + ASSERT_EQUAL(v[0], 8); ASSERT_EQUAL(v[1], 8); ASSERT_EQUAL(v[2], 8); ASSERT_EQUAL(v[3], 7); ASSERT_EQUAL(v[4], 4); - + thrust::fill(v.begin() + 2, v.end(), (T) 9); - + ASSERT_EQUAL(v[0], 8); ASSERT_EQUAL(v[1], 8); ASSERT_EQUAL(v[2], 9); @@ -40,7 +40,7 @@ void TestFillSimple(void) ASSERT_EQUAL(v[4], 9); thrust::fill(v.begin(), v.end(), (T) 1); - + ASSERT_EQUAL(v[0], 1); ASSERT_EQUAL(v[1], 1); ASSERT_EQUAL(v[2], 1); @@ -70,14 +70,14 @@ void TestFillMixedTypes(void) Vector v(4); thrust::fill(v.begin(), v.end(), bool(true)); - + ASSERT_EQUAL(v[0], 1); ASSERT_EQUAL(v[1], 1); ASSERT_EQUAL(v[2], 1); ASSERT_EQUAL(v[3], 1); - + thrust::fill(v.begin(), v.end(), char(20)); - + ASSERT_EQUAL(v[0], 20); ASSERT_EQUAL(v[1], 20); ASSERT_EQUAL(v[2], 20); @@ -101,17 +101,17 @@ void TestFill(size_t n) thrust::fill(d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1); ASSERT_EQUAL(h_data, d_data); - + thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2); thrust::fill(d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2); ASSERT_EQUAL(h_data, d_data); - + thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3); thrust::fill(d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3); ASSERT_EQUAL(h_data, d_data); - + thrust::fill(h_data.begin(), h_data.end(), (T) 4); thrust::fill(d_data.begin(), d_data.end(), 
(T) 4); @@ -135,18 +135,18 @@ void TestFillNSimple(void) ASSERT_EQUAL(v[3], 7); ASSERT_EQUAL(v[4], 4); ASSERT_EQUAL_QUIET(v.begin() + 4, iter); - + iter = thrust::fill_n(v.begin() + 0, 3, (T) 8); - + ASSERT_EQUAL(v[0], 8); ASSERT_EQUAL(v[1], 8); ASSERT_EQUAL(v[2], 8); ASSERT_EQUAL(v[3], 7); ASSERT_EQUAL(v[4], 4); ASSERT_EQUAL_QUIET(v.begin() + 3, iter); - + iter = thrust::fill_n(v.begin() + 2, 3, (T) 9); - + ASSERT_EQUAL(v[0], 8); ASSERT_EQUAL(v[1], 8); ASSERT_EQUAL(v[2], 9); @@ -155,7 +155,7 @@ void TestFillNSimple(void) ASSERT_EQUAL_QUIET(v.end(), iter); iter = thrust::fill_n(v.begin(), v.size(), (T) 1); - + ASSERT_EQUAL(v[0], 1); ASSERT_EQUAL(v[1], 1); ASSERT_EQUAL(v[2], 1); @@ -192,15 +192,15 @@ void TestFillNMixedTypes(void) Vector v(4); typename Vector::iterator iter = thrust::fill_n(v.begin(), v.size(), bool(true)); - + ASSERT_EQUAL(v[0], 1); ASSERT_EQUAL(v[1], 1); ASSERT_EQUAL(v[2], 1); ASSERT_EQUAL(v[3], 1); ASSERT_EQUAL_QUIET(v.end(), iter); - + iter = thrust::fill_n(v.begin(), v.size(), char(20)); - + ASSERT_EQUAL(v[0], 20); ASSERT_EQUAL(v[1], 20); ASSERT_EQUAL(v[2], 20); @@ -227,19 +227,19 @@ void TestFillN(size_t n) thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1); ASSERT_EQUAL(h_data, d_data); - + begin_offset = std::min(8, n); thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2); thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2); ASSERT_EQUAL(h_data, d_data); - + begin_offset = std::min(3, n); thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3); thrust::fill_n(d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3); ASSERT_EQUAL(h_data, d_data); - + thrust::fill_n(h_data.begin(), h_data.size(), (T) 4); thrust::fill_n(d_data.begin(), d_data.size(), (T) 4); @@ -301,7 +301,7 @@ void TestFillWithTrivialAssignment(void) thrust::host_vector h(1); thrust::device_vector d(1); - + 
ASSERT_EQUAL(h[0].x, 0); ASSERT_EQUAL(h[0].y, 0); ASSERT_EQUAL(h[0].z, 0); @@ -334,6 +334,10 @@ struct TypeWithNonTrivialAssigment __host__ __device__ TypeWithNonTrivialAssigment() : x(0), y(0), z(0) {} +#if THRUST_CPP_DIALECT >= 2011 + TypeWithNonTrivialAssigment(const TypeWithNonTrivialAssigment &) = default; +#endif + __host__ __device__ TypeWithNonTrivialAssigment& operator=(const TypeWithNonTrivialAssigment& t) { @@ -342,7 +346,7 @@ struct TypeWithNonTrivialAssigment z = t.x + t.y; return *this; } - + __host__ __device__ bool operator==(const TypeWithNonTrivialAssigment& t) const { @@ -356,7 +360,7 @@ void TestFillWithNonTrivialAssignment(void) thrust::host_vector h(1); thrust::device_vector d(1); - + ASSERT_EQUAL(h[0].x, 0); ASSERT_EQUAL(h[0].y, 0); ASSERT_EQUAL(h[0].z, 0); diff --git a/testing/find.cu b/testing/find.cu index 7c91320a1..988afbeef 100644 --- a/testing/find.cu +++ b/testing/find.cu @@ -1,4 +1,5 @@ #include +#include #include #include @@ -304,3 +305,69 @@ struct TestFindIfNot }; VariableUnitTest TestFindIfNotInstance; +void TestFindWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::detail::intmax_t distance_low_value = thrust::distance( + begin, + thrust::find( + thrust::device, + begin, + end, + 17)); + + thrust::detail::intmax_t distance_high_value = thrust::distance( + begin, + thrust::find( + thrust::device, + begin, + end, + (1ll << magnitude) - 17)); + + ASSERT_EQUAL(distance_low_value, 16); + ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18); +} + +void TestFindWithBigIndexes() +{ + TestFindWithBigIndexesHelper(30); + TestFindWithBigIndexesHelper(31); + TestFindWithBigIndexesHelper(32); + TestFindWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestFindWithBigIndexes); + +namespace +{ + +class Weird +{ + int value; + +public: + __host__ __device__ Weird(int val, int) + : 
value(val) + {} + + friend __host__ __device__ + bool operator==(int x, Weird y) + { + return x == y.value; + } +}; + +} // end anon namespace + +void TestFindAsymmetricEquality() +{ // Regression test for NVIDIA/thrust#1229 + thrust::host_vector v(1000); + thrust::sequence(v.begin(), v.end()); + thrust::device_vector dv(v); + auto result = thrust::find(dv.begin(), dv.end(), Weird(333, 0)); + ASSERT_EQUAL(*result, 333); + ASSERT_EQUAL(result - dv.begin(), 333); +} +DECLARE_UNITTEST(TestFindAsymmetricEquality); diff --git a/testing/fix_clang_nvcc_11.5.h b/testing/fix_clang_nvcc_11.5.h new file mode 100644 index 000000000..279dca3f9 --- /dev/null +++ b/testing/fix_clang_nvcc_11.5.h @@ -0,0 +1,24 @@ +#pragma once + +#if defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 && \ + __CUDACC_VER_MINOR__ <= 5 + +#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__) +# pragma nv_diag_suppress 3171 +#else +# pragma diag_suppress 3171 +#endif + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wkeyword-compat" + +// Clang has a builtin called `__is_signed`. Unfortunately, libstdc++ headers +// use this name as an identifier. 
Clang has a workaround for that, it checks +// if `__is_signed` is `const static bool` as in libstdc++ headers and if so, +// disables the intrinsic for the rest of the TU: +// https://github.com/llvm/llvm-project/blob/f49b6afc231242dfee027d5da69734836097cd43/clang/lib/Parse/ParseDecl.cpp#L3552-L3566 +const static bool __is_signed = false; + +#pragma clang diagnostic pop +#endif // defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 && + // __CUDACC_VER_MINOR__ <= 5 diff --git a/testing/for_each.cu b/testing/for_each.cu index 0e9e4ef5c..8040e5f78 100644 --- a/testing/for_each.cu +++ b/testing/for_each.cu @@ -355,7 +355,7 @@ DECLARE_UNITTEST(TestForEachNWithLargeTypes); THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END -struct OnlySetWhenExpected +struct only_set_when_expected { unsigned long long expected; bool * flag; @@ -379,7 +379,7 @@ void TestForEachWithBigIndexesHelper(int magnitude) thrust::device_ptr has_executed = thrust::device_malloc(1); *has_executed = false; - OnlySetWhenExpected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) }; + only_set_when_expected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) }; thrust::for_each(thrust::device, begin, end, fn); diff --git a/testing/functional.cu b/testing/functional.cu index 3b758c9b3..1d1a79b6c 100644 --- a/testing/functional.cu +++ b/testing/functional.cu @@ -296,6 +296,19 @@ void TestNot1(void) } DECLARE_INTEGRAL_VECTOR_UNITTEST(TestNot1); + +// GCC 11 fails to build this test case with a spurious error in a +// very specific scenario: +// - GCC 11 +// - CPP system for both host and device +// - C++11 dialect +#if !(defined(THRUST_GCC_VERSION) && \ + THRUST_GCC_VERSION >= 110000 && \ + THRUST_GCC_VERSION < 120000 && \ + THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP && \ + THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP && \ + THRUST_CPP_DIALECT == 2011) + template void TestNot2(void) { @@ -321,4 +334,6 @@ void TestNot2(void) } 
DECLARE_VECTOR_UNITTEST(TestNot2); +#endif // Weird GCC11 failure case + THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END diff --git a/testing/functional_placeholders_arithmetic.cu b/testing/functional_placeholders_arithmetic.cu index 4376b46a9..8d8535aa6 100644 --- a/testing/functional_placeholders_arithmetic.cu +++ b/testing/functional_placeholders_arithmetic.cu @@ -65,8 +65,8 @@ template struct unary_plus_reference { __host__ __device__ T operator()(const T &x) const - { - return +x; + { // Static cast to undo integral promotion + return static_cast(+x); } }; diff --git a/testing/functional_placeholders_bitwise.cu b/testing/functional_placeholders_bitwise.cu index bfefb9771..7c92d967f 100644 --- a/testing/functional_placeholders_bitwise.cu +++ b/testing/functional_placeholders_bitwise.cu @@ -3,16 +3,18 @@ #include #include +#include + static const size_t num_samples = 10000; template struct rebind_vector; -// TODO: C++11: use rebind from allocator_traits template struct rebind_vector, U> { - typedef thrust::host_vector::other> type; + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::template rebind_alloc new_alloc; + typedef thrust::host_vector type; }; template @@ -22,17 +24,24 @@ template typename Allocator::template rebind::other> type; }; +template + struct rebind_vector, U> +{ + typedef thrust::universal_vector::other> type; +}; + #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \ template \ struct TestFunctionalPlaceholders##name \ { \ void operator()(const size_t) \ { \ - static const size_t num_samples = 10000; \ - const size_t zero = 0; \ + constexpr size_t NUM_SAMPLES = 10000; \ + constexpr size_t ZERO = 0; \ typedef typename Vector::value_type T; \ - Vector lhs = unittest::random_samples(num_samples); \ - Vector rhs = unittest::random_samples(num_samples); \ + Vector lhs = unittest::random_samples(NUM_SAMPLES); \ + Vector rhs = 
unittest::random_samples(NUM_SAMPLES); \ thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \ \ Vector reference(lhs.size()); \ @@ -47,7 +56,7 @@ template \ thrust::transform(lhs.begin(), lhs.end(), result.begin(), _1 op T(1)); \ ASSERT_ALMOST_EQUAL(reference, result); \ \ - thrust::transform(thrust::make_constant_iterator(1,zero), thrust::make_constant_iterator(1,num_samples), rhs.begin(), reference.begin(), reference_functor()); \ + thrust::transform(thrust::make_constant_iterator(1,ZERO), thrust::make_constant_iterator(1,NUM_SAMPLES), rhs.begin(), reference.begin(), reference_functor()); \ thrust::transform(rhs.begin(), rhs.end(), result.begin(), T(1) op _1); \ ASSERT_ALMOST_EQUAL(reference, result); \ } \ diff --git a/testing/functional_placeholders_logical.cu b/testing/functional_placeholders_logical.cu index 7fcb640fe..caca82040 100644 --- a/testing/functional_placeholders_logical.cu +++ b/testing/functional_placeholders_logical.cu @@ -2,16 +2,18 @@ #include #include +#include + static const size_t num_samples = 10000; template struct rebind_vector; -// TODO: C++11: use rebind from allocator_traits template struct rebind_vector, U> { - typedef thrust::host_vector::other> type; + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::template rebind_alloc new_alloc; + typedef thrust::host_vector type; }; template @@ -21,6 +23,13 @@ template typename Allocator::template rebind::other> type; }; +template + struct rebind_vector, U> +{ + typedef thrust::universal_vector::other> type; +}; + #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \ template \ void TestFunctionalPlaceholders##name(void) \ diff --git a/testing/functional_placeholders_relational.cu b/testing/functional_placeholders_relational.cu index 8114ef55e..7f088a1ea 100644 --- a/testing/functional_placeholders_relational.cu +++ b/testing/functional_placeholders_relational.cu @@ -2,16 +2,18 @@ #include #include +#include + static 
const size_t num_samples = 10000; template struct rebind_vector; -// TODO: C++11: use rebind from allocator_traits template struct rebind_vector, U> { - typedef thrust::host_vector::other> type; + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::template rebind_alloc new_alloc; + typedef thrust::host_vector type; }; template @@ -21,6 +23,13 @@ template typename Allocator::template rebind::other> type; }; +template + struct rebind_vector, U> +{ + typedef thrust::universal_vector::other> type; +}; + #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \ template \ void TestFunctionalPlaceholdersBinary##name(void) \ diff --git a/testing/future.cu b/testing/future.cu index 0616230c9..eb1ab582a 100644 --- a/testing/future.cu +++ b/testing/future.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include @@ -102,8 +102,6 @@ struct test_future_new_stream { auto f0 = thrust::device_future(thrust::new_stream); - auto f0_stream = f0.stream().native_handle(); - ASSERT_EQUAL(true, f0.valid_stream()); ASSERT_EQUAL(false, f0.valid_content()); diff --git a/testing/inner_product.cu b/testing/inner_product.cu index c1f77904b..4fae72e88 100644 --- a/testing/inner_product.cu +++ b/testing/inner_product.cu @@ -1,6 +1,11 @@ #include #include + +#include #include +#include +#include +#include template void TestInnerProductSimple(void) @@ -100,4 +105,69 @@ struct TestInnerProduct }; VariableUnitTest TestInnerProductInstance; +struct only_set_when_both_expected +{ + long long expected; + bool * flag; + + __device__ + long long operator()(long long x, long long y) + { + if (x == expected && y == expected) + { + *flag = true; + } + + return x == y; + } +}; + +void TestInnerProductWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + 
ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr has_executed = thrust::device_malloc(1); + *has_executed = false; + + only_set_when_both_expected fn = { (1ll << magnitude) - 1, + thrust::raw_pointer_cast(has_executed) }; + ASSERT_EQUAL(thrust::inner_product( + thrust::device, + begin, end, + begin, + 0ll, + thrust::plus(), + fn), (1ll << magnitude)); + + bool has_executed_h = *has_executed; + thrust::device_free(has_executed); + + ASSERT_EQUAL(has_executed_h, true); +} + +void TestInnerProductWithBigIndexes() +{ + TestInnerProductWithBigIndexesHelper(30); + TestInnerProductWithBigIndexesHelper(31); + TestInnerProductWithBigIndexesHelper(32); + TestInnerProductWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestInnerProductWithBigIndexes); + +void TestInnerProductPlaceholders() +{ // Regression test for NVIDIA/thrust#1178 + using namespace thrust::placeholders; + + thrust::device_vector v1(100, 1.f); + thrust::device_vector v2(100, 1.f); + + auto result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0f, + thrust::plus{}, + _1 * _2 + 1.0f); + + ASSERT_ALMOST_EQUAL(result, 200.f); +} +DECLARE_UNITTEST(TestInnerProductPlaceholders); diff --git a/testing/is_contiguous_iterator.cu b/testing/is_contiguous_iterator.cu index 63a307b7b..42a5aa663 100644 --- a/testing/is_contiguous_iterator.cu +++ b/testing/is_contiguous_iterator.cu @@ -134,3 +134,95 @@ void test_is_contiguous_iterator_vectors() } DECLARE_VECTOR_UNITTEST(test_is_contiguous_iterator_vectors); + +struct expect_pointer{}; +struct expect_passthrough{}; + +template +struct check_unwrapped_iterator +{ + using unwrapped_t = typename std::remove_reference< + decltype(thrust::detail::try_unwrap_contiguous_iterator( + std::declval()))>::type; + + static constexpr bool value = + std::is_same::value + ? 
std::is_same::value + : std::is_same::value; +}; + +template +void test_try_unwrap_contiguous_iterator() +{ + // Raw pointers should pass whether expecting pointers or passthrough. + THRUST_STATIC_ASSERT((check_unwrapped_iterator::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::value)); + + THRUST_STATIC_ASSERT((check_unwrapped_iterator, + T *, + expect_pointer>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator, + T const *, + expect_pointer>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_pointer>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::reverse_iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_pointer>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T const *, + expect_pointer>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + std::pair *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + std::pair *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + std::pair *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator::iterator, + std::pair *, + 
expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator, + T *, + expect_passthrough>::value)); + THRUST_STATIC_ASSERT((check_unwrapped_iterator, + void, + expect_passthrough>::value)); +} +DECLARE_GENERIC_UNITTEST(test_try_unwrap_contiguous_iterator); diff --git a/testing/max_element.cu b/testing/max_element.cu index e73275c63..456239264 100644 --- a/testing/max_element.cu +++ b/testing/max_element.cu @@ -105,3 +105,20 @@ void TestMaxElementDispatchImplicit() } DECLARE_UNITTEST(TestMaxElementDispatchImplicit); +void TestMaxElementWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + ASSERT_EQUAL(*thrust::max_element(thrust::device, begin, end), (1ll << magnitude)); +} + +void TestMaxElementWithBigIndexes() +{ + TestMaxElementWithBigIndexesHelper(30); + TestMaxElementWithBigIndexesHelper(31); + TestMaxElementWithBigIndexesHelper(32); + TestMaxElementWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestMaxElementWithBigIndexes); diff --git a/testing/memory.cu b/testing/memory.cu index fde4a16be..e4c1da8f6 100644 --- a/testing/memory.cu +++ b/testing/memory.cu @@ -46,6 +46,68 @@ class my_memory_system : public thrust::device_execution_policy +{ +}; + +template +thrust::pair, std::ptrdiff_t> +get_temporary_buffer(my_old_temporary_allocation_system, std::ptrdiff_t) +{ + thrust::pointer const + result(reinterpret_cast(4217)); + + return thrust::make_pair(result, 314); +} + +template +void return_temporary_buffer(my_old_temporary_allocation_system, Pointer p) +{ + typedef typename thrust::detail::pointer_traits::raw_pointer RP; + ASSERT_EQUAL(p.get(), reinterpret_cast(4217)); +} + +} // my_old_namespace + +namespace my_new_namespace +{ + +struct my_new_temporary_allocation_system + : public thrust::device_execution_policy +{ +}; + +template +thrust::pair, std::ptrdiff_t> 
+get_temporary_buffer(my_new_temporary_allocation_system, std::ptrdiff_t) +{ + thrust::pointer const + result(reinterpret_cast(1742)); + + return thrust::make_pair(result, 413); +} + +template +void return_temporary_buffer(my_new_temporary_allocation_system, Pointer) +{ + // This should never be called (the three-argument with size overload below + // should be preferred) and shouldn't be ambiguous. + ASSERT_EQUAL(true, false); +} + +template +void return_temporary_buffer(my_new_temporary_allocation_system, Pointer p, std::ptrdiff_t n) +{ + typedef typename thrust::detail::pointer_traits::raw_pointer RP; + ASSERT_EQUAL(p.get(), reinterpret_cast(1742)); + ASSERT_EQUAL(n, 413); +} + +} // my_new_namespace template bool are_same(const T1 &, const T2 &) @@ -119,7 +181,7 @@ void TestGetTemporaryBuffer() ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val)); - thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first); + thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first, ptr_and_sz.second); } DECLARE_UNITTEST(TestGetTemporaryBuffer); @@ -198,11 +260,6 @@ template void TestGetTemporaryBufferDispatchExplicit() { -#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400) - // gcc 4.2 does not do adl correctly for get_temporary_buffer - // gcc 4.3 does not do adl correctly for malloc - KNOWN_FAILURE; -#else const std::ptrdiff_t n = 9001; my_memory_system sys(0); @@ -219,8 +276,7 @@ void TestGetTemporaryBufferDispatchExplicit() ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val)); - thrust::return_temporary_buffer(sys, ptr_and_sz.first); -#endif + thrust::return_temporary_buffer(sys, ptr_and_sz.first, ptr_and_sz.second); } DECLARE_UNITTEST(TestGetTemporaryBufferDispatchExplicit); @@ -234,11 +290,6 @@ void TestGetTemporaryBufferDispatchImplicit() } else { -#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400) - // gcc 4.2 does not do adl 
correctly for get_temporary_buffer - // gcc 4.3 does not do adl correctly for malloc - KNOWN_FAILURE; -#else thrust::device_vector vec(9001); thrust::sequence(vec.begin(), vec.end()); @@ -250,8 +301,48 @@ void TestGetTemporaryBufferDispatchImplicit() ASSERT_EQUAL(true, thrust::is_sorted(vec.begin(), vec.end())); ASSERT_EQUAL(true, sys.is_valid()); -#endif } } DECLARE_UNITTEST(TestGetTemporaryBufferDispatchImplicit); + +void TestTemporaryBufferOldCustomization() +{ + typedef my_old_namespace::my_old_temporary_allocation_system system; + typedef thrust::pointer pointer; + typedef thrust::pair pointer_and_size; + + system sys; + + { + pointer_and_size ps = thrust::get_temporary_buffer(sys, 0); + + // The magic values are defined in `my_old_namespace` above. + ASSERT_EQUAL(ps.first.get(), reinterpret_cast(4217)); + ASSERT_EQUAL(ps.second, 314); + + thrust::return_temporary_buffer(sys, ps.first, ps.second); + } +} +DECLARE_UNITTEST(TestTemporaryBufferOldCustomization); + + +void TestTemporaryBufferNewCustomization() +{ + typedef my_new_namespace::my_new_temporary_allocation_system system; + typedef thrust::pointer pointer; + typedef thrust::pair pointer_and_size; + + system sys; + + { + pointer_and_size ps = thrust::get_temporary_buffer(sys, 0); + + // The magic values are defined in `my_new_namespace` above. 
+ ASSERT_EQUAL(ps.first.get(), reinterpret_cast(1742)); + ASSERT_EQUAL(ps.second, 413); + + thrust::return_temporary_buffer(sys, ps.first, ps.second); + } +} +DECLARE_UNITTEST(TestTemporaryBufferNewCustomization); diff --git a/testing/min_element.cu b/testing/min_element.cu index ec9a4a2e1..81fedbdab 100644 --- a/testing/min_element.cu +++ b/testing/min_element.cu @@ -103,3 +103,22 @@ void TestMinElementDispatchImplicit() } DECLARE_UNITTEST(TestMinElementDispatchImplicit); +void TestMinElementWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(1); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + ASSERT_EQUAL( + *thrust::min_element(thrust::device, begin, end, thrust::greater()), + (1ll << magnitude)); +} + +void TestMinElementWithBigIndexes() +{ + TestMinElementWithBigIndexesHelper(30); + TestMinElementWithBigIndexesHelper(31); + TestMinElementWithBigIndexesHelper(32); + TestMinElementWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestMinElementWithBigIndexes); diff --git a/testing/minmax_element.cu b/testing/minmax_element.cu index 3a91b4ad2..4a87f5bb4 100644 --- a/testing/minmax_element.cu +++ b/testing/minmax_element.cu @@ -110,3 +110,29 @@ void TestMinMaxElementDispatchImplicit() } DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit); +void TestMinMaxElementWithBigIndexesHelper(int magnitude) +{ + typedef thrust::counting_iterator Iter; + Iter begin(1); + Iter end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::pair result = thrust::minmax_element( + thrust::device, begin, end); + ASSERT_EQUAL(*result.first, 1); + ASSERT_EQUAL(*result.second, (1ll << magnitude)); + + result = thrust::minmax_element(thrust::device, begin, end, + thrust::greater()); + ASSERT_EQUAL(*result.second, 1); + ASSERT_EQUAL(*result.first, (1ll << magnitude)); +} + +void TestMinMaxElementWithBigIndexes() +{ + 
TestMinMaxElementWithBigIndexesHelper(30); + TestMinMaxElementWithBigIndexesHelper(31); + TestMinMaxElementWithBigIndexesHelper(32); + TestMinMaxElementWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes); diff --git a/testing/mr_disjoint_pool.cu b/testing/mr_disjoint_pool.cu index 883250671..69a6005ec 100644 --- a/testing/mr_disjoint_pool.cu +++ b/testing/mr_disjoint_pool.cu @@ -1,8 +1,10 @@ #include + +#include #include #include -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 #include #endif @@ -19,18 +21,19 @@ struct alloc_id return id == other.id && size == other.size && alignment == other.alignment; } - alloc_id operator+(std::size_t size) const + alloc_id operator+(std::size_t size_) const { alloc_id ret; ret.id = id; - ret.size = size; + ret.size = size_; ret.alignment = alignment; - ret.offset = size; + ret.offset = size_; return ret; } }; -namespace thrust { namespace detail { +THRUST_NAMESPACE_BEGIN +namespace detail { template<> struct pointer_traits { @@ -46,9 +49,12 @@ struct pointer_traits return reinterpret_cast(id.alignment); } }; -}} -class dummy_resource THRUST_FINAL : public thrust::mr::memory_resource +} // end namespace detail + +THRUST_NAMESPACE_END + +class dummy_resource final : public thrust::mr::memory_resource { public: dummy_resource() : id_to_allocate(0), id_to_deallocate(0) @@ -61,7 +67,7 @@ public: ASSERT_EQUAL(id_to_deallocate, 0u); } - virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE + virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) override { ASSERT_EQUAL(static_cast(id_to_allocate), true); @@ -75,7 +81,7 @@ public: return ret; } - virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE + virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) override { ASSERT_EQUAL(p.size, bytes); ASSERT_EQUAL(p.alignment, alignment); @@ -177,7 +183,7 @@ void 
TestDisjointUnsynchronizedPool() } DECLARE_UNITTEST(TestDisjointUnsynchronizedPool); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestDisjointSynchronizedPool() { TestDisjointPool(); @@ -260,7 +266,7 @@ void TestDisjointUnsynchronizedPoolCachingOversized() } DECLARE_UNITTEST(TestDisjointUnsynchronizedPoolCachingOversized); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestDisjointSynchronizedPoolCachingOversized() { TestDisjointPoolCachingOversized(); @@ -285,7 +291,7 @@ void TestUnsynchronizedDisjointGlobalPool() } DECLARE_UNITTEST(TestUnsynchronizedDisjointGlobalPool); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestSynchronizedDisjointGlobalPool() { TestDisjointGlobalPool(); diff --git a/testing/mr_new.cu b/testing/mr_new.cu index df0f3fde5..02f34eccf 100644 --- a/testing/mr_new.cu +++ b/testing/mr_new.cu @@ -9,7 +9,7 @@ void TestAlignment(MemoryResource memres, std::size_t size, std::size_t alignmen ASSERT_EQUAL(reinterpret_cast(ptr) % alignment, 0u); char * char_ptr = reinterpret_cast(ptr); - thrust::fill(char_ptr, char_ptr + size, 0); + thrust::fill(char_ptr, char_ptr + size, char{}); memres.do_deallocate(ptr, size, alignment); } diff --git a/testing/mr_pool.cu b/testing/mr_pool.cu index bd91c04ea..30c1f18a4 100644 --- a/testing/mr_pool.cu +++ b/testing/mr_pool.cu @@ -1,8 +1,10 @@ #include + +#include #include #include -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 #include #endif @@ -106,7 +108,7 @@ struct tracked_pointer : thrust::iterator_facade< } }; -class tracked_resource THRUST_FINAL : public thrust::mr::memory_resource > +class tracked_resource final : public thrust::mr::memory_resource > { public: tracked_resource() : id_to_allocate(0), id_to_deallocate(0) @@ -119,7 +121,7 @@ public: ASSERT_EQUAL(id_to_deallocate, 0u); } - virtual tracked_pointer do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE + virtual tracked_pointer 
do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override { ASSERT_EQUAL(static_cast(id_to_allocate), true); @@ -134,7 +136,7 @@ public: return ret; } - virtual void do_deallocate(tracked_pointer p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE + virtual void do_deallocate(tracked_pointer p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override { ASSERT_EQUAL(p.size, n); ASSERT_EQUAL(p.alignment, alignment); @@ -241,7 +243,7 @@ void TestUnsynchronizedPool() } DECLARE_UNITTEST(TestUnsynchronizedPool); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestSynchronizedPool() { TestPool(); @@ -324,7 +326,7 @@ void TestUnsynchronizedPoolCachingOversized() } DECLARE_UNITTEST(TestUnsynchronizedPoolCachingOversized); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestSynchronizedPoolCachingOversized() { TestPoolCachingOversized(); @@ -348,7 +350,7 @@ void TestUnsynchronizedGlobalPool() } DECLARE_UNITTEST(TestUnsynchronizedGlobalPool); -#if __cplusplus >= 201103L +#if THRUST_CPP_DIALECT >= 2011 void TestSynchronizedGlobalPool() { TestGlobalPool(); diff --git a/testing/namespace_wrapped.cu b/testing/namespace_wrapped.cu new file mode 100644 index 000000000..b6bcb3dbb --- /dev/null +++ b/testing/namespace_wrapped.cu @@ -0,0 +1,43 @@ +// Wrap thrust and cub in different enclosing namespaces +// (In practice, you probably want these to be the same, in which case just +// set THRUST_CUB_WRAPPED_NAMESPACE to set both). +#define THRUST_WRAPPED_NAMESPACE wrap_thrust +#define CUB_WRAPPED_NAMESPACE wrap_cub + +#include +#include +#include +#include +#include + +#include + +// Test that we can use a few common utilities and algorithms from a wrapped +// namespace at runtime. More extensive testing is performed by the header +// tests and the check_namespace.cmake test. 
+void TestWrappedNamespace() +{ + const std::size_t n = 2048; + + const auto in_1_begin = + ::wrap_thrust::thrust::make_constant_iterator(12); + const auto in_2_begin = + ::wrap_thrust::thrust::make_counting_iterator(1024); + + // Check that the qualifier resolves properly: + THRUST_NS_QUALIFIER::device_vector d_out(n); + + ::wrap_thrust::thrust::transform(in_1_begin, + in_1_begin + n, + in_2_begin, + d_out.begin(), + ::wrap_thrust::thrust::plus<>{}); + + ::wrap_thrust::thrust::host_vector h_out(d_out); + + for (std::size_t i = 0; i < n; ++i) + { + ASSERT_EQUAL(h_out[i], static_cast(i) + 1024 + 12); + } +} +DECLARE_UNITTEST(TestWrappedNamespace); diff --git a/testing/omp/CMakeLists.txt b/testing/omp/CMakeLists.txt new file mode 100644 index 000000000..89ea9bb0c --- /dev/null +++ b/testing/omp/CMakeLists.txt @@ -0,0 +1,18 @@ +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (NOT config_device STREQUAL "OMP") + continue() + endif() + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name "omp.") + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + endforeach() +endforeach() diff --git a/testing/out_of_memory_recovery.cu b/testing/out_of_memory_recovery.cu new file mode 100644 index 000000000..5e4f0c327 --- /dev/null +++ b/testing/out_of_memory_recovery.cu @@ -0,0 +1,33 @@ +// Regression test for NVBug 2720132. +// +// Summary of 2720132: +// +// 1. The large allocation fails due to running out of memory. +// 2. A `thrust::system::system_error` exception is thrown. +// 3. Local objects are destroyed as the stack is unwound, leading to the destruction of `x`. +// 4. `x` runs a parallel algorithm in its destructor to call the destructors of all of its elements. +// 5. 
Launching that parallel algorithm fails because of the prior CUDA out of memory error. +// 6. A `thrust::system::system_error` exception is thrown. +// 7. Because we've already got an active exception, `terminate` is called. + +#include +#include +#include + +struct non_trivial +{ + __host__ __device__ non_trivial() {} + __host__ __device__ ~non_trivial() {} +}; + +void test_out_of_memory_recovery() +{ + try + { + thrust::device_vector x(1); + + thrust::device_vector y(0x00ffffffffffffff); + } + catch (...) { } +} +DECLARE_UNITTEST(test_out_of_memory_recovery); diff --git a/testing/pair.cu b/testing/pair.cu index a213265f3..f5f6e92b5 100644 --- a/testing/pair.cu +++ b/testing/pair.cu @@ -213,22 +213,42 @@ struct TestPairGet }; SimpleUnitTest TestPairGetInstance; +using PairConstVolatileTypes = + unittest::type_list, thrust::pair const, + thrust::pair const volatile>; -void TestPairTupleSize(void) +template +struct TestPairTupleSize { - int result = thrust::tuple_size< thrust::pair >::value; - ASSERT_EQUAL(2, result); + void operator()() + { + ASSERT_EQUAL(2, static_cast(thrust::tuple_size::value)); + } }; -DECLARE_UNITTEST(TestPairTupleSize); +SimpleUnitTest TestPairTupleSizeInstance; void TestPairTupleElement(void) { - typedef thrust::tuple_element<0, thrust::pair >::type type0; - typedef thrust::tuple_element<1, thrust::pair >::type type1; - - ASSERT_EQUAL_QUIET(typeid(int), typeid(type0)); - ASSERT_EQUAL_QUIET(typeid(float), typeid(type1)); + using type0 = thrust::tuple_element<0, thrust::pair >::type; + using type1 = thrust::tuple_element<1, thrust::pair >::type; + static_assert(std::is_same::value,""); + static_assert(std::is_same::value,""); + + using c_type0 = thrust::tuple_element<0, thrust::pair const>::type; + using c_type1 = thrust::tuple_element<1, thrust::pair const>::type; + static_assert(std::is_same::value,""); + static_assert(std::is_same::value,""); + + using v_type0 = thrust::tuple_element<0, thrust::pair volatile>::type; + using v_type1 = 
thrust::tuple_element<1, thrust::pair volatile>::type; + static_assert(std::is_same::value,""); + static_assert(std::is_same::value,""); + + using cv_type0 = thrust::tuple_element<0, thrust::pair const volatile>::type; + using cv_type1 = thrust::tuple_element<1, thrust::pair const volatile>::type; + static_assert(std::is_same::value,""); + static_assert(std::is_same::value,""); }; DECLARE_UNITTEST(TestPairTupleElement); diff --git a/testing/pair_reduce.cu b/testing/pair_reduce.cu index ebdab6597..6682fb3cc 100644 --- a/testing/pair_reduce.cu +++ b/testing/pair_reduce.cu @@ -20,7 +20,11 @@ struct add_pairs __host__ __device__ Pair1 operator()(const Pair1 &x, const Pair2 &y) { - return thrust::make_pair(x.first + y.first, x.second + y.second); + // Need cast to undo integer promotion, decltype(char{} + char{}) == int + using P1T1 = typename Pair1::first_type; + using P1T2 = typename Pair1::second_type; + return thrust::make_pair(static_cast(x.first + y.first), + static_cast(x.second + y.second)); } // end operator() }; // end add_pairs @@ -43,7 +47,7 @@ template thrust::device_vector d_p2 = h_p2; thrust::device_vector

d_pairs = h_pairs; - P init = thrust::make_pair(13,13); + P init = thrust::make_pair(T{13}, T{13}); // reduce on the host P h_result = thrust::reduce(h_pairs.begin(), h_pairs.end(), init, add_pairs()); diff --git a/testing/pair_scan.cu b/testing/pair_scan.cu index b1bfe064b..5554c6dc4 100644 --- a/testing/pair_scan.cu +++ b/testing/pair_scan.cu @@ -61,19 +61,6 @@ template thrust::inclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), thrust::maximum

()); ASSERT_EQUAL_QUIET(h_output, d_output); - - // The tests below get miscompiled on Tesla hw for 8b types - -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif - // scan with plus thrust::exclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), init, add_pairs()); thrust::exclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), init, add_pairs()); diff --git a/testing/pair_scan_by_key.cu b/testing/pair_scan_by_key.cu index 6e63bc806..21b53bcbe 100644 --- a/testing/pair_scan_by_key.cu +++ b/testing/pair_scan_by_key.cu @@ -20,7 +20,11 @@ struct add_pairs __host__ __device__ Pair1 operator()(const Pair1 &x, const Pair2 &y) { - return thrust::make_pair(x.first + y.first, x.second + y.second); + // Need cast to undo integer promotion, decltype(char{} + char{}) == int + using P1T1 = typename Pair1::first_type; + using P1T2 = typename Pair1::second_type; + return thrust::make_pair(static_cast(x.first + y.first), + static_cast(x.second + y.second)); } // end operator() }; // end add_pairs @@ -46,7 +50,7 @@ template thrust::host_vector h_keys = unittest::random_integers(n); thrust::device_vector d_keys = h_keys; - P init = thrust::make_pair(13,13); + P init = thrust::make_pair(T{13}, T{13}); // scan on the host thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_pairs.begin(), h_pairs.begin(), init, thrust::equal_to(), add_pairs()); diff --git a/testing/partition.cu b/testing/partition.cu index 742560f59..31aaa9fdd 100644 --- a/testing/partition.cu +++ b/testing/partition.cu @@ -6,6 +6,12 @@ #include #include +#if defined(THRUST_GCC_VERSION) && \ + THRUST_GCC_VERSION >= 110000 && \ + THRUST_GCC_VERSION < 120000 +#define WAIVE_GCC11_FAILURES +#endif + template struct is_even { @@ -21,6 +27,17 @@ void 
TestPartitionSimple(void) typedef typename Vector::value_type T; typedef typename Vector::iterator Iterator; + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (sizeof(T) == 4) + { + return; + } +#endif + Vector data(5); data[0] = 1; data[1] = 2; @@ -321,6 +338,17 @@ struct TestPartitionStencil { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::host_vector h_stencil = unittest::random_integers(n); @@ -684,6 +712,9 @@ struct TestPartitionCopyStencilToDiscardIterator VariableUnitTest TestPartitionCopyStencilToDiscardIteratorInstance; +// GCC 11 miscompiles and segfaults in this tests. +#ifndef WAIVE_GCC11_FAILURES + template struct TestStablePartition { @@ -702,6 +733,11 @@ struct TestStablePartition }; VariableUnitTest TestStablePartitionInstance; +#endif // WAIVE_GCC11_FAILURES + + +// GCC 11 miscompiles and segfaults in this tests. 
+#ifndef WAIVE_GCC11_FAILURES template struct TestStablePartitionStencil @@ -723,6 +759,8 @@ struct TestStablePartitionStencil }; VariableUnitTest TestStablePartitionStencilInstance; +#endif // WAIVE_GCC11_FAILURES + template struct TestStablePartitionCopy diff --git a/testing/partition_point.cu b/testing/partition_point.cu index d93aeac27..bd5a6a8c8 100644 --- a/testing/partition_point.cu +++ b/testing/partition_point.cu @@ -95,3 +95,39 @@ void TestPartitionPointDispatchImplicit() } DECLARE_UNITTEST(TestPartitionPointDispatchImplicit); +struct test_less_than +{ + long long expected; + + __device__ + bool operator()(long long y) + { + return y < expected; + } +}; + +void TestPartitionPointWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(0); + thrust::counting_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + test_less_than fn = { (1ll << magnitude) - 17 }; + + ASSERT_EQUAL(thrust::distance( + begin, + thrust::partition_point( + thrust::device, + begin, end, + fn)), + (1ll << magnitude) - 17); +} + +void TestPartitionPointWithBigIndexes() +{ + TestPartitionPointWithBigIndexesHelper(30); + TestPartitionPointWithBigIndexesHelper(31); + TestPartitionPointWithBigIndexesHelper(32); + TestPartitionPointWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestPartitionPointWithBigIndexes); diff --git a/testing/permutation_iterator.cu b/testing/permutation_iterator.cu index 94f5857c4..22fef650c 100644 --- a/testing/permutation_iterator.cu +++ b/testing/permutation_iterator.cu @@ -279,17 +279,20 @@ DECLARE_UNITTEST(TestPermutationIteratorHostDeviceScatter); template void TestPermutationIteratorWithCountingIterator(void) { - typedef typename Vector::value_type T; + using T = typename Vector::value_type; + using diff_t = typename thrust::counting_iterator::difference_type; - typename thrust::counting_iterator input(0), index(0); + thrust::counting_iterator input(0), index(0); // test copy() { Vector 
output(4,0); - thrust::copy(thrust::make_permutation_iterator(input, index), - thrust::make_permutation_iterator(input, index + output.size()), - output.begin()); + auto first = thrust::make_permutation_iterator(input, index); + auto last = thrust::make_permutation_iterator(input, + index + static_cast(output.size())); + + thrust::copy(first, last, output.begin()); ASSERT_EQUAL(output[0], 0); ASSERT_EQUAL(output[1], 1); diff --git a/testing/reduce.cu b/testing/reduce.cu index d9daeee03..cb08bc889 100644 --- a/testing/reduce.cu +++ b/testing/reduce.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -210,3 +211,22 @@ template } DECLARE_GENERIC_UNITTEST(TestReduceCountingIterator); +void TestReduceWithBigIndexesHelper(int magnitude) +{ + thrust::constant_iterator begin(1); + thrust::constant_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + long long result = thrust::reduce(thrust::device, begin, end); + + ASSERT_EQUAL(result, 1ll << magnitude); +} + +void TestReduceWithBigIndexes() +{ + TestReduceWithBigIndexesHelper(30); + TestReduceWithBigIndexesHelper(31); + TestReduceWithBigIndexesHelper(32); + TestReduceWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestReduceWithBigIndexes); diff --git a/testing/reduce_large.cu b/testing/reduce_large.cu index cfe2d0973..170895ccc 100644 --- a/testing/reduce_large.cu +++ b/testing/reduce_large.cu @@ -10,12 +10,14 @@ void _TestReduceWithLargeTypes(void) thrust::host_vector< FixedVector > h_data(n); for(size_t i = 0; i < h_data.size(); i++) - h_data[i] = FixedVector(i); + { + h_data[i] = FixedVector(static_cast(i)); + } thrust::device_vector< FixedVector > d_data = h_data; - FixedVector h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector(0)); - FixedVector d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector(0)); + FixedVector h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector(T{0})); + FixedVector 
d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector(T{0})); ASSERT_EQUAL_QUIET(h_result, d_result); } diff --git a/testing/regression/CMakeLists.txt b/testing/regression/CMakeLists.txt new file mode 100644 index 000000000..eea8b3a45 --- /dev/null +++ b/testing/regression/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Disabled as these test names are too long for CMAKE_OBJECT_PATH_MAX. +# We should integrate these with the other unit tests. +# See issue #1205. +# +return() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name "regression.") + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + endforeach() +endforeach() diff --git a/testing/remove.cu b/testing/remove.cu index 39adec1af..95b679dc7 100644 --- a/testing/remove.cu +++ b/testing/remove.cu @@ -30,14 +30,14 @@ void TestRemoveSimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; - typename Vector::iterator end = thrust::remove(data.begin(), - data.end(), + typename Vector::iterator end = thrust::remove(data.begin(), + data.end(), (T) 2); ASSERT_EQUAL(end - data.begin(), 3); @@ -102,17 +102,17 @@ void TestRemoveCopySimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; Vector result(5); - typename Vector::iterator end = thrust::remove_copy(data.begin(), - data.end(), - result.begin(), + typename Vector::iterator end = thrust::remove_copy(data.begin(), + data.end(), + result.begin(), (T) 2); ASSERT_EQUAL(end - result.begin(), 3); @@ -186,14 +186,14 @@ void 
TestRemoveIfSimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; - typename Vector::iterator end = thrust::remove_if(data.begin(), - data.end(), + typename Vector::iterator end = thrust::remove_if(data.begin(), + data.end(), is_even()); ASSERT_EQUAL(end - data.begin(), 3); @@ -258,11 +258,11 @@ void TestRemoveIfStencilSimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; Vector stencil(5); stencil[0] = 0; @@ -271,7 +271,7 @@ void TestRemoveIfStencilSimple(void) stencil[3] = 0; stencil[4] = 1; - typename Vector::iterator end = thrust::remove_if(data.begin(), + typename Vector::iterator end = thrust::remove_if(data.begin(), data.end(), stencil.begin(), thrust::identity()); @@ -347,17 +347,17 @@ void TestRemoveCopyIfSimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; Vector result(5); - typename Vector::iterator end = thrust::remove_copy_if(data.begin(), - data.end(), - result.begin(), + typename Vector::iterator end = thrust::remove_copy_if(data.begin(), + data.end(), + result.begin(), is_even()); ASSERT_EQUAL(end - result.begin(), 3); @@ -431,11 +431,11 @@ void TestRemoveCopyIfStencilSimple(void) typedef typename Vector::value_type T; Vector data(5); - data[0] = 1; - data[1] = 2; + data[0] = 1; + data[1] = 2; data[2] = 1; - data[3] = 3; - data[4] = 2; + data[3] = 3; + data[4] = 2; Vector stencil(5); stencil[0] = 0; @@ -446,10 +446,10 @@ void TestRemoveCopyIfStencilSimple(void) Vector result(5); - typename Vector::iterator end = thrust::remove_copy_if(data.begin(), - data.end(), + typename Vector::iterator end = 
thrust::remove_copy_if(data.begin(), + data.end(), stencil.begin(), - result.begin(), + result.begin(), thrust::identity()); ASSERT_EQUAL(end - result.begin(), 3); @@ -531,7 +531,7 @@ void TestRemove(const size_t n) size_t h_size = thrust::remove(h_data.begin(), h_data.end(), T(0)) - h_data.begin(); size_t d_size = thrust::remove(d_data.begin(), d_data.end(), T(0)) - d_data.begin(); - + ASSERT_EQUAL(h_size, d_size); h_data.resize(h_size); @@ -550,7 +550,7 @@ void TestRemoveIf(const size_t n) size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true()) - h_data.begin(); size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), is_true()) - d_data.begin(); - + ASSERT_EQUAL(h_size, d_size); h_data.resize(h_size); @@ -569,10 +569,10 @@ void TestRemoveIfStencil(const size_t n) thrust::host_vector h_stencil = unittest::random_integers(n); thrust::device_vector d_stencil = h_stencil; - + size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true()) - h_data.begin(); size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), d_stencil.begin(), is_true()) - d_data.begin(); - + ASSERT_EQUAL(h_size, d_size); h_data.resize(h_size); @@ -588,13 +588,13 @@ void TestRemoveCopy(const size_t n) { thrust::host_vector h_data = unittest::random_samples(n); thrust::device_vector d_data = h_data; - + thrust::host_vector h_result(n); thrust::device_vector d_result(n); size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), T(0)) - h_result.begin(); size_t d_size = thrust::remove_copy(d_data.begin(), d_data.end(), d_result.begin(), T(0)) - d_result.begin(); - + ASSERT_EQUAL(h_size, d_size); h_result.resize(h_size); @@ -621,7 +621,7 @@ void TestRemoveCopyToDiscardIterator(const size_t n) thrust::remove_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), T(0)); thrust::discard_iterator<> reference(num_nonzeros); - + ASSERT_EQUAL_QUIET(reference, h_result); ASSERT_EQUAL_QUIET(reference, 
d_result); } @@ -659,7 +659,7 @@ void TestRemoveCopyToDiscardIteratorZipped(const size_t n) thrust::make_tuple(T(0),T(0))); thrust::discard_iterator<> reference(num_nonzeros); - + ASSERT_EQUAL(h_output, d_output); ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple())); ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple())); @@ -675,10 +675,10 @@ void TestRemoveCopyIf(const size_t n) thrust::host_vector h_result(n); thrust::device_vector d_result(n); - + size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true()) - h_result.begin(); size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true()) - d_result.begin(); - + ASSERT_EQUAL(h_size, d_size); h_result.resize(h_size); @@ -716,16 +716,16 @@ void TestRemoveCopyIfStencil(const size_t n) { thrust::host_vector h_data = unittest::random_samples(n); thrust::device_vector d_data = h_data; - + thrust::host_vector h_stencil = unittest::random_integers(n); thrust::device_vector d_stencil = h_stencil; - + thrust::host_vector h_result(n); thrust::device_vector d_result(n); size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true()) - h_result.begin(); size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true()) - d_result.begin(); - + ASSERT_EQUAL(h_size, d_size); h_result.resize(h_size); @@ -741,7 +741,7 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n) { thrust::host_vector h_data = unittest::random_samples(n); thrust::device_vector d_data = h_data; - + thrust::host_vector h_stencil = unittest::random_integers(n); thrust::device_vector d_stencil = h_stencil; @@ -759,4 +759,3 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n) ASSERT_EQUAL_QUIET(reference, d_result); } DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfStencilToDiscardIterator); - diff --git a/testing/replace.cu 
b/testing/replace.cu index 31e9890bb..9ba33ddde 100644 --- a/testing/replace.cu +++ b/testing/replace.cu @@ -603,8 +603,8 @@ void TestReplaceCopyIf(const size_t n) thrust::host_vector h_dest(n); thrust::device_vector d_dest(n); - thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five(), 0); - thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five(), 0); + thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five(), T{0}); + thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five(), T{0}); ASSERT_ALMOST_EQUAL(h_data, d_data); ASSERT_ALMOST_EQUAL(h_dest, d_dest); @@ -619,10 +619,10 @@ void TestReplaceCopyIfToDiscardIterator(const size_t n) thrust::device_vector d_data = h_data; thrust::discard_iterator<> h_result = - thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five(), 0); + thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five(), T{0}); thrust::discard_iterator<> d_result = - thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five(), 0); + thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five(), T{0}); thrust::discard_iterator<> reference(n); @@ -643,8 +643,8 @@ void TestReplaceCopyIfStencil(const size_t n) thrust::host_vector h_dest(n); thrust::device_vector d_dest(n); - thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five(), 0); - thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five(), 0); + thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five(), T{0}); + thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five(), T{0}); ASSERT_ALMOST_EQUAL(h_data, d_data); 
ASSERT_ALMOST_EQUAL(h_dest, d_dest); @@ -661,10 +661,10 @@ void TestReplaceCopyIfStencilToDiscardIterator(const size_t n) thrust::device_vector d_stencil = h_stencil; thrust::discard_iterator<> h_result = - thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five(), 0); + thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five(), T{0}); thrust::discard_iterator<> d_result = - thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five(), 0); + thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five(), T{0}); thrust::discard_iterator<> reference(n); diff --git a/testing/reverse.cu b/testing/reverse.cu index b04e446dc..1ea4b9b38 100644 --- a/testing/reverse.cu +++ b/testing/reverse.cu @@ -73,6 +73,16 @@ DECLARE_UNITTEST(TestReverseDispatchImplicit); template void TestReverseCopySimple(void) { +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && \ + THRUST_GCC_VERSION >= 80000 && THRUST_GCC_VERSION < 100000 + + if (typeid(Vector) == typeid(thrust::host_vector)) + { + KNOWN_FAILURE // WAR NVBug 2481122 + } + +#endif + typedef typename Vector::iterator Iterator; Vector input(5); diff --git a/testing/scan.cu b/testing/scan.cu index 875ed46a9..bceac4038 100644 --- a/testing/scan.cu +++ b/testing/scan.cu @@ -1,8 +1,14 @@ #include + +#include + #include #include #include +#include #include +#include +#include template @@ -20,6 +26,17 @@ template void TestScanSimple(void) { typedef typename Vector::value_type T; + + // icc miscompiles the intermediate sum updates for custom_numeric. + // The issue doesn't happen with opts disabled, or on other compilers. + // Printing the intermediate sum each iteration "fixes" the issue, + // so likely a bad optimization. 
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL + if (std::is_same::value) + { + return; + } +#endif typename Vector::iterator iter; @@ -247,48 +264,49 @@ void TestScanMixedTypes(void) IntVector int_output(4); FloatVector float_output(4); - - // float -> int should use using plus operator by default + + // float -> int should use plus operator and float accumulator by default thrust::inclusive_scan(float_input.begin(), float_input.end(), int_output.begin()); - ASSERT_EQUAL(int_output[0], 1); - ASSERT_EQUAL(int_output[1], 3); - ASSERT_EQUAL(int_output[2], 6); - ASSERT_EQUAL(int_output[3], 10); - - // float -> float with plus operator (int accumulator) + ASSERT_EQUAL(int_output[0], 1); // in: 1.5 accum: 1.5f out: 1 + ASSERT_EQUAL(int_output[1], 4); // in: 2.5 accum: 4.0f out: 4 + ASSERT_EQUAL(int_output[2], 7); // in: 3.5 accum: 7.5f out: 7 + ASSERT_EQUAL(int_output[3], 12); // in: 4.5 accum: 12.f out: 12 + + // float -> float with plus operator (float accumulator) thrust::inclusive_scan(float_input.begin(), float_input.end(), float_output.begin(), thrust::plus()); - ASSERT_EQUAL(float_output[0], 1.5); - ASSERT_EQUAL(float_output[1], 3.0); - ASSERT_EQUAL(float_output[2], 6.0); - ASSERT_EQUAL(float_output[3], 10.0); - - // float -> int should use using plus operator by default + ASSERT_EQUAL(float_output[0], 1.5f); // in: 1.5 accum: 1.5f out: 1.5f + ASSERT_EQUAL(float_output[1], 3.0f); // in: 2.5 accum: 3.0f out: 3.0f + ASSERT_EQUAL(float_output[2], 6.0f); // in: 3.5 accum: 6.0f out: 6.0f + ASSERT_EQUAL(float_output[3], 10.0f); // in: 4.5 accum: 10.f out: 10.f + + // float -> int should use plus operator and float accumulator by default thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin()); - ASSERT_EQUAL(int_output[0], 0); - ASSERT_EQUAL(int_output[1], 1); - ASSERT_EQUAL(int_output[2], 3); - ASSERT_EQUAL(int_output[3], 6); - - // float -> int should use using plus operator by default + ASSERT_EQUAL(int_output[0], 0); // out: 0.0f 
in: 1.5 accum: 1.5f + ASSERT_EQUAL(int_output[1], 1); // out: 1.5f in: 2.5 accum: 4.0f + ASSERT_EQUAL(int_output[2], 4); // out: 4.0f in: 3.5 accum: 7.5f + ASSERT_EQUAL(int_output[3], 7); // out: 7.5f in: 4.5 accum: 12.f + + // float -> int should use plus<> operator and float accumulator by default thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin(), (float) 5.5); - ASSERT_EQUAL(int_output[0], 5); - ASSERT_EQUAL(int_output[1], 7); - ASSERT_EQUAL(int_output[2], 9); - ASSERT_EQUAL(int_output[3], 13); - - // int -> float should use using plus operator by default + ASSERT_EQUAL(int_output[0], 5); // out: 5.5f in: 1.5 accum: 7.0f + ASSERT_EQUAL(int_output[1], 7); // out: 7.0f in: 2.5 accum: 9.5f + ASSERT_EQUAL(int_output[2], 9); // out: 9.5f in: 3.5 accum: 13.0f + ASSERT_EQUAL(int_output[3], 13); // out: 13.f in: 4.5 accum: 17.5f + + // int -> float should use plus<> operator and int accumulator by default thrust::inclusive_scan(int_input.begin(), int_input.end(), float_output.begin()); - ASSERT_EQUAL(float_output[0], 1.0); - ASSERT_EQUAL(float_output[1], 3.0); - ASSERT_EQUAL(float_output[2], 6.0); - ASSERT_EQUAL(float_output[3], 10.0); - - // int -> float should use using plus operator by default + ASSERT_EQUAL(float_output[0], 1.f); // in: 1 accum: 1 out: 1 + ASSERT_EQUAL(float_output[1], 3.f); // in: 2 accum: 3 out: 3 + ASSERT_EQUAL(float_output[2], 6.f); // in: 3 accum: 6 out: 6 + ASSERT_EQUAL(float_output[3], 10.f); // in: 4 accum: 10 out: 10 + + // int -> float + float init_value should use plus<> operator and + // float accumulator by default thrust::exclusive_scan(int_input.begin(), int_input.end(), float_output.begin(), (float) 5.5); - ASSERT_EQUAL(float_output[0], 5.5); - ASSERT_EQUAL(float_output[1], 6.5); - ASSERT_EQUAL(float_output[2], 8.5); - ASSERT_EQUAL(float_output[3], 11.5); + ASSERT_EQUAL(float_output[0], 5.5f); // out: 5.5f in: 1 accum: 6.5f + ASSERT_EQUAL(float_output[1], 6.5f); // out: 6.5f in: 2 accum: 
8.5f + ASSERT_EQUAL(float_output[2], 8.5f); // out: 8.5f in: 3 accum: 11.5f + ASSERT_EQUAL(float_output[3], 11.5f); // out: 11.5f in: 4 accum: 15.5f } void TestScanMixedTypesHost(void) { @@ -476,7 +494,9 @@ void _TestScanWithLargeTypes(void) thrust::host_vector< FixedVector > h_output(n); for(size_t i = 0; i < h_input.size(); i++) - h_input[i] = FixedVector(i); + { + h_input[i] = FixedVector(static_cast(i)); + } thrust::device_vector< FixedVector > d_input = h_input; thrust::device_vector< FixedVector > d_output(n); @@ -555,3 +575,165 @@ void TestInclusiveScanWithIndirection(void) } DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithIndirection); +template +struct const_ref_plus_mod3 +{ + T * table; + + const_ref_plus_mod3(T * table) : table(table) {} + + __host__ __device__ + const T& operator()(T a, T b) + { + return table[(int) (a + b)]; + } +}; + +template +void TestInclusiveScanWithConstAccumulator(void) +{ + // add numbers modulo 3 with external lookup table + typedef typename Vector::value_type T; + + Vector data(7); + data[0] = 0; + data[1] = 1; + data[2] = 2; + data[3] = 1; + data[4] = 2; + data[5] = 0; + data[6] = 1; + + Vector table(6); + table[0] = 0; + table[1] = 1; + table[2] = 2; + table[3] = 0; + table[4] = 1; + table[5] = 2; + + thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3(thrust::raw_pointer_cast(&table[0]))); + + ASSERT_EQUAL(data[0], T(0)); + ASSERT_EQUAL(data[1], T(1)); + ASSERT_EQUAL(data[2], T(0)); + ASSERT_EQUAL(data[3], T(1)); + ASSERT_EQUAL(data[4], T(0)); + ASSERT_EQUAL(data[5], T(0)); + ASSERT_EQUAL(data[6], T(1)); +} +DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithConstAccumulator); + +struct only_set_when_expected_it +{ + long long expected; + bool * flag; + + __host__ __device__ only_set_when_expected_it operator++() const { return *this; } + __host__ __device__ only_set_when_expected_it operator*() const { return *this; } + template + __host__ __device__ only_set_when_expected_it 
operator+(Difference) const { return *this; } + template + __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; } + + __device__ + void operator=(long long value) const + { + if (value == expected) + { + *flag = true; + } + } +}; + +THRUST_NAMESPACE_BEGIN +template<> +struct iterator_traits +{ + typedef long long value_type; + typedef only_set_when_expected_it reference; +}; +THRUST_NAMESPACE_END + +void TestInclusiveScanWithBigIndexesHelper(int magnitude) +{ + thrust::constant_iterator begin(1); + thrust::constant_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr has_executed = thrust::device_malloc(1); + *has_executed = false; + + only_set_when_expected_it out = { (1ll << magnitude), thrust::raw_pointer_cast(has_executed) }; + + thrust::inclusive_scan(thrust::device, begin, end, out); + + bool has_executed_h = *has_executed; + thrust::device_free(has_executed); + + ASSERT_EQUAL(has_executed_h, true); +} + +void TestInclusiveScanWithBigIndexes() +{ + TestInclusiveScanWithBigIndexesHelper(30); + TestInclusiveScanWithBigIndexesHelper(31); + TestInclusiveScanWithBigIndexesHelper(32); + TestInclusiveScanWithBigIndexesHelper(33); +} + +DECLARE_UNITTEST(TestInclusiveScanWithBigIndexes); + +void TestExclusiveScanWithBigIndexesHelper(int magnitude) +{ + thrust::constant_iterator begin(1); + thrust::constant_iterator end = begin + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_ptr has_executed = thrust::device_malloc(1); + *has_executed = false; + + only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) }; + + thrust::exclusive_scan(thrust::device, begin, end, out,0ll); + + bool has_executed_h = *has_executed; + thrust::device_free(has_executed); + + ASSERT_EQUAL(has_executed_h, true); +} + +void TestExclusiveScanWithBigIndexes() +{ + 
TestExclusiveScanWithBigIndexesHelper(30); + TestExclusiveScanWithBigIndexesHelper(31); + TestExclusiveScanWithBigIndexesHelper(32); + TestExclusiveScanWithBigIndexesHelper(33); +} + +DECLARE_UNITTEST(TestExclusiveScanWithBigIndexes); + +#if THRUST_CPP_DIALECT >= 2011 + +struct Int { + int i{}; + __host__ __device__ explicit Int(int num) : i(num) {} + __host__ __device__ Int() : i{} {} + __host__ __device__ Int operator+(Int const& o) const { return Int{this->i + o.i}; } +}; + +void TestInclusiveScanWithUserDefinedType() +{ + thrust::device_vector vec(5, Int{1}); + + thrust::inclusive_scan( + thrust::device, + vec.cbegin(), + vec.cend(), + vec.begin()); + + ASSERT_EQUAL(static_cast(vec.back()).i, 5); +} +DECLARE_UNITTEST(TestInclusiveScanWithUserDefinedType); + +#endif // c++11 diff --git a/testing/scan_by_key.cu b/testing/scan_by_key.cu deleted file mode 100644 index efc48bdb4..000000000 --- a/testing/scan_by_key.cu +++ /dev/null @@ -1,629 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -template -void TestInclusiveScanByKeySimple(void) -{ - typedef typename Vector::value_type T; - typedef typename Vector::iterator Iterator; - - Vector keys(7); - Vector vals(7); - - Vector output(7, 0); - - keys[0] = 0; vals[0] = 1; - keys[1] = 1; vals[1] = 2; - keys[2] = 1; vals[2] = 3; - keys[3] = 1; vals[3] = 4; - keys[4] = 2; vals[4] = 5; - keys[5] = 3; vals[5] = 6; - keys[6] = 3; vals[6] = 7; - - Iterator iter = thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin()); - - ASSERT_EQUAL_QUIET(iter, output.end()); - - ASSERT_EQUAL(output[0], 1); - ASSERT_EQUAL(output[1], 2); - ASSERT_EQUAL(output[2], 5); - ASSERT_EQUAL(output[3], 9); - ASSERT_EQUAL(output[4], 5); - ASSERT_EQUAL(output[5], 6); - ASSERT_EQUAL(output[6], 13); - - thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to(), thrust::multiplies()); - - ASSERT_EQUAL(output[0], 1); - ASSERT_EQUAL(output[1], 2); - 
ASSERT_EQUAL(output[2], 6); - ASSERT_EQUAL(output[3], 24); - ASSERT_EQUAL(output[4], 5); - ASSERT_EQUAL(output[5], 6); - ASSERT_EQUAL(output[6], 42); - - thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to()); - - ASSERT_EQUAL(output[0], 1); - ASSERT_EQUAL(output[1], 2); - ASSERT_EQUAL(output[2], 5); - ASSERT_EQUAL(output[3], 9); - ASSERT_EQUAL(output[4], 5); - ASSERT_EQUAL(output[5], 6); - ASSERT_EQUAL(output[6], 13); -} -DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple); - - -template -OutputIterator inclusive_scan_by_key(my_system &system, - InputIterator1, - InputIterator1, - InputIterator2, - OutputIterator result) -{ - system.validate_dispatch(); - return result; -} - -void TestInclusiveScanByKeyDispatchExplicit() -{ - thrust::device_vector vec(1); - - my_system sys(0); - thrust::inclusive_scan_by_key(sys, - vec.begin(), - vec.begin(), - vec.begin(), - vec.begin()); - - ASSERT_EQUAL(true, sys.is_valid()); -} -DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit); - - -template -OutputIterator inclusive_scan_by_key(my_tag, - InputIterator1, - InputIterator1, - InputIterator2, - OutputIterator result) -{ - *result = 13; - return result; -} - -void TestInclusiveScanByKeyDispatchImplicit() -{ - thrust::device_vector vec(1); - - thrust::inclusive_scan_by_key(thrust::retag(vec.begin()), - thrust::retag(vec.begin()), - thrust::retag(vec.begin()), - thrust::retag(vec.begin())); - - ASSERT_EQUAL(13, vec.front()); -} -DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit); - - -template -void TestExclusiveScanByKeySimple(void) -{ - typedef typename Vector::value_type T; - typedef typename Vector::iterator Iterator; - - Vector keys(7); - Vector vals(7); - - Vector output(7, 0); - - keys[0] = 0; vals[0] = 1; - keys[1] = 1; vals[1] = 2; - keys[2] = 1; vals[2] = 3; - keys[3] = 1; vals[3] = 4; - keys[4] = 2; vals[4] = 5; - keys[5] = 3; vals[5] = 6; - keys[6] = 3; vals[6] = 7; - - Iterator iter = 
thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin()); - - ASSERT_EQUAL_QUIET(iter, output.end()); - - ASSERT_EQUAL(output[0], 0); - ASSERT_EQUAL(output[1], 0); - ASSERT_EQUAL(output[2], 2); - ASSERT_EQUAL(output[3], 5); - ASSERT_EQUAL(output[4], 0); - ASSERT_EQUAL(output[5], 0); - ASSERT_EQUAL(output[6], 6); - - thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10)); - - ASSERT_EQUAL(output[0], 10); - ASSERT_EQUAL(output[1], 10); - ASSERT_EQUAL(output[2], 12); - ASSERT_EQUAL(output[3], 15); - ASSERT_EQUAL(output[4], 10); - ASSERT_EQUAL(output[5], 10); - ASSERT_EQUAL(output[6], 16); - - thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to(), thrust::multiplies()); - - ASSERT_EQUAL(output[0], 10); - ASSERT_EQUAL(output[1], 10); - ASSERT_EQUAL(output[2], 20); - ASSERT_EQUAL(output[3], 60); - ASSERT_EQUAL(output[4], 10); - ASSERT_EQUAL(output[5], 10); - ASSERT_EQUAL(output[6], 60); - - thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to()); - - ASSERT_EQUAL(output[0], 10); - ASSERT_EQUAL(output[1], 10); - ASSERT_EQUAL(output[2], 12); - ASSERT_EQUAL(output[3], 15); - ASSERT_EQUAL(output[4], 10); - ASSERT_EQUAL(output[5], 10); - ASSERT_EQUAL(output[6], 16); -} -DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple); - - -template -OutputIterator exclusive_scan_by_key(my_system &system, - InputIterator1, - InputIterator1, - InputIterator2, - OutputIterator result) -{ - system.validate_dispatch(); - return result; -} - -void TestExclusiveScanByKeyDispatchExplicit() -{ - thrust::device_vector vec(1); - - my_system sys(0); - thrust::exclusive_scan_by_key(sys, - vec.begin(), - vec.begin(), - vec.begin(), - vec.begin()); - - ASSERT_EQUAL(true, sys.is_valid()); -} -DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit); - - -template -OutputIterator exclusive_scan_by_key(my_tag, - 
InputIterator1, - InputIterator1, - InputIterator2, - OutputIterator result) -{ - *result = 13; - return result; -} - -void TestExclusiveScanByKeyDispatchImplicit() -{ - thrust::device_vector vec(1); - - thrust::exclusive_scan_by_key(thrust::retag(vec.begin()), - thrust::retag(vec.begin()), - thrust::retag(vec.begin()), - thrust::retag(vec.begin())); - - ASSERT_EQUAL(13, vec.front()); -} -DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit); - - -struct head_flag_predicate -{ - template - __host__ __device__ - bool operator()(const T&, const T& b) - { - return b ? false : true; - } -}; - -template -void TestScanByKeyHeadFlags(void) -{ - typedef typename Vector::value_type T; - - Vector keys(7); - Vector vals(7); - - Vector output(7, 0); - - keys[0] = 0; vals[0] = 1; - keys[1] = 1; vals[1] = 2; - keys[2] = 0; vals[2] = 3; - keys[3] = 0; vals[3] = 4; - keys[4] = 1; vals[4] = 5; - keys[5] = 1; vals[5] = 6; - keys[6] = 0; vals[6] = 7; - - thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), head_flag_predicate(), thrust::plus()); - - ASSERT_EQUAL(output[0], 1); - ASSERT_EQUAL(output[1], 2); - ASSERT_EQUAL(output[2], 5); - ASSERT_EQUAL(output[3], 9); - ASSERT_EQUAL(output[4], 5); - ASSERT_EQUAL(output[5], 6); - ASSERT_EQUAL(output[6], 13); - - thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), head_flag_predicate(), thrust::plus()); - - ASSERT_EQUAL(output[0], 10); - ASSERT_EQUAL(output[1], 10); - ASSERT_EQUAL(output[2], 12); - ASSERT_EQUAL(output[3], 15); - ASSERT_EQUAL(output[4], 10); - ASSERT_EQUAL(output[5], 10); - ASSERT_EQUAL(output[6], 16); -} -DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags); - -template -void TestInclusiveScanByKeyTransformIterator(void) -{ - typedef typename Vector::value_type T; - - Vector keys(7); - Vector vals(7); - - Vector output(7, 0); - - keys[0] = 0; vals[0] = 1; - keys[1] = 1; vals[1] = 2; - keys[2] = 1; vals[2] = 3; - keys[3] = 1; vals[3] = 4; - keys[4] = 
2; vals[4] = 5; - keys[5] = 3; vals[5] = 6; - keys[6] = 3; vals[6] = 7; - - thrust::inclusive_scan_by_key - (keys.begin(), keys.end(), - thrust::make_transform_iterator(vals.begin(), thrust::negate()), - output.begin()); - - ASSERT_EQUAL(output[0], -1); - ASSERT_EQUAL(output[1], -2); - ASSERT_EQUAL(output[2], -5); - ASSERT_EQUAL(output[3], -9); - ASSERT_EQUAL(output[4], -5); - ASSERT_EQUAL(output[5], -6); - ASSERT_EQUAL(output[6], -13); -} -DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator); - - -template -void TestScanByKeyReusedKeys(void) -{ - Vector keys(7); - Vector vals(7); - - Vector output(7, 0); - - keys[0] = 0; vals[0] = 1; - keys[1] = 1; vals[1] = 2; - keys[2] = 1; vals[2] = 3; - keys[3] = 1; vals[3] = 4; - keys[4] = 0; vals[4] = 5; - keys[5] = 1; vals[5] = 6; - keys[6] = 1; vals[6] = 7; - - thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin()); - - ASSERT_EQUAL(output[0], 1); - ASSERT_EQUAL(output[1], 2); - ASSERT_EQUAL(output[2], 5); - ASSERT_EQUAL(output[3], 9); - ASSERT_EQUAL(output[4], 5); - ASSERT_EQUAL(output[5], 6); - ASSERT_EQUAL(output[6], 13); - - thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), typename Vector::value_type(10)); - - ASSERT_EQUAL(output[0], 10); - ASSERT_EQUAL(output[1], 10); - ASSERT_EQUAL(output[2], 12); - ASSERT_EQUAL(output[3], 15); - ASSERT_EQUAL(output[4], 10); - ASSERT_EQUAL(output[5], 10); - ASSERT_EQUAL(output[6], 16); -} -DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys); - - -template -void TestInclusiveScanByKey(const size_t n) -{ - // XXX WAR nvbug 1541533 -#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - if(typeid(T) == typeid(char) || - typeid(T) == typeid(unsigned char)) - { - KNOWN_FAILURE; - } -#endif - - thrust::host_vector h_keys(n); - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 10 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - 
thrust::host_vector h_vals = unittest::random_integers(n); - for(size_t i = 0; i < n; i++) - h_vals[i] = i % 10; - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_output(n); - thrust::device_vector d_output(n); - - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin()); - ASSERT_EQUAL(d_output, h_output); -} -DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey); - - -template -void TestExclusiveScanByKey(const size_t n) -{ - thrust::host_vector h_keys(n); - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 10 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - thrust::host_vector h_vals = unittest::random_integers(n); - for(size_t i = 0; i < n; i++) - h_vals[i] = i % 10; - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_output(n); - thrust::device_vector d_output(n); - - // without init - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin()); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin()); - ASSERT_EQUAL(d_output, h_output); - - // with init - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), (T) 11); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), (T) 11); - ASSERT_EQUAL(d_output, h_output); -} -DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey); - - -template -void TestInclusiveScanByKeyInPlace(const size_t n) -{ - // XXX WAR nvbug 1541533 -#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - if(typeid(T) == typeid(char) || - typeid(T) == typeid(unsigned char)) - { - KNOWN_FAILURE; - } -#endif - - thrust::host_vector h_keys(n); - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 10 == 0) - k++; - 
} - thrust::device_vector d_keys = h_keys; - - thrust::host_vector h_vals = unittest::random_integers(n); - for(size_t i = 0; i < n; i++) - h_vals[i] = i % 10; - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_output(n); - thrust::device_vector d_output(n); - - // in-place scans - h_output = h_vals; - d_output = d_vals; - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin()); - ASSERT_EQUAL(d_output, h_output); -} -DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace); - - -template -void TestExclusiveScanByKeyInPlace(const size_t n) -{ - thrust::host_vector h_keys(n); - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 10 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - thrust::host_vector h_vals = unittest::random_integers(n); - for(size_t i = 0; i < n; i++) - h_vals[i] = i % 10; - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_output = h_vals; - thrust::device_vector d_output = d_vals; - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), (T) 11); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), (T) 11); - ASSERT_EQUAL(d_output, h_output); -} -DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace); - - -void TestScanByKeyMixedTypes(void) -{ - const unsigned int n = 113; - - thrust::host_vector h_keys(n); - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 10 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - thrust::host_vector h_vals = unittest::random_integers(n); - for(size_t i = 0; i < n; i++) - h_vals[i] %= 10; - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_float_output(n); - thrust::device_vector d_float_output(n); - 
thrust::host_vector h_int_output(n); - thrust::device_vector d_int_output(n); - - //mixed vals/output types - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin()); - ASSERT_EQUAL(d_float_output, h_float_output); - - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (float) 3.5); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (float) 3.5); - ASSERT_EQUAL(d_float_output, h_float_output); - - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (int) 3); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (int) 3); - ASSERT_EQUAL(d_float_output, h_float_output); - - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (int) 3); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (int) 3); - ASSERT_EQUAL(d_int_output, h_int_output); - - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (float) 3.5); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (float) 3.5); - ASSERT_EQUAL(d_int_output, h_int_output); -} -DECLARE_UNITTEST(TestScanByKeyMixedTypes); - - -void TestScanByKeyLargeInput() -{ - const unsigned int N = 1 << 20; - - thrust::host_vector vals_sizes = unittest::random_integers(10); - - thrust::host_vector h_vals = unittest::random_integers(N); - thrust::device_vector d_vals = h_vals; - - thrust::host_vector h_output(N, 0); - thrust::device_vector d_output(N, 0); - - for (unsigned int i = 0; i < vals_sizes.size(); i++) - { - const unsigned int n = vals_sizes[i] % N; - - // define segments - thrust::host_vector h_keys(n); - 
thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < n; i++){ - h_keys[i] = k; - if (rng() % 100 == 0) - k++; - } - thrust::device_vector d_keys = h_keys; - - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin()); - ASSERT_EQUAL(d_output, h_output); - - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin()); - ASSERT_EQUAL(d_output, h_output); - } -} -DECLARE_UNITTEST(TestScanByKeyLargeInput); - - -template -void _TestScanByKeyWithLargeTypes(void) -{ - size_t n = (64 * 1024) / sizeof(FixedVector); - - thrust::host_vector< unsigned int > h_keys(n); - thrust::host_vector< FixedVector > h_vals(n); - thrust::host_vector< FixedVector > h_output(n); - - thrust::default_random_engine rng; - for(size_t i = 0, k = 0; i < h_vals.size(); i++) - { - h_vals[i] = FixedVector(i); - h_keys[i] = k; - if (rng() % 5 == 0) - k++; - } - - thrust::device_vector< unsigned int > d_keys = h_keys; - thrust::device_vector< FixedVector > d_vals = h_vals; - thrust::device_vector< FixedVector > d_output(n); - - thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin()); - thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin()); - - ASSERT_EQUAL_QUIET(h_output, d_output); - - thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), FixedVector(0)); - thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), FixedVector(0)); - - ASSERT_EQUAL_QUIET(h_output, d_output); -} - -void TestScanByKeyWithLargeTypes(void) -{ - _TestScanByKeyWithLargeTypes(); - _TestScanByKeyWithLargeTypes(); - _TestScanByKeyWithLargeTypes(); - _TestScanByKeyWithLargeTypes(); 
- //_TestScanByKeyWithLargeTypes(); // too many resources requested for launch - //_TestScanByKeyWithLargeTypes(); - //_TestScanByKeyWithLargeTypes(); // too large to pass as argument - //_TestScanByKeyWithLargeTypes(); - //_TestScanByKeyWithLargeTypes(); - //_TestScanByKeyWithLargeTypes(); - //_TestScanByKeyWithLargeTypes(); -} -DECLARE_UNITTEST(TestScanByKeyWithLargeTypes); - diff --git a/testing/scan_by_key.exclusive.cu b/testing/scan_by_key.exclusive.cu new file mode 100644 index 000000000..58354d848 --- /dev/null +++ b/testing/scan_by_key.exclusive.cu @@ -0,0 +1,576 @@ +#include + +#include +#include +#include +#include +#include + +#include + + +template +void TestExclusiveScanByKeySimple() +{ + typedef typename Vector::value_type T; + typedef typename Vector::iterator Iterator; + + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 1; vals[2] = 3; + keys[3] = 1; vals[3] = 4; + keys[4] = 2; vals[4] = 5; + keys[5] = 3; vals[5] = 6; + keys[6] = 3; vals[6] = 7; + // clang-format on + + Iterator iter = thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin()); + + ASSERT_EQUAL_QUIET(iter, output.end()); + + ASSERT_EQUAL(output[0], 0); + ASSERT_EQUAL(output[1], 0); + ASSERT_EQUAL(output[2], 2); + ASSERT_EQUAL(output[3], 5); + ASSERT_EQUAL(output[4], 0); + ASSERT_EQUAL(output[5], 0); + ASSERT_EQUAL(output[6], 6); + + thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + T(10)); + + ASSERT_EQUAL(output[0], 10); + ASSERT_EQUAL(output[1], 10); + ASSERT_EQUAL(output[2], 12); + ASSERT_EQUAL(output[3], 15); + ASSERT_EQUAL(output[4], 10); + ASSERT_EQUAL(output[5], 10); + ASSERT_EQUAL(output[6], 16); + + thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + T(10), + thrust::equal_to(), + thrust::multiplies()); + + ASSERT_EQUAL(output[0], 10); + 
ASSERT_EQUAL(output[1], 10); + ASSERT_EQUAL(output[2], 20); + ASSERT_EQUAL(output[3], 60); + ASSERT_EQUAL(output[4], 10); + ASSERT_EQUAL(output[5], 10); + ASSERT_EQUAL(output[6], 60); + + thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + T(10), + thrust::equal_to()); + + ASSERT_EQUAL(output[0], 10); + ASSERT_EQUAL(output[1], 10); + ASSERT_EQUAL(output[2], 12); + ASSERT_EQUAL(output[3], 15); + ASSERT_EQUAL(output[4], 10); + ASSERT_EQUAL(output[5], 10); + ASSERT_EQUAL(output[6], 16); +} +DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple); + + +template +OutputIterator exclusive_scan_by_key(my_system& system, + InputIterator1, + InputIterator1, + InputIterator2, + OutputIterator result) +{ + system.validate_dispatch(); + return result; +} + + +void TestExclusiveScanByKeyDispatchExplicit() +{ + thrust::device_vector vec(1); + + my_system sys(0); + thrust::exclusive_scan_by_key(sys, + vec.begin(), + vec.begin(), + vec.begin(), + vec.begin()); + + ASSERT_EQUAL(true, sys.is_valid()); +} +DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit); + + +template +OutputIterator exclusive_scan_by_key(my_tag, + InputIterator1, + InputIterator1, + InputIterator2, + OutputIterator result) +{ + *result = 13; + return result; +} + + +void TestExclusiveScanByKeyDispatchImplicit() +{ + thrust::device_vector vec(1); + + thrust::exclusive_scan_by_key(thrust::retag(vec.begin()), + thrust::retag(vec.begin()), + thrust::retag(vec.begin()), + thrust::retag(vec.begin())); + + ASSERT_EQUAL(13, vec.front()); +} +DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit); + + +struct head_flag_predicate +{ + template + __host__ __device__ bool operator()(const T&, const T& b) + { + return b ? 
false : true; + } +}; + + +template +void TestScanByKeyHeadFlags() +{ + typedef typename Vector::value_type T; + + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 0; vals[2] = 3; + keys[3] = 0; vals[3] = 4; + keys[4] = 1; vals[4] = 5; + keys[5] = 1; vals[5] = 6; + keys[6] = 0; vals[6] = 7; + // clang-format on + + thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + T(10), + head_flag_predicate(), + thrust::plus()); + + ASSERT_EQUAL(output[0], 10); + ASSERT_EQUAL(output[1], 10); + ASSERT_EQUAL(output[2], 12); + ASSERT_EQUAL(output[3], 15); + ASSERT_EQUAL(output[4], 10); + ASSERT_EQUAL(output[5], 10); + ASSERT_EQUAL(output[6], 16); +} +DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags); + + +template +void TestScanByKeyReusedKeys() +{ + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 1; vals[2] = 3; + keys[3] = 1; vals[3] = 4; + keys[4] = 0; vals[4] = 5; + keys[5] = 1; vals[5] = 6; + keys[6] = 1; vals[6] = 7; + // clang-format on + + thrust::exclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + typename Vector::value_type(10)); + + ASSERT_EQUAL(output[0], 10); + ASSERT_EQUAL(output[1], 10); + ASSERT_EQUAL(output[2], 12); + ASSERT_EQUAL(output[3], 15); + ASSERT_EQUAL(output[4], 10); + ASSERT_EQUAL(output[5], 10); + ASSERT_EQUAL(output[6], 16); +} +DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys); + + +template +void TestExclusiveScanByKey(const size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + { + h_vals[i] = 
static_cast(i % 10); + } + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_output(n); + thrust::device_vector d_output(n); + + // without init + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_output.begin()); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_output.begin()); + ASSERT_EQUAL(d_output, h_output); + + // with init + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_output.begin(), + (T)11); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_output.begin(), + (T)11); + ASSERT_EQUAL(d_output, h_output); +} +DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey); + + +template +void TestExclusiveScanByKeyInPlace(const size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + { + h_vals[i] = static_cast(i % 10); + } + thrust::device_vector d_vals = h_vals; + + // in-place scans: in/out values aliasing + thrust::host_vector h_output = h_vals; + thrust::device_vector d_output = d_vals; + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_output.begin(), + h_output.begin(), + (T)11); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_output.begin(), + d_output.begin(), + (T)11); + ASSERT_EQUAL(d_output, h_output); + + // in-place scans: in/out keys aliasing + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_keys.begin(), + (T)11); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_keys.begin(), + (T)11); + ASSERT_EQUAL(d_keys, h_keys); +} +DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace); + + +void TestScanByKeyMixedTypes() 
+{ + const unsigned int n = 113; + + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = + unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + h_vals[i] %= 10; + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_float_output(n); + thrust::device_vector d_float_output(n); + thrust::host_vector h_int_output(n); + thrust::device_vector d_int_output(n); + + // mixed vals/output types + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_float_output.begin(), + (float)3.5); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_float_output.begin(), + (float)3.5); + ASSERT_EQUAL(d_float_output, h_float_output); + + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_float_output.begin(), + (int)3); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_float_output.begin(), + (int)3); + ASSERT_EQUAL(d_float_output, h_float_output); + + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_int_output.begin(), + (int)3); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_int_output.begin(), + (int)3); + ASSERT_EQUAL(d_int_output, h_int_output); + + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_int_output.begin(), + (float)3.5); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_int_output.begin(), + (float)3.5); + ASSERT_EQUAL(d_int_output, h_int_output); +} +DECLARE_UNITTEST(TestScanByKeyMixedTypes); + + +template +void TestScanByKeyDiscardOutput(std::size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + + for (size_t i = 0, k = 0; i < n; i++) + { + 
h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals(n); + for (size_t i = 0; i < n; i++) + { + h_vals[i] = static_cast(i % 10); + } + thrust::device_vector d_vals = h_vals; + + auto out = thrust::make_discard_iterator(); + + // These are no-ops, but they should compile. + thrust::exclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out); + thrust::exclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out, + T{}); + thrust::exclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out, + T{}, + thrust::equal_to{}); + thrust::exclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out, + T{}, + thrust::equal_to{}, + thrust::multiplies{}); +} +DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput); + + +void TestScanByKeyLargeInput() +{ + const unsigned int N = 1 << 20; + + thrust::host_vector vals_sizes = + unittest::random_integers(10); + + thrust::host_vector h_vals = + unittest::random_integers(N); + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_output(N, 0); + thrust::device_vector d_output(N, 0); + + for (unsigned int i = 0; i < vals_sizes.size(); i++) + { + const unsigned int n = vals_sizes[i] % N; + + // define segments + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t j = 0, k = 0; j < n; j++) + { + h_keys[j] = static_cast(k); + if (rng() % 100 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.begin() + n, + h_vals.begin(), + h_output.begin()); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.begin() + n, + d_vals.begin(), + d_output.begin()); + ASSERT_EQUAL(d_output, h_output); + } +} +DECLARE_UNITTEST(TestScanByKeyLargeInput); + + +template +void _TestScanByKeyWithLargeTypes() +{ + size_t n = (64 * 1024) / sizeof(FixedVector); + + thrust::host_vector 
h_keys(n); + thrust::host_vector> h_vals(n); + thrust::host_vector> h_output(n); + + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < h_vals.size(); i++) + { + h_keys[i] = static_cast(k); + h_vals[i] = FixedVector(static_cast(i)); + if (rng() % 5 == 0) + { + k++; + } + } + + thrust::device_vector d_keys = h_keys; + thrust::device_vector> d_vals = h_vals; + thrust::device_vector> d_output(n); + + thrust::exclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_output.begin(), + FixedVector(0)); + thrust::exclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_output.begin(), + FixedVector(0)); + + ASSERT_EQUAL_QUIET(h_output, d_output); +} + + +void TestScanByKeyWithLargeTypes() +{ + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + + // too many resources requested for launch: + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + + // too large to pass as argument: + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); +} +DECLARE_UNITTEST(TestScanByKeyWithLargeTypes); diff --git a/testing/scan_by_key.inclusive.cu b/testing/scan_by_key.inclusive.cu new file mode 100644 index 000000000..b2d2337e2 --- /dev/null +++ b/testing/scan_by_key.inclusive.cu @@ -0,0 +1,524 @@ +#include + +#include +#include +#include +#include +#include + +#include + +template +void TestInclusiveScanByKeySimple() +{ + typedef typename Vector::value_type T; + typedef typename Vector::iterator Iterator; + + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 1; vals[2] = 3; + keys[3] = 1; vals[3] = 4; + keys[4] = 2; vals[4] = 5; + keys[5] = 3; vals[5] = 6; + keys[6] = 3; vals[6] = 7; + // clang-format on + + 
Iterator iter = thrust::inclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin()); + + ASSERT_EQUAL_QUIET(iter, output.end()); + + ASSERT_EQUAL(output[0], 1); + ASSERT_EQUAL(output[1], 2); + ASSERT_EQUAL(output[2], 5); + ASSERT_EQUAL(output[3], 9); + ASSERT_EQUAL(output[4], 5); + ASSERT_EQUAL(output[5], 6); + ASSERT_EQUAL(output[6], 13); + + thrust::inclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + thrust::equal_to(), + thrust::multiplies()); + + ASSERT_EQUAL(output[0], 1); + ASSERT_EQUAL(output[1], 2); + ASSERT_EQUAL(output[2], 6); + ASSERT_EQUAL(output[3], 24); + ASSERT_EQUAL(output[4], 5); + ASSERT_EQUAL(output[5], 6); + ASSERT_EQUAL(output[6], 42); + + thrust::inclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + thrust::equal_to()); + + ASSERT_EQUAL(output[0], 1); + ASSERT_EQUAL(output[1], 2); + ASSERT_EQUAL(output[2], 5); + ASSERT_EQUAL(output[3], 9); + ASSERT_EQUAL(output[4], 5); + ASSERT_EQUAL(output[5], 6); + ASSERT_EQUAL(output[6], 13); +} +DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple); + + +template +OutputIterator inclusive_scan_by_key(my_system& system, + InputIterator1, + InputIterator1, + InputIterator2, + OutputIterator result) +{ + system.validate_dispatch(); + return result; +} + +void TestInclusiveScanByKeyDispatchExplicit() +{ + thrust::device_vector vec(1); + + my_system sys(0); + thrust::inclusive_scan_by_key(sys, + vec.begin(), + vec.begin(), + vec.begin(), + vec.begin()); + + ASSERT_EQUAL(true, sys.is_valid()); +} +DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit); + + +template +OutputIterator inclusive_scan_by_key(my_tag, + InputIterator1, + InputIterator1, + InputIterator2, + OutputIterator result) +{ + *result = 13; + return result; +} + +void TestInclusiveScanByKeyDispatchImplicit() +{ + thrust::device_vector vec(1); + + thrust::inclusive_scan_by_key(thrust::retag(vec.begin()), + thrust::retag(vec.begin()), + 
thrust::retag(vec.begin()), + thrust::retag(vec.begin())); + + ASSERT_EQUAL(13, vec.front()); +} +DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit); + +struct head_flag_predicate +{ + template + __host__ __device__ bool operator()(const T&, const T& b) + { + return b ? false : true; + } +}; + +template +void TestScanByKeyHeadFlags() +{ + typedef typename Vector::value_type T; + + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 0; vals[2] = 3; + keys[3] = 0; vals[3] = 4; + keys[4] = 1; vals[4] = 5; + keys[5] = 1; vals[5] = 6; + keys[6] = 0; vals[6] = 7; + // clang-format on + + thrust::inclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin(), + head_flag_predicate(), + thrust::plus()); + + ASSERT_EQUAL(output[0], 1); + ASSERT_EQUAL(output[1], 2); + ASSERT_EQUAL(output[2], 5); + ASSERT_EQUAL(output[3], 9); + ASSERT_EQUAL(output[4], 5); + ASSERT_EQUAL(output[5], 6); + ASSERT_EQUAL(output[6], 13); +} +DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags); + +template +void TestInclusiveScanByKeyTransformIterator() +{ + typedef typename Vector::value_type T; + + Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 1; vals[2] = 3; + keys[3] = 1; vals[3] = 4; + keys[4] = 2; vals[4] = 5; + keys[5] = 3; vals[5] = 6; + keys[6] = 3; vals[6] = 7; + // clang-format on + + thrust::inclusive_scan_by_key( + keys.begin(), + keys.end(), + thrust::make_transform_iterator(vals.begin(), thrust::negate()), + output.begin()); + + ASSERT_EQUAL(output[0], -1); + ASSERT_EQUAL(output[1], -2); + ASSERT_EQUAL(output[2], -5); + ASSERT_EQUAL(output[3], -9); + ASSERT_EQUAL(output[4], -5); + ASSERT_EQUAL(output[5], -6); + ASSERT_EQUAL(output[6], -13); +} +DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator); + + +template +void TestScanByKeyReusedKeys() +{ + 
Vector keys(7); + Vector vals(7); + + Vector output(7, 0); + + // clang-format off + keys[0] = 0; vals[0] = 1; + keys[1] = 1; vals[1] = 2; + keys[2] = 1; vals[2] = 3; + keys[3] = 1; vals[3] = 4; + keys[4] = 0; vals[4] = 5; + keys[5] = 1; vals[5] = 6; + keys[6] = 1; vals[6] = 7; + // clang-format on + + thrust::inclusive_scan_by_key(keys.begin(), + keys.end(), + vals.begin(), + output.begin()); + + ASSERT_EQUAL(output[0], 1); + ASSERT_EQUAL(output[1], 2); + ASSERT_EQUAL(output[2], 5); + ASSERT_EQUAL(output[3], 9); + ASSERT_EQUAL(output[4], 5); + ASSERT_EQUAL(output[5], 6); + ASSERT_EQUAL(output[6], 13); +} +DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys); + + +template +void TestInclusiveScanByKey(const size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + h_vals[i] = static_cast(i % 10); + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_output(n); + thrust::device_vector d_output(n); + + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_output.begin()); + ASSERT_EQUAL(d_output, h_output); +} +DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey); + + +template +void TestInclusiveScanByKeyInPlace(const size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + { + h_vals[i] = static_cast(i % 10); + } + thrust::device_vector d_vals = h_vals; + + thrust::host_vector 
h_output(n); + thrust::device_vector d_output(n); + + // in-place scans: in/out values aliasing + h_output = h_vals; + d_output = d_vals; + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_output.begin(), + h_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_output.begin(), + d_output.begin()); + ASSERT_EQUAL(d_output, h_output); + + // in-place scans: in/out keys aliasing + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_keys.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_keys.begin()); + ASSERT_EQUAL(d_keys, h_keys); +} +DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace); + + +void TestScanByKeyMixedTypes() +{ + const unsigned int n = 113; + + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_vals = + unittest::random_integers(n); + for (size_t i = 0; i < n; i++) + h_vals[i] %= 10; + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_float_output(n); + thrust::device_vector d_float_output(n); + thrust::host_vector h_int_output(n); + thrust::device_vector d_int_output(n); + + // mixed vals/output types + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_float_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_float_output.begin()); + ASSERT_EQUAL(d_float_output, h_float_output); +} +DECLARE_UNITTEST(TestScanByKeyMixedTypes); + + +template +void TestScanByKeyDiscardOutput(std::size_t n) +{ + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + + for (size_t i = 0, k = 0; i < n; i++) + { + h_keys[i] = static_cast(k); + if (rng() % 10 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + 
thrust::host_vector h_vals(n); + for (size_t i = 0; i < n; i++) + { + h_vals[i] = static_cast(i % 10); + } + thrust::device_vector d_vals = h_vals; + + auto out = thrust::make_discard_iterator(); + + // These are no-ops, but they should compile. + thrust::inclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out); + thrust::inclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out, + thrust::equal_to{}); + thrust::inclusive_scan_by_key(d_keys.cbegin(), + d_keys.cend(), + d_vals.cbegin(), + out, + thrust::equal_to{}, + thrust::multiplies{}); +} +DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput); + + +void TestScanByKeyLargeInput() +{ + const unsigned int N = 1 << 20; + + thrust::host_vector vals_sizes = + unittest::random_integers(10); + + thrust::host_vector h_vals = + unittest::random_integers(N); + thrust::device_vector d_vals = h_vals; + + thrust::host_vector h_output(N, 0); + thrust::device_vector d_output(N, 0); + + for (unsigned int i = 0; i < vals_sizes.size(); i++) + { + const unsigned int n = vals_sizes[i] % N; + + // define segments + thrust::host_vector h_keys(n); + thrust::default_random_engine rng; + for (size_t j = 0, k = 0; j < n; j++) + { + h_keys[j] = static_cast(k); + if (rng() % 100 == 0) + { + k++; + } + } + thrust::device_vector d_keys = h_keys; + + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.begin() + n, + h_vals.begin(), + h_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.begin() + n, + d_vals.begin(), + d_output.begin()); + ASSERT_EQUAL(d_output, h_output); + } +} +DECLARE_UNITTEST(TestScanByKeyLargeInput); + + +template +void _TestScanByKeyWithLargeTypes() +{ + size_t n = (64 * 1024) / sizeof(FixedVector); + + thrust::host_vector h_keys(n); + thrust::host_vector> h_vals(n); + thrust::host_vector> h_output(n); + + thrust::default_random_engine rng; + for (size_t i = 0, k = 0; i < h_vals.size(); i++) + { + h_keys[i] = static_cast(k); + h_vals[i] = 
FixedVector(static_cast(i)); + if (rng() % 5 == 0) + { + k++; + } + } + + thrust::device_vector d_keys = h_keys; + thrust::device_vector> d_vals = h_vals; + thrust::device_vector> d_output(n); + + thrust::inclusive_scan_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + h_output.begin()); + thrust::inclusive_scan_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + d_output.begin()); + + ASSERT_EQUAL_QUIET(h_output, d_output); +} + +void TestScanByKeyWithLargeTypes() +{ + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + _TestScanByKeyWithLargeTypes(); + + // too many resources requested for launch: + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + + // too large to pass as argument + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); + //_TestScanByKeyWithLargeTypes(); +} +DECLARE_UNITTEST(TestScanByKeyWithLargeTypes); diff --git a/testing/sequence.cu b/testing/sequence.cu index cd3e17744..6d29db4c3 100644 --- a/testing/sequence.cu +++ b/testing/sequence.cu @@ -41,8 +41,9 @@ DECLARE_UNITTEST(TestSequenceDispatchImplicit); template -void TestSequenceSimple(void) +void TestSequenceSimple() { + using value_type = typename Vector::value_type; Vector v(5); thrust::sequence(v.begin(), v.end()); @@ -53,7 +54,7 @@ void TestSequenceSimple(void) ASSERT_EQUAL(v[3], 3); ASSERT_EQUAL(v[4], 4); - thrust::sequence(v.begin(), v.end(), 10); + thrust::sequence(v.begin(), v.end(), value_type{10}); ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(v[1], 11); @@ -61,7 +62,7 @@ void TestSequenceSimple(void) ASSERT_EQUAL(v[3], 13); ASSERT_EQUAL(v[4], 14); - thrust::sequence(v.begin(), v.end(), 10, 2); + thrust::sequence(v.begin(), v.end(), value_type{10}, value_type{2}); ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(v[1], 12); @@ -93,8 +94,8 @@ void TestSequence(size_t n) ASSERT_EQUAL(h_data, d_data); - 
thrust::sequence(h_data.begin(), h_data.end(), size_t(10), size_t(2)); - thrust::sequence(d_data.begin(), d_data.end(), size_t(10), size_t(2)); + thrust::sequence(h_data.begin(), h_data.end(), T(10), T(2)); + thrust::sequence(d_data.begin(), d_data.end(), T(10), T(2)); ASSERT_EQUAL(h_data, d_data); } @@ -123,3 +124,47 @@ void TestSequenceComplex() thrust::sequence(m.begin(), m.end()); } DECLARE_UNITTEST(TestSequenceComplex); + +// A class that doesnt accept conversion from size_t but can be multiplied by a scalar +struct Vector +{ + Vector() = default; + // Explicitly disable construction from size_t + Vector(std::size_t) = delete; + __host__ __device__ Vector(int x_, int y_) : x{x_}, y{y_} {} + Vector(const Vector&) = default; + Vector &operator=(const Vector&) = default; + + int x, y; +}; + +// Vector-Vector addition +__host__ __device__ Vector operator+(const Vector a, const Vector b) +{ + return Vector{a.x + b.x, a.y + b.y}; +} + +// Vector-Scalar Multiplication +// Multiplication by std::size_t is required by thrust::sequence. 
+__host__ __device__ Vector operator*(const std::size_t a, const Vector b) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} +__host__ __device__ Vector operator*(const Vector b, const std::size_t a) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} + +void TestSequenceNoSizeTConversion() +{ + thrust::device_vector m(64); + thrust::sequence(m.begin(), m.end(), ::Vector{0, 0}, ::Vector{1, 2}); + + for (std::size_t i = 0; i < m.size(); ++i) + { + const ::Vector v = m[i]; + ASSERT_EQUAL(static_cast(v.x), i); + ASSERT_EQUAL(static_cast(v.y), 2 * i); + } +} +DECLARE_UNITTEST(TestSequenceNoSizeTConversion); diff --git a/testing/set_difference.cu b/testing/set_difference.cu index b107bda36..5abc5f1fb 100644 --- a/testing/set_difference.cu +++ b/testing/set_difference.cu @@ -169,11 +169,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceEquivalentRanges); template void TestSetDifferenceMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -181,8 +181,8 @@ void TestSetDifferenceMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a(temp.begin(), temp.begin() + n); - thrust::host_vector h_b(temp.begin() + n, temp.end()); + thrust::host_vector h_a(vec.begin(), vec.begin() + n); + thrust::host_vector h_b(vec.begin() + n, vec.end()); thrust::sort(h_a.begin(), h_a.end()); thrust::sort(h_b.begin(), h_b.end()); @@ -211,3 +211,32 @@ void TestSetDifferenceMultiset(const size_t n) } DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset); +// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. 
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC +void TestSetDifferenceWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin(0); + thrust::counting_iterator end = begin + (1ll << magnitude); + thrust::counting_iterator end_longer = end + 1; + ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude); + + thrust::device_vector result; + result.resize(1); + thrust::set_difference(thrust::device, begin, end_longer, begin, end, result.begin()); + + thrust::host_vector expected; + expected.push_back(*end); + + ASSERT_EQUAL(result, expected); +} + +void TestSetDifferenceWithBigIndexes() +{ + TestSetDifferenceWithBigIndexesHelper(30); + TestSetDifferenceWithBigIndexesHelper(31); + TestSetDifferenceWithBigIndexesHelper(32); + TestSetDifferenceWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes); +#endif diff --git a/testing/set_difference_by_key.cu b/testing/set_difference_by_key.cu index be68685fc..29dbb68fc 100644 --- a/testing/set_difference_by_key.cu +++ b/testing/set_difference_by_key.cu @@ -250,11 +250,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyEquivalentRanges); template void TestSetDifferenceByKeyMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -262,8 +262,8 @@ void TestSetDifferenceByKeyMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a_key(temp.begin(), temp.begin() + n); - thrust::host_vector h_b_key(temp.begin() + n, temp.end()); + thrust::host_vector h_a_key(vec.begin(), vec.begin() + n); + thrust::host_vector h_b_key(vec.begin() + n, 
vec.end()); thrust::sort(h_a_key.begin(), h_a_key.end()); thrust::sort(h_b_key.begin(), h_b_key.end()); diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu index 3cae00f30..93ef05d74 100644 --- a/testing/set_intersection.cu +++ b/testing/set_intersection.cu @@ -209,20 +209,20 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionEquivalentRanges); template void TestSetIntersectionMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { - int temp = static_cast(*i); - temp %= 13; - *i = temp; + int tmp = static_cast(*i); + tmp %= 13; + *i = static_cast(tmp); } - thrust::host_vector h_a(temp.begin(), temp.begin() + n); - thrust::host_vector h_b(temp.begin() + n, temp.end()); + thrust::host_vector h_a(vec.begin(), vec.begin() + n); + thrust::host_vector h_b(vec.begin() + n, vec.end()); thrust::sort(h_a.begin(), h_a.end()); thrust::sort(h_b.begin(), h_b.end()); @@ -251,3 +251,33 @@ void TestSetIntersectionMultiset(const size_t n) } DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset); +// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. +// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. 
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC +void TestSetDifferenceWithBigIndexesHelper(int magnitude) +{ + thrust::counting_iterator begin1(0); + thrust::counting_iterator begin2 = begin1 + (1ll << magnitude); + thrust::counting_iterator end1 = begin2 + 1; + thrust::counting_iterator end2 = begin2 + (1ll << magnitude); + ASSERT_EQUAL(thrust::distance(begin2, end1), 1); + + thrust::device_vector result; + result.resize(1); + thrust::set_intersection(thrust::device, begin1, end1, begin2, end2, result.begin()); + + thrust::host_vector expected; + expected.push_back(*begin2); + + ASSERT_EQUAL(result, expected); +} + +void TestSetDifferenceWithBigIndexes() +{ + TestSetDifferenceWithBigIndexesHelper(30); + TestSetDifferenceWithBigIndexesHelper(31); + TestSetDifferenceWithBigIndexesHelper(32); + TestSetDifferenceWithBigIndexesHelper(33); +} +DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes); +#endif diff --git a/testing/set_intersection_by_key.cu b/testing/set_intersection_by_key.cu index 6b7d51fc8..d82ee04ad 100644 --- a/testing/set_intersection_by_key.cu +++ b/testing/set_intersection_by_key.cu @@ -234,11 +234,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyEquivalentRanges); template void TestSetIntersectionByKeyMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -246,8 +246,8 @@ void TestSetIntersectionByKeyMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a_key(temp.begin(), temp.begin() + n); - thrust::host_vector h_b_key(temp.begin() + n, temp.end()); + thrust::host_vector h_a_key(vec.begin(), vec.begin() + n); + thrust::host_vector h_b_key(vec.begin() + n, vec.end()); thrust::sort(h_a_key.begin(), 
h_a_key.end()); thrust::sort(h_b_key.begin(), h_b_key.end()); diff --git a/testing/set_symmetric_difference.cu b/testing/set_symmetric_difference.cu index b3e3c1493..dde145fec 100644 --- a/testing/set_symmetric_difference.cu +++ b/testing/set_symmetric_difference.cu @@ -168,11 +168,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceEquivalentRanges); template void TestSetSymmetricDifferenceMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -180,8 +180,8 @@ void TestSetSymmetricDifferenceMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a(temp.begin(), temp.begin() + n); - thrust::host_vector h_b(temp.begin() + n, temp.end()); + thrust::host_vector h_a(vec.begin(), vec.begin() + n); + thrust::host_vector h_b(vec.begin() + n, vec.end()); thrust::sort(h_a.begin(), h_a.end()); thrust::sort(h_b.begin(), h_b.end()); diff --git a/testing/set_symmetric_difference_by_key.cu b/testing/set_symmetric_difference_by_key.cu index c2688fdb8..98e416af8 100644 --- a/testing/set_symmetric_difference_by_key.cu +++ b/testing/set_symmetric_difference_by_key.cu @@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyEquivalentRanges); template void TestSetSymmetricDifferenceByKeyMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -266,8 +266,8 @@ void 
TestSetSymmetricDifferenceByKeyMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a_key(temp.begin(), temp.begin() + n); - thrust::host_vector h_b_key(temp.begin() + n, temp.end()); + thrust::host_vector h_a_key(vec.begin(), vec.begin() + n); + thrust::host_vector h_b_key(vec.begin() + n, vec.end()); thrust::sort(h_a_key.begin(), h_a_key.end()); thrust::sort(h_b_key.begin(), h_b_key.end()); diff --git a/testing/set_union_by_key.cu b/testing/set_union_by_key.cu index ec8864941..7d58ebf4f 100644 --- a/testing/set_union_by_key.cu +++ b/testing/set_union_by_key.cu @@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyEquivalentRanges); template void TestSetUnionByKeyMultiset(const size_t n) { - thrust::host_vector temp = unittest::random_integers(2 * n); + thrust::host_vector vec = unittest::random_integers(2 * n); // restrict elements to [min,13) - for(typename thrust::host_vector::iterator i = temp.begin(); - i != temp.end(); + for(typename thrust::host_vector::iterator i = vec.begin(); + i != vec.end(); ++i) { int temp = static_cast(*i); @@ -266,8 +266,8 @@ void TestSetUnionByKeyMultiset(const size_t n) *i = temp; } - thrust::host_vector h_a_key(temp.begin(), temp.begin() + n); - thrust::host_vector h_b_key(temp.begin() + n, temp.end()); + thrust::host_vector h_a_key(vec.begin(), vec.begin() + n); + thrust::host_vector h_b_key(vec.begin() + n, vec.end()); thrust::sort(h_a_key.begin(), h_a_key.end()); thrust::sort(h_b_key.begin(), h_b_key.end()); diff --git a/testing/shuffle.cu b/testing/shuffle.cu new file mode 100644 index 000000000..77e660c00 --- /dev/null +++ b/testing/shuffle.cu @@ -0,0 +1,602 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +// Functions for performing statistical tests of randomness +// From NIST-Statistical-Test-Suite +// Licence: +// "This software was developed at the National Institute of Standards and +// Technology by employees of the Federal Government in the course of their 
+// official duties. Pursuant to title 17 Section 105 of the United States Code +// this software is not subject to copyright protection and is in the public +// domain. The NIST Statistical Test Suite is an experimental system. NIST +// assumes no responsibility whatsoever for its use by other parties, and makes +// no guarantees, expressed or implied, about its quality, reliability, or any +// other characteristic. We would appreciate acknowledgment if the software is +// used." +class CephesFunctions { +public: + static double cephes_igamc(double a, double x) { + double ans, ax, c, yc, r, t, y, z; + double pk, pkm1, pkm2, qk, qkm1, qkm2; + + if ((x <= 0) || (a <= 0)) + return (1.0); + + if ((x < 1.0) || (x < a)) + return (1.e0 - cephes_igam(a, x)); + + ax = a * log(x) - x - cephes_lgam(a); + + if (ax < -MAXLOG) { + printf("igamc: UNDERFLOW\n"); + return 0.0; + } + ax = exp(ax); + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + do { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = fabs((ans - r) / r); + ans = r; + } else + t = 1.0; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (fabs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + } while (t > MACHEP); + + return ans * ax; + } + +private: + static constexpr double rel_error = 1E-12; + + static constexpr double MACHEP = 1.11022302462515654042E-16; // 2**-53 + static constexpr double MAXLOG = 7.09782712893383996732224E2; // log(MAXNUM) + static constexpr double MAXNUM = 1.7976931348623158E308; // 2**1024*(1-MACHEP) + static constexpr double PI = 3.14159265358979323846; + + static constexpr double big = 4.503599627370496e15; + static constexpr double biginv = 2.22044604925031308085e-16; + + static int sgngam; + + static double cephes_igam(double a, double x) 
{ + double ans, ax, c, r; + + if ((x <= 0) || (a <= 0)) + return 0.0; + + if ((x > 1.0) && (x > a)) + return 1.e0 - cephes_igamc(a, x); + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * log(x) - x - cephes_lgam(a); + if (ax < -MAXLOG) { + printf("igam: UNDERFLOW\n"); + return 0.0; + } + ax = exp(ax); + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + do { + r += 1.0; + c *= x / r; + ans += c; + } while (c / ans > MACHEP); + + return ans * ax / a; + } + + /* A[]: Stirling's formula expansion of log gamma + * B[], C[]: log gamma function between 2 and 3 + */ + static constexpr double A[] = { + 0.000811614167470508488140545910738410384510643780, + -0.000595061904284301438315674115386855191900394857, + 0.000793650340457716942620114419781884862459264696, + -0.002777777777300996942672073330982129846233874559, + 0.083333333333333189929525985917280195280909538269}; + static constexpr double B[] = { + -1378.251525691208598800585605204105377197265625, + -38801.631513463784358464181423187255859375, + -331612.9927388711948879063129425048828125, + -1162370.97492762305773794651031494140625, + -1721737.00820839661173522472381591796875, + -853555.66424576542340219020843505859375}; + static constexpr double C[] = { + -351.8157014365234545039129443466663360595703125, + -17064.21066518811494461260735988616943359375, + -220528.59055385444662533700466156005859375, + -1139334.44367982516996562480926513671875, + -2532523.07177582941949367523193359375, + -2018891.4143353276886045932769775390625}; + + static constexpr double MAXLGM = 2.556348e305; + + /* Logarithm of gamma function */ + static double cephes_lgam(double x) { + double p, q, u, w, z; + int i; + + sgngam = 1; + + if (x < -34.0) { + q = -x; + w = cephes_lgam(q); /* note this modifies sgngam! 
*/ + p = floor(q); + if (p == q) { + lgsing: + goto loverf; + } + i = (int)p; + if ((i & 1) == 0) + sgngam = -1; + else + sgngam = 1; + z = q - p; + if (z > 0.5) { + p += 1.0; + z = p - q; + } + z = q * sin(PI * z); + if (z == 0.0) + goto lgsing; + /* z = log(PI) - log( z ) - w;*/ + z = log(PI) - log(z) - w; + return z; + } + + if (x < 13.0) { + z = 1.0; + p = 0.0; + u = x; + while (u >= 3.0) { + p -= 1.0; + u = x + p; + z *= u; + } + while (u < 2.0) { + if (u == 0.0) + goto lgsing; + z /= u; + p += 1.0; + u = x + p; + } + if (z < 0.0) { + sgngam = -1; + z = -z; + } else + sgngam = 1; + if (u == 2.0) + return (log(z)); + p -= 2.0; + x = x + p; + p = x * cephes_polevl(x, B, 5) / + cephes_p1evl(x, C, 6); + + return log(z) + p; + } + + if (x > MAXLGM) { + loverf: + printf("lgam: OVERFLOW\n"); + + return sgngam * MAXNUM; + } + + q = (x - 0.5) * log(x) - x + log(sqrt(2 * PI)); + if (x > 1.0e8) + return q; + + p = 1.0 / (x * x); + if (x >= 1000.0) + q += + ((7.9365079365079365079365e-4 * p - 2.7777777777777777777778e-3) * p + + 0.0833333333333333333333) / + x; + else + q += cephes_polevl(p, A, 4) / x; + + return q; + } + + static double cephes_polevl(double x, const double *coef, int N) { + const double *p = coef; + double ans = *p++; + int i = N; + do + ans = ans * x + *p++; + while (--i); + + return ans; + } + + static double cephes_p1evl(double x, const double *coef, int N) { + const double *p = coef; + double ans = x + *p++; + int i = N - 1; + + do + ans = ans * x + *p++; + while (--i); + + return ans; + } + + static double cephes_erf(double x) { + static const double two_sqrtpi = 1.128379167095512574; + double sum = x, term = x, xsqr = x * x; + int j = 1; + + if (fabs(x) > 2.2) + return 1.0 - cephes_erfc(x); + + do { + term *= xsqr / j; + sum -= term / (2 * j + 1); + j++; + term *= xsqr / j; + sum += term / (2 * j + 1); + j++; + } while (fabs(term) / sum > rel_error); + + return two_sqrtpi * sum; + } + + static double cephes_erfc(double x) { + static const double 
one_sqrtpi = 0.564189583547756287; + double a = 1, b = x, c = x, d = x * x + 0.5; + double q1, q2 = b / d, n = 1.0, t; + + if (fabs(x) < 2.2) + return 1.0 - cephes_erf(x); + if (x < 0) + return 2.0 - cephes_erfc(-x); + + do { + t = a * n + b * x; + a = b; + b = t; + t = c * n + d * x; + c = d; + d = t; + n += 0.5; + q1 = q2; + q2 = b / d; + } while (fabs(q1 - q2) / q2 > rel_error); + + return one_sqrtpi * exp(-x * x) * q2; + } + + static double cephes_normal(double x) { + double arg, result, sqrt2 = 1.414213562373095048801688724209698078569672; + + if (x > 0) { + arg = x / sqrt2; + result = 0.5 * (1 + erf(arg)); + } else { + arg = -x / sqrt2; + result = 0.5 * (1 - erf(arg)); + } + + return (result); + } +}; +int CephesFunctions::sgngam = 0; +constexpr double CephesFunctions::A[]; +constexpr double CephesFunctions::B[]; +constexpr double CephesFunctions::C[]; + +template +void TestShuffleSimple() { + Vector data(5); + data[0] = 0; + data[1] = 1; + data[2] = 2; + data[3] = 3; + data[4] = 4; + Vector shuffled(data.begin(), data.end()); + thrust::default_random_engine g(2); + thrust::shuffle(shuffled.begin(), shuffled.end(), g); + thrust::sort(shuffled.begin(), shuffled.end()); + // Check all of our data is present + // This only tests for strange conditions like duplicated elements + ASSERT_EQUAL(shuffled, data); +} +DECLARE_VECTOR_UNITTEST(TestShuffleSimple); + +template +void TestShuffleCopySimple() { + Vector data(5); + data[0] = 0; + data[1] = 1; + data[2] = 2; + data[3] = 3; + data[4] = 4; + Vector shuffled(5); + thrust::default_random_engine g(2); + thrust::shuffle_copy(data.begin(), data.end(), shuffled.begin(), g); + g.seed(2); + thrust::shuffle(data.begin(), data.end(), g); + ASSERT_EQUAL(shuffled, data); +} +DECLARE_VECTOR_UNITTEST(TestShuffleCopySimple); + +template +void TestHostDeviceIdentical(size_t m) { + thrust::host_vector host_result(m); + thrust::host_vector device_result(m); + thrust::sequence(host_result.begin(), host_result.end(), T{}); + 
thrust::sequence(device_result.begin(), device_result.end(), T{}); + + thrust::default_random_engine host_g(183); + thrust::default_random_engine device_g(183); + + thrust::shuffle(host_result.begin(), host_result.end(), host_g); + thrust::shuffle(device_result.begin(), device_result.end(), device_g); + + ASSERT_EQUAL(device_result, host_result); +} +DECLARE_VARIABLE_UNITTEST(TestHostDeviceIdentical); + +template +void TestFunctionIsBijection(size_t m) { + thrust::default_random_engine host_g(0xD5); + thrust::default_random_engine device_g(0xD5); + + thrust::system::detail::generic::feistel_bijection host_f(m, host_g); + thrust::system::detail::generic::feistel_bijection device_f(m, device_g); + + if (static_cast(host_f.nearest_power_of_two()) >= static_cast(std::numeric_limits::max()) || m == 0) { + return; + } + + thrust::host_vector host_result(host_f.nearest_power_of_two()); + thrust::host_vector device_result(device_f.nearest_power_of_two()); + thrust::sequence(host_result.begin(), host_result.end(), T{}); + thrust::sequence(device_result.begin(), device_result.end(), T{}); + + thrust::transform(host_result.begin(), host_result.end(), host_result.begin(), + host_f); + thrust::transform(device_result.begin(), device_result.end(), + device_result.begin(), device_f); + + ASSERT_EQUAL(host_result, device_result); + + thrust::sort(host_result.begin(), host_result.end()); + // Assert all values were generated exactly once + for (uint64_t i = 0; i < m; i++) { + ASSERT_EQUAL((uint64_t)host_result[i], i); + } +} +DECLARE_VARIABLE_UNITTEST(TestFunctionIsBijection); + +void TestBijectionLength() { + thrust::default_random_engine g(0xD5); + + uint64_t m = 31; + thrust::system::detail::generic::feistel_bijection f(m, g); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); + + m = 32; + f = thrust::system::detail::generic::feistel_bijection(m, g); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); + + m = 1; + f = 
thrust::system::detail::generic::feistel_bijection(m, g); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(16)); +} +DECLARE_UNITTEST(TestBijectionLength); + +// Individual input keys should be permuted to output locations with uniform +// probability. Perform chi-squared test with confidence 99.9%. +template +void TestShuffleKeyPosition() { + typedef typename Vector::value_type T; + size_t m = 20; + size_t num_samples = 100; + thrust::host_vector index_sum(m, 0); + thrust::host_vector sequence(m); + thrust::sequence(sequence.begin(), sequence.end(), T(0)); + + thrust::default_random_engine g(0xD5); + for (size_t i = 0; i < num_samples; i++) { + Vector shuffled(sequence.begin(), sequence.end()); + thrust::shuffle(shuffled.begin(), shuffled.end(), g); + thrust::host_vector tmp(shuffled.begin(), shuffled.end()); + + for (auto j = 0ull; j < m; j++) { + index_sum[tmp[j]] += j; + } + } + + double expected_average_position = static_cast(m - 1) / 2; + double chi_squared = 0.0; + for (auto j = 0ull; j < m; j++) { + double average_position = static_cast(index_sum[j]) / num_samples; + chi_squared += std::pow(expected_average_position - average_position, 2) / + expected_average_position; + } + // Tabulated chi-squared critical value for m-1=19 degrees of freedom + // and 99.9% confidence + double confidence_threshold = 43.82; + ASSERT_LESS(chi_squared, confidence_threshold); +} +DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleKeyPosition); + +struct vector_compare { + template + bool operator()(const VectorT &a, const VectorT &b) const { + for (auto i = 0ull; i < a.size(); i++) { + if (a[i] < b[i]) + return true; + if (a[i] > b[i]) + return false; + } + return false; + } +}; + +// Brute force check permutations are uniformly distributed on small input +// Uses a chi-squared test indicating 99% confidence the output is uniformly +// random +template +void TestShuffleUniformPermutation() { + typedef typename Vector::value_type T; + + size_t m = 5; + size_t num_samples = 1000; + 
size_t total_permutations = 1 * 2 * 3 * 4 * 5; + std::map, size_t, vector_compare> permutation_counts; + Vector sequence(m); + thrust::sequence(sequence.begin(), sequence.end(), T(0)); + thrust::default_random_engine g(0xD5); + for (auto i = 0ull; i < num_samples; i++) { + thrust::shuffle(sequence.begin(), sequence.end(), g); + thrust::host_vector tmp(sequence.begin(), sequence.end()); + permutation_counts[tmp]++; + } + + ASSERT_EQUAL(permutation_counts.size(), total_permutations); + + double chi_squared = 0.0; + double expected_count = static_cast(num_samples) / total_permutations; + for (auto kv : permutation_counts) { + chi_squared += std::pow(expected_count - kv.second, 2) / expected_count; + } + double p_score = CephesFunctions::cephes_igamc( + (double)(total_permutations - 1) / 2.0, chi_squared / 2.0); + ASSERT_GREATER(p_score, 0.01); +} +DECLARE_VECTOR_UNITTEST(TestShuffleUniformPermutation); + +template +void TestShuffleEvenSpacingBetweenOccurances() { + typedef typename Vector::value_type T; + const uint64_t shuffle_size = 10; + const uint64_t num_samples = 1000; + + thrust::host_vector h_results; + Vector sequence(shuffle_size); + thrust::sequence(sequence.begin(), sequence.end(), 0); + thrust::default_random_engine g(0xD6); + for (auto i = 0ull; i < num_samples; i++) { + thrust::shuffle(sequence.begin(), sequence.end(), g); + thrust::host_vector tmp(sequence.begin(), sequence.end()); + h_results.insert(h_results.end(), sequence.begin(), sequence.end()); + } + + std::vector>> distance_between( + num_samples, std::vector>( + num_samples, std::vector(shuffle_size, 0))); + + for (uint64_t sample = 0; sample < num_samples; sample++) { + for (uint64_t i = 0; i < shuffle_size - 1; i++) { + for (uint64_t j = 1; j < shuffle_size - i; j++) { + T val_1 = h_results[sample * shuffle_size + i]; + T val_2 = h_results[sample * shuffle_size + i + j]; + distance_between[val_1][val_2][j]++; + distance_between[val_2][val_1][shuffle_size - j]++; + } + } + } + + const double 
expected_occurances = (double)num_samples / (shuffle_size - 1); + for (uint64_t val_1 = 0; val_1 < shuffle_size; val_1++) { + for (uint64_t val_2 = val_1 + 1; val_2 < shuffle_size; val_2++) { + double chi_squared = 0.0; + auto &distances = distance_between[val_1][val_2]; + for (uint64_t i = 1; i < shuffle_size; i++) { + chi_squared += std::pow((double)distances[i] - expected_occurances, 2) / + expected_occurances; + } + + double p_score = CephesFunctions::cephes_igamc( + (double)(shuffle_size - 2) / 2.0, chi_squared / 2.0); + ASSERT_GREATER(p_score, 0.01); + } + } +} +DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenSpacingBetweenOccurances); + +template +void TestShuffleEvenDistribution() { + typedef typename Vector::value_type T; + const uint64_t shuffle_sizes[] = {10, 100, 500}; + thrust::default_random_engine g(0xD5); + for (auto shuffle_size : shuffle_sizes) { + if(shuffle_size > (uint64_t)std::numeric_limits::max()) + continue; + const uint64_t num_samples = shuffle_size == 500 ? 1000 : 200; + + std::vector counts(shuffle_size * shuffle_size, 0); + Vector sequence(shuffle_size); + for (auto i = 0ull; i < num_samples; i++) { + thrust::sequence(sequence.begin(), sequence.end(), 0); + thrust::shuffle(sequence.begin(), sequence.end(), g); + thrust::host_vector tmp(sequence.begin(), sequence.end()); + for (uint64_t j = 0; j < shuffle_size; j++) { + assert(j < tmp.size()); + counts.at(j * shuffle_size + tmp[j])++; + } + } + + const double expected_occurances = (double)num_samples / shuffle_size; + for (uint64_t i = 0; i < shuffle_size; i++) { + double chi_squared_pos = 0.0; + double chi_squared_num = 0.0; + for (uint64_t j = 0; j < shuffle_size; j++) { + auto count_pos = counts.at(i * shuffle_size + j); + auto count_num = counts.at(j * shuffle_size + i); + chi_squared_pos += + pow((double)count_pos - expected_occurances, 2) / expected_occurances; + chi_squared_num += + pow((double)count_num - expected_occurances, 2) / expected_occurances; + } + + double p_score_pos 
= CephesFunctions::cephes_igamc( + (double)(shuffle_size - 1) / 2.0, chi_squared_pos / 2.0); + ASSERT_GREATER(p_score_pos, 0.001 / (double)shuffle_size); + + double p_score_num = CephesFunctions::cephes_igamc( + (double)(shuffle_size - 1) / 2.0, chi_squared_num / 2.0); + ASSERT_GREATER(p_score_num, 0.001 / (double)shuffle_size); + } + } +} +DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenDistribution); diff --git a/testing/stable_sort_by_key_large.cu b/testing/stable_sort_by_key_large.cu deleted file mode 100644 index fc69de64c..000000000 --- a/testing/stable_sort_by_key_large.cu +++ /dev/null @@ -1,155 +0,0 @@ -#include -#include -#include - -template -struct less_div_10 -{ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;} -}; - -template -struct greater_div_10 -{ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 > ((int) rhs) / 10;} -}; - - -template -void _TestStableSortByKeyWithLargeKeys(void) -{ - size_t n = (128 * 1024) / sizeof(FixedVector); - - thrust::host_vector< FixedVector > h_keys(n); - thrust::host_vector< unsigned int > h_vals(n); - - for(size_t i = 0; i < n; i++) - { - h_keys[i] = FixedVector(rand()); - h_vals[i] = i; - } - - thrust::device_vector< FixedVector > d_keys = h_keys; - thrust::device_vector< unsigned int > d_vals = h_vals; - - thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); - thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); - - ASSERT_EQUAL_QUIET(h_keys, d_keys); - ASSERT_EQUAL_QUIET(h_vals, d_vals); -} - -void TestStableSortByKeyWithLargeKeys(void) -{ - _TestStableSortByKeyWithLargeKeys(); - _TestStableSortByKeyWithLargeKeys(); - _TestStableSortByKeyWithLargeKeys(); - -// XXX these take too long to compile -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// 
_TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -// _TestStableSortByKeyWithLargeKeys(); -} -DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys); - - -template -void _TestStableSortByKeyWithLargeValues(void) -{ - size_t n = (128 * 1024) / sizeof(FixedVector); - - thrust::host_vector< unsigned int > h_keys(n); - thrust::host_vector< FixedVector > h_vals(n); - - for(size_t i = 0; i < n; i++) - { - h_keys[i] = rand(); - h_vals[i] = FixedVector(i); - } - - thrust::device_vector< unsigned int > d_keys = h_keys; - thrust::device_vector< FixedVector > d_vals = h_vals; - - thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); - thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); - - ASSERT_EQUAL_QUIET(h_keys, d_keys); - ASSERT_EQUAL_QUIET(h_vals, d_vals); - - // so cuda::stable_merge_sort_by_key() is called - thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), greater_div_10()); - thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), greater_div_10()); - - ASSERT_EQUAL_QUIET(h_keys, d_keys); - ASSERT_EQUAL_QUIET(h_vals, d_vals); -} - -void TestStableSortByKeyWithLargeValues(void) -{ - _TestStableSortByKeyWithLargeValues(); - _TestStableSortByKeyWithLargeValues(); - _TestStableSortByKeyWithLargeValues(); - -// XXX these take too long to compile -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -// _TestStableSortByKeyWithLargeValues(); -} -DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues); - - -template -void _TestStableSortByKeyWithLargeKeysAndValues(void) -{ - size_t n = (128 * 1024) / 
sizeof(FixedVector); - - thrust::host_vector< FixedVector > h_keys(n); - thrust::host_vector< FixedVector > h_vals(n); - - for(size_t i = 0; i < n; i++) - { - h_keys[i] = FixedVector(rand()); - h_vals[i] = FixedVector(i); - } - - thrust::device_vector< FixedVector > d_keys = h_keys; - thrust::device_vector< FixedVector > d_vals = h_vals; - - thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); - thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); - - ASSERT_EQUAL_QUIET(h_keys, d_keys); - ASSERT_EQUAL_QUIET(h_vals, d_vals); -} - -void TestStableSortByKeyWithLargeKeysAndValues(void) -{ - _TestStableSortByKeyWithLargeKeysAndValues(); - _TestStableSortByKeyWithLargeKeysAndValues(); - _TestStableSortByKeyWithLargeKeysAndValues(); - -// XXX these take too long to compile -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -// _TestStableSortByKeyWithLargeKeysAndValues(); -} -DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues); - diff --git a/testing/stable_sort_by_key_large_keys.cu b/testing/stable_sort_by_key_large_keys.cu new file mode 100644 index 000000000..9ea4d51f8 --- /dev/null +++ b/testing/stable_sort_by_key_large_keys.cu @@ -0,0 +1,38 @@ +#include +#include + +#include + +template +void _TestStableSortByKeyWithLargeKeys(void) +{ + size_t n = (128 * 1024) / sizeof(FixedVector); + + thrust::host_vector> h_keys(n); + thrust::host_vector h_vals(n); + + for (size_t i = 0; i < n; i++) + { + const auto uint_i = static_cast(i); + const auto rand_int = unittest::generate_random_integer()(uint_i); + h_keys[i] = FixedVector(rand_int); + h_vals[i] = uint_i; + } + + 
thrust::device_vector> d_keys = h_keys; + thrust::device_vector d_vals = h_vals; + + thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); + thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); + + ASSERT_EQUAL_QUIET(h_keys, d_keys); + ASSERT_EQUAL_QUIET(h_vals, d_vals); +} + +void TestStableSortByKeyWithLargeKeys(void) +{ + _TestStableSortByKeyWithLargeKeys<4>(); + _TestStableSortByKeyWithLargeKeys<8>(); + _TestStableSortByKeyWithLargeKeys<16>(); +} +DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys); diff --git a/testing/stable_sort_by_key_large_keys_and_values.cu b/testing/stable_sort_by_key_large_keys_and_values.cu new file mode 100644 index 000000000..eed6b6efa --- /dev/null +++ b/testing/stable_sort_by_key_large_keys_and_values.cu @@ -0,0 +1,38 @@ +#include +#include + +#include + +template +void _TestStableSortByKeyWithLargeKeysAndValues() +{ + size_t n = (128 * 1024) / sizeof(FixedVector); + + thrust::host_vector> h_keys(n); + thrust::host_vector> h_vals(n); + + for (size_t i = 0; i < n; i++) + { + const auto uint_i = static_cast(i); + const auto rand_int = unittest::generate_random_integer()(uint_i); + h_keys[i] = FixedVector(rand_int); + h_vals[i] = FixedVector(static_cast(i)); + } + + thrust::device_vector> d_keys = h_keys; + thrust::device_vector> d_vals = h_vals; + + thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); + thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); + + ASSERT_EQUAL_QUIET(h_keys, d_keys); + ASSERT_EQUAL_QUIET(h_vals, d_vals); +} + +void TestStableSortByKeyWithLargeKeysAndValues() +{ + _TestStableSortByKeyWithLargeKeysAndValues<4>(); + _TestStableSortByKeyWithLargeKeysAndValues<8>(); + _TestStableSortByKeyWithLargeKeysAndValues<16>(); +} +DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues); diff --git a/testing/stable_sort_by_key_large_values.cu b/testing/stable_sort_by_key_large_values.cu new file mode 100644 index 000000000..b37753973 --- 
/dev/null +++ b/testing/stable_sort_by_key_large_values.cu @@ -0,0 +1,60 @@ +#include +#include + +#include + +template +struct greater_div_10 +{ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const + { + return ((int)lhs) / 10 > ((int)rhs) / 10; + } +}; + +template +void _TestStableSortByKeyWithLargeValues() +{ + size_t n = (128 * 1024) / sizeof(FixedVector); + + thrust::host_vector h_keys(n); + thrust::host_vector> h_vals(n); + + for (size_t i = 0; i < n; i++) + { + const auto uint_i = static_cast(i); + const auto rand_int = unittest::generate_random_integer()(uint_i); + h_keys[i] = rand_int; + h_vals[i] = FixedVector(static_cast(i)); + } + + thrust::device_vector d_keys = h_keys; + thrust::device_vector> d_vals = h_vals; + + thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin()); + thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin()); + + ASSERT_EQUAL_QUIET(h_keys, d_keys); + ASSERT_EQUAL_QUIET(h_vals, d_vals); + + // so cuda::stable_merge_sort_by_key() is called + thrust::stable_sort_by_key(h_keys.begin(), + h_keys.end(), + h_vals.begin(), + greater_div_10()); + thrust::stable_sort_by_key(d_keys.begin(), + d_keys.end(), + d_vals.begin(), + greater_div_10()); + + ASSERT_EQUAL_QUIET(h_keys, d_keys); + ASSERT_EQUAL_QUIET(h_vals, d_vals); +} + +void TestStableSortByKeyWithLargeValues() +{ + _TestStableSortByKeyWithLargeValues<4>(); + _TestStableSortByKeyWithLargeValues<8>(); + _TestStableSortByKeyWithLargeValues<16>(); +} +DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues); diff --git a/testing/stable_sort_large.cu b/testing/stable_sort_large.cu index 6b6b78b88..2b1907cea 100644 --- a/testing/stable_sort_large.cu +++ b/testing/stable_sort_large.cu @@ -24,22 +24,9 @@ void _TestStableSortWithLargeKeys(void) void TestStableSortWithLargeKeys(void) { - _TestStableSortWithLargeKeys(); _TestStableSortWithLargeKeys(); - _TestStableSortWithLargeKeys(); - _TestStableSortWithLargeKeys(); - 
_TestStableSortWithLargeKeys(); - _TestStableSortWithLargeKeys(); - _TestStableSortWithLargeKeys(); + _TestStableSortWithLargeKeys(); _TestStableSortWithLargeKeys(); - _TestStableSortWithLargeKeys(); - -// XXX these take too long to compile -// _TestStableSortWithLargeKeys(); -// _TestStableSortWithLargeKeys(); -// _TestStableSortWithLargeKeys(); -// _TestStableSortWithLargeKeys(); -// _TestStableSortWithLargeKeys(); } DECLARE_UNITTEST(TestStableSortWithLargeKeys); diff --git a/testing/swap_ranges.cu b/testing/swap_ranges.cu index a2d061fe3..843c66240 100644 --- a/testing/swap_ranges.cu +++ b/testing/swap_ranges.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include @@ -68,7 +68,7 @@ void TestSwapRangesSimple(void) ASSERT_EQUAL(v1[2], 7); ASSERT_EQUAL(v1[3], 8); ASSERT_EQUAL(v1[4], 9); - + ASSERT_EQUAL(v2[0], 0); ASSERT_EQUAL(v2[1], 1); ASSERT_EQUAL(v2[2], 2); @@ -88,11 +88,11 @@ void TestSwapRanges(const size_t n) thrust::host_vector h2 = a2; thrust::device_vector d1 = a1; thrust::device_vector d2 = a2; - + thrust::swap_ranges(h1.begin(), h1.end(), h2.begin()); thrust::swap_ranges(d1.begin(), d1.end(), d2.begin()); - ASSERT_EQUAL(h1, a2); + ASSERT_EQUAL(h1, a2); ASSERT_EQUAL(d1, a2); ASSERT_EQUAL(h2, a1); ASSERT_EQUAL(d2, a1); @@ -147,6 +147,10 @@ struct type_with_swap return m_x == other.m_x && m_swapped == other.m_swapped; } +#if THRUST_CPP_DIALECT >= 2011 + type_with_swap & operator=(const type_with_swap &) = default; +#endif + int m_x; bool m_swapped; }; diff --git a/testing/transform_input_output_iterator.cu b/testing/transform_input_output_iterator.cu new file mode 100644 index 000000000..7df163077 --- /dev/null +++ b/testing/transform_input_output_iterator.cu @@ -0,0 +1,122 @@ +#include +#include + +#include +#include +#include +#include +#include + +template +void TestTransformInputOutputIterator(void) +{ + typedef typename Vector::value_type T; + + typedef thrust::negate InputFunction; + typedef thrust::square OutputFunction; + typedef 
typename Vector::iterator Iterator; + + Vector input(4); + Vector squared(4); + Vector negated(4); + + // initialize input + thrust::sequence(input.begin(), input.end(), 1); + + // construct transform_iterator + thrust::transform_input_output_iterator + transform_iter(squared.begin(), InputFunction(), OutputFunction()); + + // transform_iter writes squared value + thrust::copy(input.begin(), input.end(), transform_iter); + + Vector gold_squared(4); + gold_squared[0] = 1; + gold_squared[1] = 4; + gold_squared[2] = 9; + gold_squared[3] = 16; + + ASSERT_EQUAL(squared, gold_squared); + + // negated value read from transform_iter + thrust::copy_n(transform_iter, squared.size(), negated.begin()); + + Vector gold_negated(4); + gold_negated[0] = -1; + gold_negated[1] = -4; + gold_negated[2] = -9; + gold_negated[3] = -16; + + ASSERT_EQUAL(negated, gold_negated); + +} +DECLARE_VECTOR_UNITTEST(TestTransformInputOutputIterator); + +template +void TestMakeTransformInputOutputIterator(void) +{ + typedef typename Vector::value_type T; + + typedef thrust::negate InputFunction; + typedef thrust::square OutputFunction; + + Vector input(4); + Vector negated(4); + Vector squared(4); + + // initialize input + thrust::sequence(input.begin(), input.end(), 1); + + // negated value read from transform iterator + thrust::copy_n(thrust::make_transform_input_output_iterator(input.begin(), InputFunction(), OutputFunction()), + input.size(), negated.begin()); + + Vector gold_negated(4); + gold_negated[0] = -1; + gold_negated[1] = -2; + gold_negated[2] = -3; + gold_negated[3] = -4; + + ASSERT_EQUAL(negated, gold_negated); + + // squared value writen by transform iterator + thrust::copy(negated.begin(), negated.end(), + thrust::make_transform_input_output_iterator(squared.begin(), InputFunction(), OutputFunction())); + + Vector gold_squared(4); + gold_squared[0] = 1; + gold_squared[1] = 4; + gold_squared[2] = 9; + gold_squared[3] = 16; + + ASSERT_EQUAL(squared, gold_squared); + +} 
+DECLARE_VECTOR_UNITTEST(TestMakeTransformInputOutputIterator); + +template +struct TestTransformInputOutputIteratorScan +{ + void operator()(const size_t n) + { + thrust::host_vector h_data = unittest::random_samples(n); + thrust::device_vector d_data = h_data; + + thrust::host_vector h_result(n); + thrust::device_vector d_result(n); + + // run on host (uses forward iterator negate) + thrust::inclusive_scan(thrust::make_transform_input_output_iterator(h_data.begin(), thrust::negate(), thrust::identity()), + thrust::make_transform_input_output_iterator(h_data.end(), thrust::negate(), thrust::identity()), + h_result.begin()); + // run on device (uses reverse iterator negate) + thrust::inclusive_scan(d_data.begin(), d_data.end(), + thrust::make_transform_input_output_iterator( + d_result.begin(), thrust::square(), thrust::negate())); + + + ASSERT_EQUAL(h_result, d_result); + } +}; +VariableUnitTest TestTransformInputOutputIteratorScanInstance; + diff --git a/testing/transform_iterator.cu b/testing/transform_iterator.cu index e28e333e1..a960a0b44 100644 --- a/testing/transform_iterator.cu +++ b/testing/transform_iterator.cu @@ -7,6 +7,8 @@ #include #include +#include + template void TestTransformIterator(void) { @@ -84,3 +86,28 @@ struct TestTransformIteratorReduce }; VariableUnitTest TestTransformIteratorReduceInstance; + +struct ExtractValue{ + int operator()(std::unique_ptr const& n){ + return *n; + } +}; + +void TestTransformIteratorNonCopyable(){ + + thrust::host_vector> hv(4); + hv[0].reset(new int{1}); + hv[1].reset(new int{2}); + hv[2].reset(new int{3}); + hv[3].reset(new int{4}); + + auto transformed = thrust::make_transform_iterator(hv.begin(), ExtractValue{}); + ASSERT_EQUAL(transformed[0], 1); + ASSERT_EQUAL(transformed[1], 2); + ASSERT_EQUAL(transformed[2], 3); + ASSERT_EQUAL(transformed[3], 4); + +} + +DECLARE_UNITTEST(TestTransformIteratorNonCopyable); + diff --git a/testing/transform_output_iterator.cu b/testing/transform_output_iterator.cu index 
cdeb950f1..27f8b53bd 100644 --- a/testing/transform_output_iterator.cu +++ b/testing/transform_output_iterator.cu @@ -1,25 +1,27 @@ #include -#include #include -#include +#include #include -#include +#include #include +#include +#include +#include template void TestTransformOutputIterator(void) { typedef typename Vector::value_type T; - typedef thrust::negate UnaryFunction; + typedef thrust::square UnaryFunction; typedef typename Vector::iterator Iterator; Vector input(4); Vector output(4); // initialize input - thrust::sequence(input.begin(), input.end(), 1); + thrust::sequence(input.begin(), input.end(), T{1}); // construct transform_iterator thrust::transform_output_iterator output_iter(output.begin(), UnaryFunction()); @@ -27,10 +29,10 @@ void TestTransformOutputIterator(void) thrust::copy(input.begin(), input.end(), output_iter); Vector gold_output(4); - gold_output[0] = -1; - gold_output[1] = -2; - gold_output[2] = -3; - gold_output[3] = -4; + gold_output[0] = 1; + gold_output[1] = 4; + gold_output[2] = 9; + gold_output[3] = 16; ASSERT_EQUAL(output, gold_output); @@ -42,7 +44,7 @@ void TestMakeTransformOutputIterator(void) { typedef typename Vector::value_type T; - typedef thrust::negate UnaryFunction; + typedef thrust::square UnaryFunction; Vector input(4); Vector output(4); @@ -54,11 +56,10 @@ void TestMakeTransformOutputIterator(void) thrust::make_transform_output_iterator(output.begin(), UnaryFunction())); Vector gold_output(4); - gold_output[0] = -1; - gold_output[1] = -2; - gold_output[2] = -3; - gold_output[3] = -4; - + gold_output[0] = 1; + gold_output[1] = 4; + gold_output[2] = 9; + gold_output[3] = 16; ASSERT_EQUAL(output, gold_output); } @@ -88,5 +89,5 @@ struct TestTransformOutputIteratorScan ASSERT_EQUAL(h_result, d_result); } }; -VariableUnitTest TestTransformOutputIteratorScanInstance; +VariableUnitTest TestTransformOutputIteratorScanInstance; diff --git a/testing/transform_output_iterator_reduce_by_key.cu 
b/testing/transform_output_iterator_reduce_by_key.cu new file mode 100644 index 000000000..f7004f8c7 --- /dev/null +++ b/testing/transform_output_iterator_reduce_by_key.cu @@ -0,0 +1,51 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +template +struct TestTransformOutputIteratorReduceByKey +{ + void operator()(const size_t n) + { + thrust::host_vector h_keys = unittest::random_samples(n); + thrust::sort(h_keys.begin(), h_keys.end()); + thrust::device_vector d_keys = h_keys; + + thrust::host_vector h_values = unittest::random_samples(n); + thrust::device_vector d_values = h_values; + + thrust::host_vector h_result(n); + thrust::device_vector d_result(n); + + // run on host + thrust::reduce_by_key(thrust::host, + h_keys.begin(), + h_keys.end(), + thrust::make_transform_iterator(h_values.begin(), thrust::negate()), + thrust::discard_iterator{}, + h_result.begin()); + // run on device + thrust::reduce_by_key(thrust::device, + d_keys.begin(), + d_keys.end(), + d_values.begin(), + thrust::discard_iterator{}, + thrust::make_transform_output_iterator(d_result.begin(), + thrust::negate())); + + ASSERT_EQUAL(h_result, d_result); + } +}; +VariableUnitTest + TestTransformOutputIteratorReduceByKeyInstance; + diff --git a/testing/transform_scan.cu b/testing/transform_scan.cu index 2e6633923..2b6e35a2a 100644 --- a/testing/transform_scan.cu +++ b/testing/transform_scan.cu @@ -190,6 +190,61 @@ void TestTransformScanSimple(void) } DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanSimple); +struct Record { + int number; + + bool operator==(const Record& rhs) const { + return number == rhs.number; + } + bool operator!=(const Record& rhs) const { + return !(rhs == *this); + } + friend Record operator+(Record lhs, const Record& rhs) { + lhs.number += rhs.number; + return lhs; + } + friend std::ostream& operator<<(std::ostream& os, const Record& record) { + os << "number: " << record.number; + return os; + } +}; + 
+struct negate { + __host__ __device__ int operator()(Record const& record) const + { + return - record.number; + } +}; + +void TestTransformInclusiveScanDifferentTypes() +{ + typename thrust::host_vector::iterator h_iter; + + thrust::host_vector h_input(5); + thrust::host_vector h_output(5); + thrust::host_vector result(5); + + h_input[0] = {1}; h_input[1] = {3}; h_input[2] = {-2}; h_input[3] = {4}; h_input[4] = {-5}; + + thrust::host_vector input_copy(h_input); + + h_iter = thrust::transform_inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), negate{}, thrust::plus{}); + result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1; + ASSERT_EQUAL(std::size_t(h_iter - h_output.begin()), h_input.size()); + ASSERT_EQUAL(h_input, input_copy); + ASSERT_EQUAL(h_output, result); + + typename thrust::device_vector::iterator d_iter; + + thrust::device_vector d_input = h_input; + thrust::device_vector d_output(5); + + d_iter = thrust::transform_inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), negate{}, thrust::plus{}); + ASSERT_EQUAL(std::size_t(d_iter - d_output.begin()), d_input.size()); + ASSERT_EQUAL(d_input, input_copy); + ASSERT_EQUAL(d_output, result); +} +DECLARE_UNITTEST(TestTransformInclusiveScanDifferentTypes); template struct TestTransformScan @@ -292,3 +347,55 @@ struct TestTransformScanToDiscardIterator }; VariableUnitTest TestTransformScanToDiscardIteratorInstance; +// Regression test for https://github.com/NVIDIA/thrust/issues/1332 +// The issue was the internal transform_input_iterator_t created by the +// transform_inclusive_scan implementation was instantiated using a reference +// type for the value_type. 
+template +void TestValueCategoryDeduction() +{ + thrust::device_vector vec; + + T a_h[10] = {5, 0, 5, 8, 6, 7, 5, 3, 0, 9}; + vec.assign((T*)a_h, a_h + 10); + + + thrust::transform_inclusive_scan(thrust::device, + vec.cbegin(), + vec.cend(), + vec.begin(), + thrust::identity<>{}, + thrust::maximum<>{}); + + ASSERT_EQUAL(T{5}, vec[0]); + ASSERT_EQUAL(T{5}, vec[1]); + ASSERT_EQUAL(T{5}, vec[2]); + ASSERT_EQUAL(T{8}, vec[3]); + ASSERT_EQUAL(T{8}, vec[4]); + ASSERT_EQUAL(T{8}, vec[5]); + ASSERT_EQUAL(T{8}, vec[6]); + ASSERT_EQUAL(T{8}, vec[7]); + ASSERT_EQUAL(T{8}, vec[8]); + ASSERT_EQUAL(T{9}, vec[9]); + + vec.assign((T*)a_h, a_h + 10); + thrust::transform_exclusive_scan(thrust::device, + vec.cbegin(), + vec.cend(), + vec.begin(), + thrust::identity<>{}, + T{}, + thrust::maximum<>{}); + + ASSERT_EQUAL(T{0}, vec[0]); + ASSERT_EQUAL(T{5}, vec[1]); + ASSERT_EQUAL(T{5}, vec[2]); + ASSERT_EQUAL(T{5}, vec[3]); + ASSERT_EQUAL(T{8}, vec[4]); + ASSERT_EQUAL(T{8}, vec[5]); + ASSERT_EQUAL(T{8}, vec[6]); + ASSERT_EQUAL(T{8}, vec[7]); + ASSERT_EQUAL(T{8}, vec[8]); + ASSERT_EQUAL(T{8}, vec[9]); +} +DECLARE_GENERIC_UNITTEST(TestValueCategoryDeduction); diff --git a/testing/tuple_algorithms.cu b/testing/tuple_algorithms.cu index 1a7b48dec..449fdc2f1 100644 --- a/testing/tuple_algorithms.cu +++ b/testing/tuple_algorithms.cu @@ -5,26 +5,58 @@ #include #include +#include // FIXME: Replace with C++14 style `thrust::square<>` when we have it. 
struct custom_square { template + __host__ __device__ T operator()(T v) const { - return v * v; + return v * v; } }; +struct custom_square_inplace +{ + template + __host__ __device__ + void operator()(T& v) const + { + v *= v; + } +}; + +void test_tuple_subset() +{ + auto t0 = std::make_tuple(0, 2, 3.14); + + auto t1 = thrust::tuple_subset(t0, thrust::index_sequence<2, 0>{}); + + ASSERT_EQUAL_QUIET(t1, std::make_tuple(3.14, 0)); +} +DECLARE_UNITTEST(test_tuple_subset); + void test_tuple_transform() { auto t0 = std::make_tuple(0, 2, 3.14); - auto t1 = thrust::tuple_transform(t0, custom_square{}); + auto t1 = thrust::tuple_transform(t0, custom_square{}); ASSERT_EQUAL_QUIET(t1, std::make_tuple(0, 4, 9.8596)); } DECLARE_UNITTEST(test_tuple_transform); - + +void test_tuple_for_each() +{ + auto t = std::make_tuple(0, 2, 3.14); + + thrust::tuple_for_each(t, custom_square_inplace{}); + + ASSERT_EQUAL_QUIET(t, std::make_tuple(0, 4, 9.8596)); +} +DECLARE_UNITTEST(test_tuple_for_each); + #endif // THRUST_CPP_DIALECT >= 2011 diff --git a/testing/tuple_scan.cu b/testing/tuple_scan.cu index c15b81751..d0565d6d4 100644 --- a/testing/tuple_scan.cu +++ b/testing/tuple_scan.cu @@ -58,18 +58,6 @@ struct TestTupleScan inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), SumTupleFunctor()); ASSERT_EQUAL_QUIET(h_output, d_output); - // The tests below get miscompiled on Tesla hw for 8b types - -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - if(const CUDATestDriver *driver = dynamic_cast(&UnitTestDriver::s_driver())) - { - if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200) - { - KNOWN_FAILURE; - } // end if - } // end if -#endif - // exclusive_scan tuple init(13,17); exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), init, SumTupleFunctor()); diff --git a/testing/uninitialized_copy.cu b/testing/uninitialized_copy.cu index 7455d8c81..62a79cdc9 100644 --- a/testing/uninitialized_copy.cu +++ 
b/testing/uninitialized_copy.cu @@ -3,6 +3,7 @@ #include #include +#include template ForwardIterator uninitialized_copy(my_system &system, @@ -147,13 +148,13 @@ struct CopyConstructTest __host__ __device__ CopyConstructTest(const CopyConstructTest &) { -#if __CUDA_ARCH__ - copy_constructed_on_device = true; - copy_constructed_on_host = false; -#else - copy_constructed_on_device = false; - copy_constructed_on_device = true; -#endif + NV_IF_TARGET(NV_IS_DEVICE, ( + copy_constructed_on_device = true; + copy_constructed_on_host = false; + ), ( + copy_constructed_on_device = false; + copy_constructed_on_host = true; + )); } __host__ __device__ diff --git a/testing/uninitialized_fill.cu b/testing/uninitialized_fill.cu index 5e0d53c72..8fbb97002 100644 --- a/testing/uninitialized_fill.cu +++ b/testing/uninitialized_fill.cu @@ -3,6 +3,7 @@ #include #include +#include template void uninitialized_fill(my_system &system, @@ -147,6 +148,7 @@ DECLARE_VECTOR_UNITTEST(TestUninitializedFillPOD); struct CopyConstructTest { + __host__ __device__ CopyConstructTest(void) :copy_constructed_on_host(false), copy_constructed_on_device(false) @@ -155,13 +157,13 @@ struct CopyConstructTest __host__ __device__ CopyConstructTest(const CopyConstructTest &) { -#if __CUDA_ARCH__ - copy_constructed_on_device = true; - copy_constructed_on_host = false; -#else - copy_constructed_on_device = false; - copy_constructed_on_host = true; -#endif + NV_IF_TARGET(NV_IS_DEVICE, ( + copy_constructed_on_device = true; + copy_constructed_on_host = false; + ), ( + copy_constructed_on_device = false; + copy_constructed_on_host = true; + )); } __host__ __device__ diff --git a/testing/unique.cu b/testing/unique.cu index 8073832df..7df2def87 100644 --- a/testing/unique.cu +++ b/testing/unique.cu @@ -95,6 +95,50 @@ void TestUniqueCopyDispatchImplicit() DECLARE_UNITTEST(TestUniqueCopyDispatchImplicit); +template +typename thrust::iterator_traits::difference_type + unique_count(my_system &system, + ForwardIterator, + 
ForwardIterator) +{ + system.validate_dispatch(); + return 0; +} + +void TestUniqueCountDispatchExplicit() +{ + thrust::device_vector vec(1); + + my_system sys(0); + thrust::unique_count(sys, vec.begin(), vec.begin()); + + ASSERT_EQUAL(true, sys.is_valid()); +} +DECLARE_UNITTEST(TestUniqueCountDispatchExplicit); + + +template +typename thrust::iterator_traits::difference_type + unique_count(my_tag, + ForwardIterator, + ForwardIterator) +{ + return 13; +} + +void TestUniqueCountDispatchImplicit() +{ + thrust::device_vector vec(1); + + auto result = thrust::unique_count( + thrust::retag(vec.begin()), + thrust::retag(vec.begin())); + + ASSERT_EQUAL(13, result); +} +DECLARE_UNITTEST(TestUniqueCountDispatchImplicit); + + template struct is_equal_div_10_unique { @@ -266,3 +310,48 @@ struct TestUniqueCopyToDiscardIterator VariableUnitTest TestUniqueCopyToDiscardIteratorInstance; +template +void TestUniqueCountSimple(void) +{ + typedef typename Vector::value_type T; + + Vector data(10); + data[0] = 11; + data[1] = 11; + data[2] = 12; + data[3] = 20; + data[4] = 29; + data[5] = 21; + data[6] = 21; + data[7] = 31; + data[8] = 31; + data[9] = 37; + + int count = thrust::unique_count(data.begin(), data.end()); + + ASSERT_EQUAL(count, 7); + + int div_10_count = thrust::unique_count(data.begin(), data.end(), is_equal_div_10_unique()); + + ASSERT_EQUAL(div_10_count, 3); +} +DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCountSimple); + +template +struct TestUniqueCount +{ + void operator()(const size_t n) + { + thrust::host_vector h_data = unittest::random_integers(n); + thrust::device_vector d_data = h_data; + + int h_count{}; + int d_count{}; + + h_count = thrust::unique_count(h_data.begin(), h_data.end()); + d_count = thrust::unique_count(d_data.begin(), d_data.end()); + + ASSERT_EQUAL(h_count, d_count); + } +}; +VariableUnitTest TestUniqueCountInstance; diff --git a/testing/unittest/CMakeLists.txt b/testing/unittest/CMakeLists.txt new file mode 100644 index 000000000..4c0eb66cb 
--- /dev/null +++ b/testing/unittest/CMakeLists.txt @@ -0,0 +1,24 @@ +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + set(framework_target ${config_prefix}.test.framework) + + if ("CUDA" STREQUAL "${config_device}") + set(framework_srcs + testframework.cu + cuda/testframework.cu + ) + else() + # Wrap the cu file inside a .cpp file for non-CUDA builds + thrust_wrap_cu_in_cpp(framework_srcs testframework.cu ${thrust_target}) + endif() + + add_library(${framework_target} STATIC ${framework_srcs}) + target_link_libraries(${framework_target} PUBLIC ${thrust_target}) + target_include_directories(${framework_target} PRIVATE "${Thrust_SOURCE_DIR}/testing") + thrust_clone_target_properties(${framework_target} ${thrust_target}) + + thrust_fix_clang_nvcc_build_for(${framework_target}) + +endforeach() diff --git a/testing/unittest/assertions.h b/testing/unittest/assertions.h index 6803e8168..855d705a4 100644 --- a/testing/unittest/assertions.h +++ b/testing/unittest/assertions.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -98,15 +99,15 @@ double const DEFAULT_ABSOLUTE_TOL = 1e-4; template struct value_type { - typedef typename thrust::detail::remove_const< - typename thrust::detail::remove_reference< + typedef typename THRUST_NS_QUALIFIER::detail::remove_const< + typename THRUST_NS_QUALIFIER::detail::remove_reference< T >::type >::type type; }; template - struct value_type< thrust::device_reference > + struct value_type< THRUST_NS_QUALIFIER::device_reference > { typedef typename value_type::type type; }; @@ -327,7 +328,7 @@ void assert_almost_equal(T1 a, T2 b, template -void assert_almost_equal(thrust::complex a, thrust::complex b, +void assert_almost_equal(THRUST_NS_QUALIFIER::complex a, THRUST_NS_QUALIFIER::complex b, const std::string& filename = "unknown", int lineno = -1, double a_tol = 
DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL) @@ -343,7 +344,7 @@ void assert_almost_equal(thrust::complex a, thrust::complex b, template - void assert_almost_equal(const thrust::complex& a, const std::complex& b, + void assert_almost_equal(const THRUST_NS_QUALIFIER::complex& a, const std::complex& b, const std::string& filename = "unknown", int lineno = -1, double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL) @@ -370,13 +371,13 @@ class almost_equal_to template -class almost_equal_to > +class almost_equal_to > { public: double a_tol, r_tol; almost_equal_to(double _a_tol = DEFAULT_ABSOLUTE_TOL, double _r_tol = DEFAULT_RELATIVE_TOL) : a_tol(_a_tol), r_tol(_r_tol) {} - bool operator()(const thrust::complex& a, const thrust::complex& b) const { - return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol) + bool operator()(const THRUST_NS_QUALIFIER::complex& a, const THRUST_NS_QUALIFIER::complex& b) const { + return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol) && almost_equal((double) a.imag(), (double) b.imag(), a_tol, r_tol); } }; @@ -388,15 +389,15 @@ template ::type difference_type; - typedef typename thrust::iterator_value::type InputType; - + typedef typename THRUST_NS_QUALIFIER::iterator_difference::type difference_type; + typedef typename THRUST_NS_QUALIFIER::iterator_value::type InputType; + bool failure = false; - difference_type length1 = thrust::distance(first1, last1); - difference_type length2 = thrust::distance(first2, last2); - - difference_type min_length = thrust::min(length1, length2); + difference_type length1 = THRUST_NS_QUALIFIER::distance(first1, last1); + difference_type length2 = THRUST_NS_QUALIFIER::distance(first2, last2); + + difference_type min_length = THRUST_NS_QUALIFIER::min(length1, length2); unittest::UnitTestFailure f; f << "[" << filename << ":" << lineno << "] "; @@ -409,7 +410,7 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat } // 
check values - + size_t mismatches = 0; for (difference_type i = 0; i < min_length; i++) @@ -427,10 +428,14 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat if(mismatches <= MAX_OUTPUT_LINES) { - if (sizeof(InputType) == 1) + THRUST_IF_CONSTEXPR(sizeof(InputType) == 1) + { f << " [" << i << "] " << *first1 + InputType() << " " << *first2 + InputType() << "\n"; // unprintable chars are a problem + } else + { f << " [" << i << "] " << *first1 << " " << *first2 << "\n"; + } } } @@ -458,8 +463,8 @@ template void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2, const std::string& filename = "unknown", int lineno = -1) { - typedef typename thrust::iterator_traits::value_type InputType; - assert_equal(first1, last1, first2, last2, thrust::equal_to(), filename, lineno); + typedef typename THRUST_NS_QUALIFIER::iterator_traits::value_type InputType; + assert_equal(first1, last1, first2, last2, THRUST_NS_QUALIFIER::equal_to(), filename, lineno); } @@ -468,79 +473,190 @@ void assert_almost_equal(ForwardIterator1 first1, ForwardIterator1 last1, Forwar const std::string& filename = "unknown", int lineno = -1, const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) { - typedef typename thrust::iterator_traits::value_type InputType; + typedef typename THRUST_NS_QUALIFIER::iterator_traits::value_type InputType; assert_equal(first1, last1, first2, last2, almost_equal_to(a_tol, r_tol), filename, lineno); } +template +void assert_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, + const std::string& filename = "unknown", int lineno = -1) +{ + assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno); +} template -void assert_equal(const thrust::host_vector& A, const thrust::host_vector& B, +void assert_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, + const 
std::string& filename = "unknown", int lineno = -1) +{ + THRUST_NS_QUALIFIER::host_vector B_host = B; + assert_equal(A, B_host, filename, lineno); +} + +template +void assert_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, + const std::string& filename = "unknown", int lineno = -1) +{ + THRUST_NS_QUALIFIER::host_vector A_host = A; + assert_equal(A_host, B, filename, lineno); +} + +template +void assert_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, + const std::string& filename = "unknown", int lineno = -1) +{ + THRUST_NS_QUALIFIER::host_vector A_host = A; + THRUST_NS_QUALIFIER::host_vector B_host = B; + assert_equal(A_host, B_host, filename, lineno); +} + +template +void assert_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, const std::string& filename = "unknown", int lineno = -1) { assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno); } template -void assert_almost_equal(const thrust::host_vector& A, const thrust::host_vector& B, - const std::string& filename = "unknown", int lineno = -1, - const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +void assert_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, + const std::string& filename = "unknown", int lineno = -1) { - assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); + assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno); } template -void assert_equal(const thrust::host_vector& A, const thrust::device_vector& B, +void assert_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, const std::string& filename = "unknown", int lineno = -1) { - thrust::host_vector B_host = B; - assert_equal(A, B_host, filename, lineno); + assert_equal(A.begin(), A.end(), B.begin(), 
B.end(), filename, lineno); } template -void assert_equal(const thrust::device_vector& A, const thrust::host_vector& B, +void assert_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, const std::string& filename = "unknown", int lineno = -1) { - thrust::host_vector A_host = A; + THRUST_NS_QUALIFIER::host_vector A_host = A; assert_equal(A_host, B, filename, lineno); } template -void assert_equal(const thrust::device_vector& A, const thrust::device_vector& B, +void assert_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, const std::string& filename = "unknown", int lineno = -1) { - thrust::host_vector A_host = A; - thrust::host_vector B_host = B; - assert_equal(A_host, B_host, filename, lineno); + THRUST_NS_QUALIFIER::host_vector B_host = B; + assert_equal(A, B_host, filename, lineno); +} + +template +void assert_equal(const std::vector& A, const std::vector& B, + const std::string& filename = "unknown", int lineno = -1) +{ + assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno); +} + +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); } template -void assert_almost_equal(const thrust::host_vector& A, const thrust::device_vector& B, +void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, const std::string& filename = "unknown", int lineno = -1, const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) { - thrust::host_vector B_host = B; + THRUST_NS_QUALIFIER::host_vector B_host = B; assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol); } 
template -void assert_almost_equal(const thrust::device_vector& A, const thrust::host_vector& B, +void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, const std::string& filename = "unknown", int lineno = -1, const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) { - thrust::host_vector A_host = A; + THRUST_NS_QUALIFIER::host_vector A_host = A; assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol); } template -void assert_almost_equal(const thrust::device_vector& A, const thrust::device_vector& B, +void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, const std::string& filename = "unknown", int lineno = -1, const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) { - thrust::host_vector A_host = A; - thrust::host_vector B_host = B; + THRUST_NS_QUALIFIER::host_vector A_host = A; + THRUST_NS_QUALIFIER::host_vector B_host = B; assert_almost_equal(A_host, B_host, filename, lineno, a_tol, r_tol); } +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); +} + +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); +} + +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::host_vector& B, + 
const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); +} + +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector& A, + const THRUST_NS_QUALIFIER::universal_vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + THRUST_NS_QUALIFIER::host_vector A_host = A; + assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol); +} + +template +void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector& A, + const THRUST_NS_QUALIFIER::device_vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + THRUST_NS_QUALIFIER::host_vector B_host = B; + assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol); +} + +template +void assert_almost_equal(const std::vector& A, const std::vector& B, + const std::string& filename = "unknown", int lineno = -1, + const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL) +{ + assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol); +} + enum threw_status { did_not_throw diff --git a/testing/unittest/cuda/testframework.cu b/testing/unittest/cuda/testframework.cu index 8f2073157..ff30f368c 100644 --- a/testing/unittest/cuda/testframework.cu +++ b/testing/unittest/cuda/testframework.cu @@ -2,6 +2,7 @@ #include #include #include +#include __global__ void dummy_kernel() {} @@ -28,15 +29,15 @@ void list_devices(void) { std::cout << "There is no device supporting CUDA" << std::endl; } - + int selected_device; cudaGetDevice(&selected_device); - + for (int dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; 
cudaGetDeviceProperties(&deviceProp, dev); - + if(dev == 0) { if(deviceProp.major == 9999 && deviceProp.minor == 9999) @@ -46,12 +47,12 @@ void list_devices(void) else std::cout << "There are " << deviceCount << " devices supporting CUDA" << std:: endl; } - + std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\""; if(dev == selected_device) std::cout << " [SELECTED]"; std::cout << std::endl; - + std::cout << " Major revision number: " << deviceProp.major << std::endl; std::cout << " Minor revision number: " << deviceProp.minor << std::endl; std::cout << " Total amount of global memory: " << deviceProp.totalGlobalMem << " bytes" << std::endl; @@ -69,27 +70,25 @@ template Iterator my_next(Iterator iter) std::vector CUDATestDriver::target_devices(const ArgumentMap &kwargs) { std::vector result; - + // by default, test all devices in the system (device id -1) int device_id = kwargs.count("device") ? atoi(kwargs.find("device")->second.c_str()) : -1; - + if(device_id < 0) { // target all devices in the system int count = 0; cudaGetDeviceCount(&count); - + result.resize(count); - // XXX iota is not available in c++03 - for(int i = 0; i < count; ++i) - result[i] = i; + std::iota(result.begin(), result.end(), 0); } else { // target the specified device result = std::vector(1,device_id); } - + return result; } @@ -106,12 +105,12 @@ bool CUDATestDriver::check_cuda_error(bool concise) << std::string(cudaGetErrorString(error)) << "]" << std::endl; } - } + } return cudaSuccess != error; } -bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise) +bool CUDATestDriver::post_test_smoke_check(const UnitTest &test, bool concise) { cudaError_t const error = cudaDeviceSynchronize(); if(cudaSuccess != error) @@ -128,7 +127,7 @@ bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise) return cudaSuccess == error; } - + bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwargs) { bool verbose = 
kwargs.count("verbose"); @@ -138,22 +137,21 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg { std::cout << "--verbose and --concise cannot be used together" << std::endl; exit(EXIT_FAILURE); - return false; } // check error status before doing anything if(check_cuda_error(concise)) return false; - + bool result = true; if(kwargs.count("verbose")) { list_devices(); } - + // figure out which devices to target std::vector devices = target_devices(kwargs); - + // target each device for(std::vector::iterator device = devices.begin(); device != devices.end(); @@ -171,7 +169,7 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg // note which device we're skipping cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, *device); - + std::cout << "Skipping Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl; continue; @@ -182,23 +180,23 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg // note which device we're testing cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, *device); - + std::cout << "Testing Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl; } // check error status before running any tests if(check_cuda_error(concise)) return false; - + // run tests result &= UnitTestDriver::run_tests(args, kwargs); - + if(!concise && my_next(device) != devices.end()) { // provide some separation between the output of separate tests std::cout << std::endl; } } - + return result; } diff --git a/testing/unittest/cuda/testframework.h b/testing/unittest/cuda/testframework.h index 953f88c1c..34a3dce5a 100644 --- a/testing/unittest/cuda/testframework.h +++ b/testing/unittest/cuda/testframework.h @@ -16,7 +16,7 @@ class CUDATestDriver bool check_cuda_error(bool concise); - virtual bool post_test_sanity_check(const UnitTest &test, bool concise); + virtual bool post_test_smoke_check(const UnitTest &test, bool concise); virtual 
bool run_tests(const ArgumentSet &args, const ArgumentMap &kwargs); }; diff --git a/testing/unittest/meta.h b/testing/unittest/meta.h index 39c62edb6..ed492634b 100644 --- a/testing/unittest/meta.h +++ b/testing/unittest/meta.h @@ -13,49 +13,10 @@ namespace unittest struct null_type {}; // this type encapsulates a list of -// up to 10 types -template +// types +template struct type_list { - typedef T0 type_0; - typedef T1 type_1; - typedef T2 type_2; - typedef T3 type_3; - typedef T4 type_4; - typedef T5 type_5; - typedef T6 type_6; - typedef T7 type_7; - typedef T8 type_8; - typedef T9 type_9; - typedef T10 type_10; - typedef T11 type_11; - typedef T12 type_12; - typedef T13 type_13; - typedef T14 type_14; - typedef T15 type_15; - typedef T16 type_16; - typedef T17 type_17; - typedef T18 type_18; - typedef T19 type_19; }; // this type provides a way of indexing @@ -66,26 +27,17 @@ template typedef null_type type; }; -template struct get_type { typedef typename List::type_0 type; }; -template struct get_type { typedef typename List::type_1 type; }; -template struct get_type { typedef typename List::type_2 type; }; -template struct get_type { typedef typename List::type_3 type; }; -template struct get_type { typedef typename List::type_4 type; }; -template struct get_type { typedef typename List::type_5 type; }; -template struct get_type { typedef typename List::type_6 type; }; -template struct get_type { typedef typename List::type_7 type; }; -template struct get_type { typedef typename List::type_8 type; }; -template struct get_type { typedef typename List::type_9 type; }; -template struct get_type { typedef typename List::type_10 type; }; -template struct get_type { typedef typename List::type_11 type; }; -template struct get_type { typedef typename List::type_12 type; }; -template struct get_type { typedef typename List::type_13 type; }; -template struct get_type { typedef typename List::type_14 type; }; -template struct get_type { typedef typename List::type_15 
type; }; -template struct get_type { typedef typename List::type_16 type; }; -template struct get_type { typedef typename List::type_17 type; }; -template struct get_type { typedef typename List::type_18 type; }; -template struct get_type { typedef typename List::type_19 type; }; +template + struct get_type, 0> +{ + typedef T type; +}; + +template + struct get_type, i> +{ + typedef typename get_type, i - 1>::type type; +}; // this type and its specialization provides a way to // iterate over a type_list, and @@ -196,64 +148,26 @@ template