Merged
44 changes: 44 additions & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
@@ -608,6 +608,50 @@ namespace xsimd
            return vmaxq_f64(lhs, rhs);
        }

        /********
         * mask *
         ********/

        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept

onalante-ebay (Contributor) commented on Dec 27, 2025:

It seems like this has lower block throughput than the non-NEON64 variant: https://godbolt.org/z/szPjEzPW7.

Contributor:

Aha, benchmarks on actual CPUs were faster with vaddv: DLTcollab/sse2neon@ed179d7.

Contributor:

The results might need reevaluation for u{16,32}. I can put together a benchmark since I am using an M2-based device.

onalante-ebay (Contributor) commented on Dec 27, 2025:

I can confirm faster execution for u{8,16,32} on this crude benchmark:

PATCH
diff --git a/benchmark/main.cpp b/benchmark/main.cpp
index 7a630e4..e921566 100644
--- a/benchmark/main.cpp
+++ b/benchmark/main.cpp
@@ -12,6 +12,15 @@
 #include "xsimd_benchmark.hpp"
 #include <map>

+void benchmark_mask()
+{
+    std::size_t size = 20000;
+    xsimd::run_mask_benchmark<uint8_t>(std::cout, size, 1000);
+    xsimd::run_mask_benchmark<uint16_t>(std::cout, size, 1000);
+    xsimd::run_mask_benchmark<uint32_t>(std::cout, size, 1000);
+    xsimd::run_mask_benchmark<uint64_t>(std::cout, size, 1000);
+}
+
 void benchmark_operation()
 {
     // std::size_t size = 9984;
@@ -112,6 +121,7 @@ void benchmark_basic_math()
 int main(int argc, char* argv[])
 {
     const std::map<std::string, std::pair<std::string, void (*)()>> fn_map = {
+        { "mask", { "mask", benchmark_mask } },
         { "op", { "arithmetic", benchmark_operation } },
         { "exp", { "exponential and logarithm", benchmark_exp_log } },
         { "trigo", { "trigonometric", benchmark_trigo } },
diff --git a/benchmark/xsimd_benchmark.hpp b/benchmark/xsimd_benchmark.hpp
index 6f6b91b..8b8447c 100644
--- a/benchmark/xsimd_benchmark.hpp
+++ b/benchmark/xsimd_benchmark.hpp
@@ -16,6 +16,7 @@
 #include "xsimd/xsimd.hpp"
 #include <chrono>
 #include <iostream>
+#include <random>
 #include <string>
 #include <vector>

@@ -310,6 +311,38 @@ namespace xsimd
         return t_res;
     }

+    template <class T, class OS, kernel::detail::enable_integral_t<T> = 0>
+    void run_mask_benchmark(OS& out, std::size_t size, std::size_t iter)
+    {
+        bench_vector<T> f_lhs;
+        // NOTE: This is a hack to match the signature of `benchmark_simd{,_unrolled}`.
+        bench_vector<T> f_res;
+
+        size = size / batch<T>::size * batch<T>::size;
+        f_lhs.resize(size);
+        f_res.resize(size);
+
+        std::minstd_rand rng(1337);
+        std::bernoulli_distribution dist;
+        for (std::size_t i = 0; i < size; ++i)
+        {
+            f_lhs[i] = static_cast<T>(dist(rng));
+        }
+
+        const auto mask_functor = [](batch<T> const& x)
+        {
+            return (x == batch<T>(0)).mask();
+        };
+        const auto time = benchmark_simd<batch<T>>(mask_functor, f_lhs, f_res, iter);
+        const auto time_unr = benchmark_simd_unrolled<batch<T>>(mask_functor, f_lhs, f_res, iter);
+
+        out << "============================" << std::endl;
+        out << "mask" << sizeof(T) * 8 << std::endl;
+        out << "vector            : " << time.count() << "ms" << std::endl;
+        out << "vector unr        : " << time_unr.count() << "ms" << std::endl;
+        out << "============================" << std::endl;
+    }
+
     template <class F, class OS>
     void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic)
     {
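
As a standalone illustration of the API the mask_functor above exercises (not part of the patch; the lane count, and therefore the printed value, depends on the target architecture):

#include "xsimd/xsimd.hpp"

#include <cstdint>
#include <iostream>

int main()
{
    // Broadcast a batch of zeros, compare it against zero, and collapse
    // the resulting batch_bool into a per-lane bitmask via mask().
    xsimd::batch<uint8_t> x(static_cast<uint8_t>(0));
    const auto m = (x == xsimd::batch<uint8_t>(static_cast<uint8_t>(0))).mask();
    // Every lane compares equal, so every lane bit is set; with sixteen
    // uint8_t lanes on a 128-bit target this prints ffff.
    std::cout << std::hex << static_cast<uint64_t>(m) << std::endl;
}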

serge-sans-paille (Contributor, Author) commented on Dec 28, 2025:

Thanks for the feedback. Let's merge that one then! (once CI is happy)

        {
            // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
            // Extract most significant bit
            uint8x16_t msbs = vshrq_n_u8(self, 7);
            // Position it appropriately; the shift table repeats 0..7 because
            // the low and high halves are reduced separately below
            static constexpr int8_t shift_table[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 };
            int8x16_t shifts = vld1q_s8(shift_table);
            uint8x16_t positioned = vshlq_u8(msbs, shifts);
            // Horizontal reduction: the low half yields bits 0-7, the high half bits 8-15
            return vaddv_u8(vget_low_u8(positioned)) | (vaddv_u8(vget_high_u8(positioned)) << 8);
        }
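
For context on the vaddv discussion above: a reduction that avoids the A64-only horizontal add could instead chain widening pairwise adds. The helper below is an illustrative sketch of that alternative, not necessarily the variant used by xsimd's plain-NEON path.

#include <arm_neon.h>
#include <cstdint>

// Hypothetical helper, illustration only: reduce the "positioned" vector
// from the 8-bit specialization above without vaddv, using widening
// pairwise adds instead.
inline uint64_t reduce_positioned_without_vaddv(uint8x16_t positioned)
{
    uint16x8_t sum16 = vpaddlq_u8(positioned); // 16 x u8 -> 8 x u16
    uint32x4_t sum32 = vpaddlq_u16(sum16);     // 8 x u16 -> 4 x u32
    uint64x2_t sum64 = vpaddlq_u32(sum32);     // 4 x u32 -> 2 x u64
    // Lane 0 now holds the sum of the low eight bytes (mask bits 0-7),
    // lane 1 the sum of the high eight bytes (mask bits 8-15).
    return vgetq_lane_u64(sum64, 0) | (vgetq_lane_u64(sum64, 1) << 8);
}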

        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
        {
            // Extract most significant bit
            uint16x8_t msbs = vshrq_n_u16(self, 15);
            // Position it appropriately
            static constexpr int16_t shift_table[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
            int16x8_t shifts = vld1q_s16(shift_table);
            uint16x8_t positioned = vshlq_u16(msbs, shifts);
            // Horizontal reduction
            return vaddvq_u16(positioned);
        }

        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
        {
            // Extract most significant bit
            uint32x4_t msbs = vshrq_n_u32(self, 31);
            // Position it appropriately
            static constexpr int32_t shift_table[4] = { 0, 1, 2, 3 };
            int32x4_t shifts = vld1q_s32(shift_table);
            uint32x4_t positioned = vshlq_u32(msbs, shifts);
            // Horizontal reduction
            return vaddvq_u32(positioned);
        }
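
All three specializations implement the same contract: bit i of the returned mask is the most significant bit of lane i. A scalar reference of that contract, written here purely as an illustrative cross-check for tests or benchmarks (it is not part of xsimd, and it assumes the unsigned lane types used above):

#include <cstddef>
#include <cstdint>

// Illustrative scalar reference: bit i of the result is the most
// significant bit of lane i (unsigned lane types assumed).
template <class T, std::size_t N>
uint64_t scalar_mask_reference(const T (&lanes)[N])
{
    uint64_t result = 0;
    for (std::size_t i = 0; i < N; ++i)
    {
        const uint64_t msb = (lanes[i] >> (sizeof(T) * 8 - 1)) & 1u;
        result |= msb << i;
    }
    return result;
}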

        /*******
         * abs *
         *******/