// experimental/bits/simd_neon.h

// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
// NEON flavour of the common implementation class. It adds nothing of its own:
// everything is inherited from _CommonImplBuiltin, and _S_store is explicitly
// re-exported below.
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  // Make the inherited _S_store overloads visible under this class' name.
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        // Lane-by-lane load: a lane is read from __mem (and converted to _Tp)
        // only if its mask entry is non-zero; every other lane keeps the value
        // that was passed in through __merge.
        const auto __load_lane = [&](auto __lane) {
          if (__k[__lane] == 0)
            return;
          __merge._M_set(__lane, static_cast<_Tp>(__mem[__lane]));
        };
        __execute_n_times<_Np>(__load_lane);
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        // Lane-by-lane store without type conversion: memory is written only
        // for lanes whose mask entry is non-zero, all other locations are left
        // untouched.
        const auto __store_lane = [&](auto __lane) {
          if (__k[__lane] == 0)
            return;
          __mem[__lane] = __v[__lane];
        };
        __execute_n_times<_Np>(__store_lane);
      }

    // }}}
    // _S_reduce {{{
    // Reduces all lanes of __x to a single _Tp by repeatedly applying
    // __binary_op to whole simd objects.
    //
    // __x:         the simd object to reduce.
    // __binary_op: callable invoked as __binary_op(simd, simd); its result is
    //              assigned back to a simd of the same type.
    // Returns lane 0 of the final combined vector (or whatever the builtin
    // implementation returns for sizes not handled here).
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        // Full (non-partial) 16-byte vector with at least 4 lanes: combine the
        // two 8-byte halves with one __binary_op call and let the 8-byte
        // implementation finish the job.
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                      && !_Abi::template _S_is_partial<_Tp>)
          {
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            // Three permute+combine steps (log2(8)): neighbours, then pairs,
            // then the two groups of four. Afterwards lane 0 has been
            // combined with every other lane.
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                       __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            // Two steps: combine neighbours, then combine with the reversed
            // vector so lane 0 covers all four lanes.
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            // Single step: combine the vector with its lane-swapped copy.
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          // Any other lane count is left to the builtin implementation.
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
      {
        // The vsqrt* intrinsics are only used when __have_neon_a64 is set;
        // otherwise the builtin implementation computes the square root.
        if constexpr (!__have_neon_a64)
          return _Base::_S_sqrt(__x);
        else
          {
            const auto __v = __to_intrin(__x);
            // Pick the intrinsic matching element type and lane count.
            if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__v);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__v);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__v);
            else if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__v);
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_trunc {{{
    // Truncates every lane of __x to an integral value.
    //
    // _TW is the vector (wrapper) type, _TVT its traits; the element type is
    // taken from _TVT::value_type. Three strategies are selected at compile
    // time, see the branch comments below.
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            // Dedicated rounding intrinsics are available: dispatch on
            // element type and lane count.
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            // No rounding intrinsic: round-trip through 32-bit integers.
            // NOTE(review): the float -> s32 -> float round trip presumably
            // turns inputs in (-1, 0] into +0.0, i.e. the sign of zero is not
            // preserved -- confirm whether callers depend on it.
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            // Lanes with |x| >= 2^23 are returned unchanged (a float of that
            // magnitude has no fractional bits); only smaller magnitudes take
            // the converted value. Lanes for which the comparison is false
            // (including NaN) also keep the input.
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          // Everything else is handled by the builtin implementation.
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        // Without __have_neon_a32 there is no vrnda* intrinsic to use, so the
        // builtin implementation does the rounding.
        if constexpr (!__have_neon_a32)
          return _Base::_S_round(__x);
        else
          {
            // Dispatch purely on element width and total vector width.
            constexpr size_t __elem_bytes = sizeof(_Tp);
            constexpr size_t __vec_bytes = sizeof(__x);
            const auto __v = __to_intrin(__x);
            if constexpr (__elem_bytes == 8 && __vec_bytes == 16)
              return vrndaq_f64(__v);
            else if constexpr (__elem_bytes == 8 && __vec_bytes == 8)
              return vrnda_f64(__v);
            else if constexpr (__elem_bytes == 4 && __vec_bytes == 16)
              return vrndaq_f32(__v);
            else if constexpr (__elem_bytes == 4 && __vec_bytes == 8)
              return vrnda_f32(__v);
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
      {
        // The vrndm* intrinsics are only used when __have_neon_a32 is set;
        // otherwise the builtin implementation is used.
        if constexpr (!__have_neon_a32)
          return _Base::_S_floor(__x);
        else
          {
            const auto __v = __to_intrin(__x);
            // Pick the intrinsic matching element type and lane count.
            if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__v);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__v);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__v);
            else if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__v);
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
      {
        // The vrndp* intrinsics are only used when __have_neon_a32 is set;
        // otherwise the builtin implementation is used.
        if constexpr (!__have_neon_a32)
          return _Base::_S_ceil(__x);
        else
          {
            const auto __v = __to_intrin(__x);
            // Pick the intrinsic matching element type and lane count.
            if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__v);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__v);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__v);
            else if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__v);
            else
              __assert_unreachable<_Tp>();
          }
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
// Mixin providing a NEON-specific conversion from a vector mask to a bitmask.
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  // Converts the vector mask __x (_Np lanes of element type _Tp) into a
  // _SanitizedBitMask<_Np> in which bit i reflects lane i.
  //
  // Strategy for 8- and 16-byte vectors: AND every lane with a per-lane
  // constant holding a single distinct bit (1 << lane index, 0 for lanes
  // >= _Np), then add all lanes horizontally. Because each lane contributes
  // a different bit, the sum equals the bitwise OR of the selected bits.
  // Constant evaluation and all other vector sizes use the builtin mixin.
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      // Intrinsics are not usable in constant expressions.
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          // Zero operand for the pairwise adds: a full 16-byte vector on
          // AArch64 (q-register adds), an 8-byte vector otherwise (the two
          // halves are added with d-register intrinsics).
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              // 16 one-byte lanes: the bit pattern repeats for lanes 8..15 so
              // that each group of eight lanes sums into its own byte.
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                  [&](auto __i) {
                    return static_cast<_I>(
                      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                  });
              __asint &= __bitsel;
              // Three pairwise adds leave the two byte sums next to each
              // other; they are then read as one 16-bit value.
              // NOTE(review): reading the pair as _UShort presumably relies
              // on little-endian lane order -- TODO confirm.
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              // 8 two-byte lanes: lane i keeps bit i.
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              // 4 four-byte lanes: lane i keeps bit i.
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            // Two eight-byte lanes: extract bit 0 and bit 1 directly.
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          // Same scheme for 8-byte vectors; everything fits one d-register.
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              // Two four-byte lanes: the selector is written out directly
              // and does not depend on _Np.
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        // Other vector sizes: defer to the builtin mixin.
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        // Bytes outside the implicit mask are forced to all-ones so that they
        // cannot make the "everything set" comparison fail.
        const auto __padding
          = ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        const auto __filled = __vector_bitcast<char>(__k._M_data) | __padding;
        if constexpr (sizeof(__k) <= 8)
          {
            // Small masks: compare the whole object, viewed as one integer,
            // against all-ones.
            using _Ip = __int_for_sizeof_t<decltype(__filled)>;
            return __bit_cast<_Ip>(__filled) == -1;
          }
        else if constexpr (sizeof(__k) == 16)
          {
            // Two 64-bit halves: the sum is -2 when both halves are -1.
            const auto __halves = __vector_bitcast<long long>(__filled);
            return __halves[0] + __halves[1] == -2;
          }
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    // Returns true if at least one lane of the mask __k is set.
    //
    // The mask data is first passed through _Abi::_S_masked, the same
    // normalization _S_none_of applies, so that padding lanes of partial ABIs
    // cannot contribute. (OR-ing in the inverted implicit mask, as _S_all_of
    // does, would set every padding byte and make this predicate true for any
    // partial mask, even when no real lane is set.)
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            // Two 64-bit halves: any non-zero bit in either half means "any".
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          // Small masks: view the whole object as one integer.
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        // Normalize through _Abi::_S_masked before testing for "all zero".
        const auto __active = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) <= 8)
          {
            // Small masks: the whole object, viewed as one integer, must be 0.
            using _Ip = __int_for_sizeof_t<decltype(__active)>;
            return __bit_cast<_Ip>(__active) == 0;
          }
        else if constexpr (sizeof(__k) == 16)
          {
            // Two 64-bit halves: both must be zero.
            const auto __halves = __vector_bitcast<long long>(__active);
            return (__halves[0] | __halves[1]) == 0;
          }
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    // Returns true if at least one, but not every, lane of the mask __k is
    // set.
    //
    // For masks of at most 8 bytes this is computed as "not all and not none",
    // using the same two normalizations as _S_all_of and _S_none_of:
    //  - "all":  padding forced to ones, whole object compared with -1;
    //  - "none": padding cleared via _Abi::_S_masked, whole object compared
    //            with 0.
    // A single "value + 1 > 1" test on the padded value is not sufficient:
    // with padding forced to ones an all-false partial mask is non-zero and
    // would be reported as "some", and for objects narrower than int the
    // addition is performed in int, so an all-true mask would not wrap to 0.
    // Larger masks are handled by the builtin implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __filled = __vector_bitcast<char>(__k._M_data)
                                  | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            const auto __active = _Abi::_S_masked(__k._M_data);
            const bool __all_set
              = __bit_cast<__int_for_sizeof_t<decltype(__filled)>>(__filled)
                == -1;
            const bool __none_set
              = __bit_cast<__int_for_sizeof_t<decltype(__active)>>(__active)
                == 0;
            return !__all_set && !__none_set;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
    // Returns the number of set lanes in the mask __k.
    //
    // The lanes are summed as signed integers and the sum is negated, which
    // yields the count when every set lane holds -1 and every other lane 0.
    // For element sizes 1, 2 and 4 the upper 64 bits (via __hi64z) are first
    // added onto the lower 64 bits, then pairwise adds fold the 8-byte vector
    // into lane 0.
    // NOTE(review): __hi64z presumably yields zeros when the mask has no
    // upper half -- confirm against its definition.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            // Use long long, not long: long is only 32 bits wide on 32-bit
            // ARM, where indices 0 and 1 would then address the two halves of
            // the first lane instead of the two 64-bit lanes. This also
            // matches the 64-bit view used by the other mask reductions.
            const auto __s64 = __vector_bitcast<long long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
        else
          // No other element size is expected; fail at compile time rather
          // than flowing off the end of a non-void function.
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_find_first_set {{{
    // Forwards unchanged to the builtin implementation's _S_find_first_set;
    // no NEON-specific code path exists yet (see TODO).
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    // Forwards unchanged to the builtin implementation's _S_find_last_set;
    // no NEON-specific code path exists yet (see TODO).
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
Revision:	1166
Committed:	Tue Oct 26 14:22:36 2021 UTC (4 years ago) by rossy
Content type:	text/x-chdr
File size:	15842 byte(s)
Log Message:	Daodan: Replace MinGW build env with an up-to-date MSYS2 env
#	Content
1	// Simd NEON specific implementations -- C++ --
2
3	// Copyright (C) 2020-2021 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
26	#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
27
28	#if __cplusplus >= 201703L
29
30	#if !_GLIBCXX_SIMD_HAVE_NEON
31	#error "simd_neon.h may only be included when NEON on ARM is available"
32	#endif
33
34	_GLIBCXX_SIMD_BEGIN_NAMESPACE
35
36	// _CommonImplNeon {{{
37	struct _CommonImplNeon : _CommonImplBuiltin
38	{
39	// _S_store {{{
40	using _CommonImplBuiltin::_S_store;
41
42	// }}}
43	};
44
45	// }}}
46	// _SimdImplNeon {{{
47	template <typename _Abi>
48	struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
49	{
50	using _Base = _SimdImplBuiltin<_Abi>;
51
52	template <typename _Tp>
53	using _MaskMember = typename _Base::template _MaskMember<_Tp>;
54
55	template <typename _Tp>
56	static constexpr size_t _S_max_store_size = 16;
57
58	// _S_masked_load {{{
59	template <typename _Tp, size_t _Np, typename _Up>
60	static inline _SimdWrapper<_Tp, _Np>
61	_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
62	const _Up* __mem) noexcept
63	{
64	__execute_n_times<_Np>([&](auto __i) {
65	if (__k[__i] != 0)
66	__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
67	});
68	return __merge;
69	}
70
71	// }}}
72	// _S_masked_store_nocvt {{{
73	template <typename _Tp, size_t _Np>
74	_GLIBCXX_SIMD_INTRINSIC static void
75	_S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
76	_MaskMember<_Tp> __k)
77	{
78	__execute_n_times<_Np>([&](auto __i) {
79	if (__k[__i] != 0)
80	__mem[__i] = __v[__i];
81	});
82	}
83
84	// }}}
85	// _S_reduce {{{
86	template <typename _Tp, typename _BinaryOperation>
87	_GLIBCXX_SIMD_INTRINSIC static _Tp
88	_S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
89	{
90	constexpr size_t _Np = __x.size();
91	if constexpr (sizeof(__x) == 16 && _Np >= 4
92	&& !_Abi::template _S_is_partial<_Tp>)
93	{
94	const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
95	const auto __y = __binary_op(__halves[0], __halves[1]);
96	return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
97	__y, static_cast<_BinaryOperation&&>(__binary_op));
98	}
99	else if constexpr (_Np == 8)
100	{
101	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
102	__vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
103	__x._M_data)));
104	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
105	__vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
106	__x._M_data)));
107	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
108	__vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
109	__x._M_data)));
110	return __x[0];
111	}
112	else if constexpr (_Np == 4)
113	{
114	__x
115	= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
116	__vector_permute<1, 0, 3, 2>(__x._M_data)));
117	__x
118	= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
119	__vector_permute<3, 2, 1, 0>(__x._M_data)));
120	return __x[0];
121	}
122	else if constexpr (_Np == 2)
123	{
124	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
125	__vector_permute<1, 0>(__x._M_data)));
126	return __x[0];
127	}
128	else
129	return _Base::_S_reduce(__x,
130	static_cast<_BinaryOperation&&>(__binary_op));
131	}
132
133	// }}}
134	// math {{{
135	// _S_sqrt {{{
136	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
137	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
138	{
139	if constexpr (__have_neon_a64)
140	{
141	const auto __intrin = __to_intrin(__x);
142	if constexpr (_TVT::template _S_is<float, 2>)
143	return vsqrt_f32(__intrin);
144	else if constexpr (_TVT::template _S_is<float, 4>)
145	return vsqrtq_f32(__intrin);
146	else if constexpr (_TVT::template _S_is<double, 1>)
147	return vsqrt_f64(__intrin);
148	else if constexpr (_TVT::template _S_is<double, 2>)
149	return vsqrtq_f64(__intrin);
150	else
151	__assert_unreachable<_Tp>();
152	}
153	else
154	return _Base::_S_sqrt(__x);
155	}
156
157	// }}}
158	// _S_trunc {{{
159	template <typename _TW, typename _TVT = _VectorTraits<_TW>>
160	_GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
161	{
162	using _Tp = typename _TVT::value_type;
163	if constexpr (__have_neon_a32)
164	{
165	const auto __intrin = __to_intrin(__x);
166	if constexpr (_TVT::template _S_is<float, 2>)
167	return vrnd_f32(__intrin);
168	else if constexpr (_TVT::template _S_is<float, 4>)
169	return vrndq_f32(__intrin);
170	else if constexpr (_TVT::template _S_is<double, 1>)
171	return vrnd_f64(__intrin);
172	else if constexpr (_TVT::template _S_is<double, 2>)
173	return vrndq_f64(__intrin);
174	else
175	__assert_unreachable<_Tp>();
176	}
177	else if constexpr (is_same_v<_Tp, float>)
178	{
179	auto __intrin = __to_intrin(__x);
180	if constexpr (sizeof(__x) == 16)
181	__intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
182	else
183	__intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
184	return _Base::_S_abs(__x)._M_data < 0x1p23f
185	? __vector_bitcast<float>(__intrin)
186	: __x._M_data;
187	}
188	else
189	return _Base::_S_trunc(__x);
190	}
191
192	// }}}
193	// _S_round {{{
194	template <typename _Tp, size_t _Np>
195	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
196	_S_round(_SimdWrapper<_Tp, _Np> __x)
197	{
198	if constexpr (__have_neon_a32)
199	{
200	const auto __intrin = __to_intrin(__x);
201	if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
202	return vrnda_f32(__intrin);
203	else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
204	return vrndaq_f32(__intrin);
205	else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
206	return vrnda_f64(__intrin);
207	else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
208	return vrndaq_f64(__intrin);
209	else
210	__assert_unreachable<_Tp>();
211	}
212	else
213	return _Base::_S_round(__x);
214	}
215
216	// }}}
217	// _S_floor {{{
218	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
219	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
220	{
221	if constexpr (__have_neon_a32)
222	{
223	const auto __intrin = __to_intrin(__x);
224	if constexpr (_TVT::template _S_is<float, 2>)
225	return vrndm_f32(__intrin);
226	else if constexpr (_TVT::template _S_is<float, 4>)
227	return vrndmq_f32(__intrin);
228	else if constexpr (_TVT::template _S_is<double, 1>)
229	return vrndm_f64(__intrin);
230	else if constexpr (_TVT::template _S_is<double, 2>)
231	return vrndmq_f64(__intrin);
232	else
233	__assert_unreachable<_Tp>();
234	}
235	else
236	return _Base::_S_floor(__x);
237	}
238
239	// }}}
240	// _S_ceil {{{
241	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
242	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
243	{
244	if constexpr (__have_neon_a32)
245	{
246	const auto __intrin = __to_intrin(__x);
247	if constexpr (_TVT::template _S_is<float, 2>)
248	return vrndp_f32(__intrin);
249	else if constexpr (_TVT::template _S_is<float, 4>)
250	return vrndpq_f32(__intrin);
251	else if constexpr (_TVT::template _S_is<double, 1>)
252	return vrndp_f64(__intrin);
253	else if constexpr (_TVT::template _S_is<double, 2>)
254	return vrndpq_f64(__intrin);
255	else
256	__assert_unreachable<_Tp>();
257	}
258	else
259	return _Base::_S_ceil(__x);
260	}
261
262	//}}} }}}
263	}; // }}}
264	// _MaskImplNeonMixin {{{
265	struct _MaskImplNeonMixin
266	{
267	using _Base = _MaskImplBuiltinMixin;
268
269	template <typename _Tp, size_t _Np>
270	_GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
271	_S_to_bits(_SimdWrapper<_Tp, _Np> __x)
272	{
273	if (__builtin_is_constant_evaluated())
274	return _Base::_S_to_bits(__x);
275
276	using _I = __int_for_sizeof_t<_Tp>;
277	if constexpr (sizeof(__x) == 16)
278	{
279	auto __asint = __vector_bitcast<_I>(__x);
280	#ifdef __aarch64__
281	[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
282	#else
283	[[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
284	#endif
285	if constexpr (sizeof(_Tp) == 1)
286	{
287	constexpr auto __bitsel
288	= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
289	[&](auto __i) {
290	return static_cast<_I>(
291	__i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
292	});
293	__asint &= __bitsel;
294	#ifdef __aarch64__
295	return __vector_bitcast<_UShort>(
296	vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
297	__zero))[0];
298	#else
299	return __vector_bitcast<_UShort>(
300	vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
301	__zero),
302	__zero))[0];
303	#endif
304	}
305	else if constexpr (sizeof(_Tp) == 2)
306	{
307	constexpr auto __bitsel
308	= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
309	[&](auto __i) {
310	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
311	});
312	__asint &= __bitsel;
313	#ifdef __aarch64__
314	return vaddvq_s16(__asint);
315	#else
316	return vpadd_s16(
317	vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
318	__zero)[0];
319	#endif
320	}
321	else if constexpr (sizeof(_Tp) == 4)
322	{
323	constexpr auto __bitsel
324	= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
325	[&](auto __i) {
326	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
327	});
328	__asint &= __bitsel;
329	#ifdef __aarch64__
330	return vaddvq_s32(__asint);
331	#else
332	return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
333	__zero)[0];
334	#endif
335	}
336	else if constexpr (sizeof(_Tp) == 8)
337	return (__asint[0] & 1) \| (__asint[1] & 2);
338	else
339	__assert_unreachable<_Tp>();
340	}
341	else if constexpr (sizeof(__x) == 8)
342	{
343	auto __asint = __vector_bitcast<_I>(__x);
344	[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
345	if constexpr (sizeof(_Tp) == 1)
346	{
347	constexpr auto __bitsel
348	= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
349	[&](auto __i) {
350	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
351	});
352	__asint &= __bitsel;
353	#ifdef __aarch64__
354	return vaddv_s8(__asint);
355	#else
356	return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
357	__zero)[0];
358	#endif
359	}
360	else if constexpr (sizeof(_Tp) == 2)
361	{
362	constexpr auto __bitsel
363	= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
364	[&](auto __i) {
365	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
366	});
367	__asint &= __bitsel;
368	#ifdef __aarch64__
369	return vaddv_s16(__asint);
370	#else
371	return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
372	#endif
373	}
374	else if constexpr (sizeof(_Tp) == 4)
375	{
376	__asint &= __make_vector<_I>(0x1, 0x2);
377	#ifdef __aarch64__
378	return vaddv_s32(__asint);
379	#else
380	return vpadd_s32(__asint, __zero)[0];
381	#endif
382	}
383	else
384	__assert_unreachable<_Tp>();
385	}
386	else
387	return _Base::_S_to_bits(__x);
388	}
389	};
390
391	// }}}
392	// _MaskImplNeon {{{
393	template <typename _Abi>
394	struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
395	{
396	using _MaskImplBuiltinMixin::_S_to_maskvector;
397	using _MaskImplNeonMixin::_S_to_bits;
398	using _Base = _MaskImplBuiltin<_Abi>;
399	using _Base::_S_convert;
400
401	// _S_all_of {{{
402	template <typename _Tp>
403	_GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
404	{
405	const auto __kk
406	= __vector_bitcast<char>(__k._M_data)
407	\| ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
408	if constexpr (sizeof(__k) == 16)
409	{
410	const auto __x = __vector_bitcast<long long>(__kk);
411	return __x[0] + __x[1] == -2;
412	}
413	else if constexpr (sizeof(__k) <= 8)
414	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
415	else
416	__assert_unreachable<_Tp>();
417	}
418
419	// }}}
420	// _S_any_of {{{
421	template <typename _Tp>
422	_GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
423	{
424	const auto __kk
425	= __vector_bitcast<char>(__k._M_data)
426	\| ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
427	if constexpr (sizeof(__k) == 16)
428	{
429	const auto __x = __vector_bitcast<long long>(__kk);
430	return (__x[0] \| __x[1]) != 0;
431	}
432	else if constexpr (sizeof(__k) <= 8)
433	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
434	else
435	__assert_unreachable<_Tp>();
436	}
437
438	// }}}
439	// _S_none_of {{{
440	template <typename _Tp>
441	_GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
442	{
443	const auto __kk = _Abi::_S_masked(__k._M_data);
444	if constexpr (sizeof(__k) == 16)
445	{
446	const auto __x = __vector_bitcast<long long>(__kk);
447	return (__x[0] \| __x[1]) == 0;
448	}
449	else if constexpr (sizeof(__k) <= 8)
450	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
451	else
452	__assert_unreachable<_Tp>();
453	}
454
455	// }}}
456	// _S_some_of {{{
457	template <typename _Tp>
458	_GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
459	{
460	if constexpr (sizeof(__k) <= 8)
461	{
462	const auto __kk = __vector_bitcast<char>(__k._M_data)
463	\| ~__vector_bitcast<char>(
464	_Abi::template _S_implicit_mask<_Tp>());
465	using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
466	return __bit_cast<_Up>(__kk) + 1 > 1;
467	}
468	else
469	return _Base::_S_some_of(__k);
470	}
471
472	// }}}
473	// _S_popcount {{{
474	template <typename _Tp>
475	_GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
476	{
477	if constexpr (sizeof(_Tp) == 1)
478	{
479	const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
480	int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
481	return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
482	int8x8_t())[0];
483	}
484	else if constexpr (sizeof(_Tp) == 2)
485	{
486	const auto __s16 = __vector_bitcast<short>(__k._M_data);
487	int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
488	return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
489	}
490	else if constexpr (sizeof(_Tp) == 4)
491	{
492	const auto __s32 = __vector_bitcast<int>(__k._M_data);
493	int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
494	return -vpadd_s32(__tmp, int32x2_t())[0];
495	}
496	else if constexpr (sizeof(_Tp) == 8)
497	{
498	static_assert(sizeof(__k) == 16);
499	const auto __s64 = __vector_bitcast<long>(__k._M_data);
500	return -(__s64[0] + __s64[1]);
501	}
502	}
503
504	// }}}
505	// _S_find_first_set {{{
506	template <typename _Tp>
507	_GLIBCXX_SIMD_INTRINSIC static int
508	_S_find_first_set(simd_mask<_Tp, _Abi> __k)
509	{
510	// TODO: the _Base implementation is not optimal for NEON
511	return _Base::_S_find_first_set(__k);
512	}
513
514	// }}}
515	// _S_find_last_set {{{
516	template <typename _Tp>
517	_GLIBCXX_SIMD_INTRINSIC static int
518	_S_find_last_set(simd_mask<_Tp, _Abi> __k)
519	{
520	// TODO: the _Base implementation is not optimal for NEON
521	return _Base::_S_find_last_set(__k);
522	}
523
524	// }}}
525	}; // }}}
526
527	_GLIBCXX_SIMD_END_NAMESPACE
528	#endif // __cplusplus >= 201703L
529	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
530	// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80