Context Navigation

simd_neon.h

Last change on this file was 1166, checked in by rossy, 4 years ago
Daodan: Replace MinGW build env with an up-to-date MSYS2 env
File size: 15.5 KB

Rev	Line
[1166]	1	// Simd NEON specific implementations -- C++ --
	2
	3	// Copyright (C) 2020-2021 Free Software Foundation, Inc.
	4	//
	5	// This file is part of the GNU ISO C++ Library. This library is free
	6	// software; you can redistribute it and/or modify it under the
	7	// terms of the GNU General Public License as published by the
	8	// Free Software Foundation; either version 3, or (at your option)
	9	// any later version.
	10
	11	// This library is distributed in the hope that it will be useful,
	12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	// GNU General Public License for more details.
	15
	16	// Under Section 7 of GPL version 3, you are granted additional
	17	// permissions described in the GCC Runtime Library Exception, version
	18	// 3.1, as published by the Free Software Foundation.
	19
	20	// You should have received a copy of the GNU General Public License and
	21	// a copy of the GCC Runtime Library Exception along with this program;
	22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	23	// <http://www.gnu.org/licenses/>.
	24
	25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
	26	#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
	27
	28	#if __cplusplus >= 201703L
	29
	30	#if !_GLIBCXX_SIMD_HAVE_NEON
	31	#error "simd_neon.h may only be included when NEON on ARM is available"
	32	#endif
	33
	34	_GLIBCXX_SIMD_BEGIN_NAMESPACE
	35
	36	// _CommonImplNeon {{{
	37	struct _CommonImplNeon : _CommonImplBuiltin
	38	{
	39	// _S_store {{{
	40	using _CommonImplBuiltin::_S_store;
	41
	42	// }}}
	43	};
	44
	45	// }}}
	46	// _SimdImplNeon {{{
	47	template <typename _Abi>
	48	struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
	49	{
	50	using _Base = _SimdImplBuiltin<_Abi>;
	51
	52	template <typename _Tp>
	53	using _MaskMember = typename _Base::template _MaskMember<_Tp>;
	54
	55	template <typename _Tp>
	56	static constexpr size_t _S_max_store_size = 16;
	57
	58	// _S_masked_load {{{
	59	template <typename _Tp, size_t _Np, typename _Up>
	60	static inline _SimdWrapper<_Tp, _Np>
	61	_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
	62	const _Up* __mem) noexcept
	63	{
	64	__execute_n_times<_Np>([&](auto __i) {
	65	if (__k[__i] != 0)
	66	__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
	67	});
	68	return __merge;
	69	}
	70
	71	// }}}
	72	// _S_masked_store_nocvt {{{
	73	template <typename _Tp, size_t _Np>
	74	_GLIBCXX_SIMD_INTRINSIC static void
	75	_S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
	76	_MaskMember<_Tp> __k)
	77	{
	78	__execute_n_times<_Np>([&](auto __i) {
	79	if (__k[__i] != 0)
	80	__mem[__i] = __v[__i];
	81	});
	82	}
	83
	84	// }}}
	85	// _S_reduce {{{
	86	template <typename _Tp, typename _BinaryOperation>
	87	_GLIBCXX_SIMD_INTRINSIC static _Tp
	88	_S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
	89	{
	90	constexpr size_t _Np = __x.size();
	91	if constexpr (sizeof(__x) == 16 && _Np >= 4
	92	&& !_Abi::template _S_is_partial<_Tp>)
	93	{
	94	const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
	95	const auto __y = __binary_op(__halves[0], __halves[1]);
	96	return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
	97	__y, static_cast<_BinaryOperation&&>(__binary_op));
	98	}
	99	else if constexpr (_Np == 8)
	100	{
	101	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	102	__vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
	103	__x._M_data)));
	104	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	105	__vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
	106	__x._M_data)));
	107	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	108	__vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
	109	__x._M_data)));
	110	return __x[0];
	111	}
	112	else if constexpr (_Np == 4)
	113	{
	114	__x
	115	= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	116	__vector_permute<1, 0, 3, 2>(__x._M_data)));
	117	__x
	118	= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	119	__vector_permute<3, 2, 1, 0>(__x._M_data)));
	120	return __x[0];
	121	}
	122	else if constexpr (_Np == 2)
	123	{
	124	__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
	125	__vector_permute<1, 0>(__x._M_data)));
	126	return __x[0];
	127	}
	128	else
	129	return _Base::_S_reduce(__x,
	130	static_cast<_BinaryOperation&&>(__binary_op));
	131	}
	132
	133	// }}}
	134	// math {{{
	135	// _S_sqrt {{{
	136	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
	137	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
	138	{
	139	if constexpr (__have_neon_a64)
	140	{
	141	const auto __intrin = __to_intrin(__x);
	142	if constexpr (_TVT::template _S_is<float, 2>)
	143	return vsqrt_f32(__intrin);
	144	else if constexpr (_TVT::template _S_is<float, 4>)
	145	return vsqrtq_f32(__intrin);
	146	else if constexpr (_TVT::template _S_is<double, 1>)
	147	return vsqrt_f64(__intrin);
	148	else if constexpr (_TVT::template _S_is<double, 2>)
	149	return vsqrtq_f64(__intrin);
	150	else
	151	__assert_unreachable<_Tp>();
	152	}
	153	else
	154	return _Base::_S_sqrt(__x);
	155	}
	156
	157	// }}}
	158	// _S_trunc {{{
	159	template <typename _TW, typename _TVT = _VectorTraits<_TW>>
	160	_GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
	161	{
	162	using _Tp = typename _TVT::value_type;
	163	if constexpr (__have_neon_a32)
	164	{
	165	const auto __intrin = __to_intrin(__x);
	166	if constexpr (_TVT::template _S_is<float, 2>)
	167	return vrnd_f32(__intrin);
	168	else if constexpr (_TVT::template _S_is<float, 4>)
	169	return vrndq_f32(__intrin);
	170	else if constexpr (_TVT::template _S_is<double, 1>)
	171	return vrnd_f64(__intrin);
	172	else if constexpr (_TVT::template _S_is<double, 2>)
	173	return vrndq_f64(__intrin);
	174	else
	175	__assert_unreachable<_Tp>();
	176	}
	177	else if constexpr (is_same_v<_Tp, float>)
	178	{
	179	auto __intrin = __to_intrin(__x);
	180	if constexpr (sizeof(__x) == 16)
	181	__intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
	182	else
	183	__intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
	184	return _Base::_S_abs(__x)._M_data < 0x1p23f
	185	? __vector_bitcast<float>(__intrin)
	186	: __x._M_data;
	187	}
	188	else
	189	return _Base::_S_trunc(__x);
	190	}
	191
	192	// }}}
	193	// _S_round {{{
	194	template <typename _Tp, size_t _Np>
	195	_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
	196	_S_round(_SimdWrapper<_Tp, _Np> __x)
	197	{
	198	if constexpr (__have_neon_a32)
	199	{
	200	const auto __intrin = __to_intrin(__x);
	201	if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
	202	return vrnda_f32(__intrin);
	203	else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
	204	return vrndaq_f32(__intrin);
	205	else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
	206	return vrnda_f64(__intrin);
	207	else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
	208	return vrndaq_f64(__intrin);
	209	else
	210	__assert_unreachable<_Tp>();
	211	}
	212	else
	213	return _Base::_S_round(__x);
	214	}
	215
	216	// }}}
	217	// _S_floor {{{
	218	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
	219	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
	220	{
	221	if constexpr (__have_neon_a32)
	222	{
	223	const auto __intrin = __to_intrin(__x);
	224	if constexpr (_TVT::template _S_is<float, 2>)
	225	return vrndm_f32(__intrin);
	226	else if constexpr (_TVT::template _S_is<float, 4>)
	227	return vrndmq_f32(__intrin);
	228	else if constexpr (_TVT::template _S_is<double, 1>)
	229	return vrndm_f64(__intrin);
	230	else if constexpr (_TVT::template _S_is<double, 2>)
	231	return vrndmq_f64(__intrin);
	232	else
	233	__assert_unreachable<_Tp>();
	234	}
	235	else
	236	return _Base::_S_floor(__x);
	237	}
	238
	239	// }}}
	240	// _S_ceil {{{
	241	template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
	242	_GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
	243	{
	244	if constexpr (__have_neon_a32)
	245	{
	246	const auto __intrin = __to_intrin(__x);
	247	if constexpr (_TVT::template _S_is<float, 2>)
	248	return vrndp_f32(__intrin);
	249	else if constexpr (_TVT::template _S_is<float, 4>)
	250	return vrndpq_f32(__intrin);
	251	else if constexpr (_TVT::template _S_is<double, 1>)
	252	return vrndp_f64(__intrin);
	253	else if constexpr (_TVT::template _S_is<double, 2>)
	254	return vrndpq_f64(__intrin);
	255	else
	256	__assert_unreachable<_Tp>();
	257	}
	258	else
	259	return _Base::_S_ceil(__x);
	260	}
	261
	262	//}}} }}}
	263	}; // }}}
	264	// _MaskImplNeonMixin {{{
	265	struct _MaskImplNeonMixin
	266	{
	267	using _Base = _MaskImplBuiltinMixin;
	268
	269	template <typename _Tp, size_t _Np>
	270	_GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
	271	_S_to_bits(_SimdWrapper<_Tp, _Np> __x)
	272	{
	273	if (__builtin_is_constant_evaluated())
	274	return _Base::_S_to_bits(__x);
	275
	276	using _I = __int_for_sizeof_t<_Tp>;
	277	if constexpr (sizeof(__x) == 16)
	278	{
	279	auto __asint = __vector_bitcast<_I>(__x);
	280	#ifdef __aarch64__
	281	[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
	282	#else
	283	[[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
	284	#endif
	285	if constexpr (sizeof(_Tp) == 1)
	286	{
	287	constexpr auto __bitsel
	288	= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
	289	[&](auto __i) {
	290	return static_cast<_I>(
	291	__i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
	292	});
	293	__asint &= __bitsel;
	294	#ifdef __aarch64__
	295	return __vector_bitcast<_UShort>(
	296	vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
	297	__zero))[0];
	298	#else
	299	return __vector_bitcast<_UShort>(
	300	vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
	301	__zero),
	302	__zero))[0];
	303	#endif
	304	}
	305	else if constexpr (sizeof(_Tp) == 2)
	306	{
	307	constexpr auto __bitsel
	308	= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
	309	[&](auto __i) {
	310	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
	311	});
	312	__asint &= __bitsel;
	313	#ifdef __aarch64__
	314	return vaddvq_s16(__asint);
	315	#else
	316	return vpadd_s16(
	317	vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
	318	__zero)[0];
	319	#endif
	320	}
	321	else if constexpr (sizeof(_Tp) == 4)
	322	{
	323	constexpr auto __bitsel
	324	= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
	325	[&](auto __i) {
	326	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
	327	});
	328	__asint &= __bitsel;
	329	#ifdef __aarch64__
	330	return vaddvq_s32(__asint);
	331	#else
	332	return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
	333	__zero)[0];
	334	#endif
	335	}
	336	else if constexpr (sizeof(_Tp) == 8)
	337	return (__asint[0] & 1) \| (__asint[1] & 2);
	338	else
	339	__assert_unreachable<_Tp>();
	340	}
	341	else if constexpr (sizeof(__x) == 8)
	342	{
	343	auto __asint = __vector_bitcast<_I>(__x);
	344	[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
	345	if constexpr (sizeof(_Tp) == 1)
	346	{
	347	constexpr auto __bitsel
	348	= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
	349	[&](auto __i) {
	350	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
	351	});
	352	__asint &= __bitsel;
	353	#ifdef __aarch64__
	354	return vaddv_s8(__asint);
	355	#else
	356	return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
	357	__zero)[0];
	358	#endif
	359	}
	360	else if constexpr (sizeof(_Tp) == 2)
	361	{
	362	constexpr auto __bitsel
	363	= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
	364	[&](auto __i) {
	365	return static_cast<_I>(__i < _Np ? 1 << __i : 0);
	366	});
	367	__asint &= __bitsel;
	368	#ifdef __aarch64__
	369	return vaddv_s16(__asint);
	370	#else
	371	return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
	372	#endif
	373	}
	374	else if constexpr (sizeof(_Tp) == 4)
	375	{
	376	__asint &= __make_vector<_I>(0x1, 0x2);
	377	#ifdef __aarch64__
	378	return vaddv_s32(__asint);
	379	#else
	380	return vpadd_s32(__asint, __zero)[0];
	381	#endif
	382	}
	383	else
	384	__assert_unreachable<_Tp>();
	385	}
	386	else
	387	return _Base::_S_to_bits(__x);
	388	}
	389	};
	390
	391	// }}}
	392	// _MaskImplNeon {{{
	393	template <typename _Abi>
	394	struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
	395	{
	396	using _MaskImplBuiltinMixin::_S_to_maskvector;
	397	using _MaskImplNeonMixin::_S_to_bits;
	398	using _Base = _MaskImplBuiltin<_Abi>;
	399	using _Base::_S_convert;
	400
	401	// _S_all_of {{{
	402	template <typename _Tp>
	403	_GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
	404	{
	405	const auto __kk
	406	= __vector_bitcast<char>(__k._M_data)
	407	\| ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
	408	if constexpr (sizeof(__k) == 16)
	409	{
	410	const auto __x = __vector_bitcast<long long>(__kk);
	411	return __x[0] + __x[1] == -2;
	412	}
	413	else if constexpr (sizeof(__k) <= 8)
	414	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
	415	else
	416	__assert_unreachable<_Tp>();
	417	}
	418
	419	// }}}
	420	// _S_any_of {{{
	421	template <typename _Tp>
	422	_GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
	423	{
	424	const auto __kk
	425	= __vector_bitcast<char>(__k._M_data)
	426	\| ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
	427	if constexpr (sizeof(__k) == 16)
	428	{
	429	const auto __x = __vector_bitcast<long long>(__kk);
	430	return (__x[0] \| __x[1]) != 0;
	431	}
	432	else if constexpr (sizeof(__k) <= 8)
	433	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
	434	else
	435	__assert_unreachable<_Tp>();
	436	}
	437
	438	// }}}
	439	// _S_none_of {{{
	440	template <typename _Tp>
	441	_GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
	442	{
	443	const auto __kk = _Abi::_S_masked(__k._M_data);
	444	if constexpr (sizeof(__k) == 16)
	445	{
	446	const auto __x = __vector_bitcast<long long>(__kk);
	447	return (__x[0] \| __x[1]) == 0;
	448	}
	449	else if constexpr (sizeof(__k) <= 8)
	450	return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
	451	else
	452	__assert_unreachable<_Tp>();
	453	}
	454
	455	// }}}
	456	// _S_some_of {{{
	457	template <typename _Tp>
	458	_GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
	459	{
	460	if constexpr (sizeof(__k) <= 8)
	461	{
	462	const auto __kk = __vector_bitcast<char>(__k._M_data)
	463	\| ~__vector_bitcast<char>(
	464	_Abi::template _S_implicit_mask<_Tp>());
	465	using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
	466	return __bit_cast<_Up>(__kk) + 1 > 1;
	467	}
	468	else
	469	return _Base::_S_some_of(__k);
	470	}
	471
	472	// }}}
	473	// _S_popcount {{{
	474	template <typename _Tp>
	475	_GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
	476	{
	477	if constexpr (sizeof(_Tp) == 1)
	478	{
	479	const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
	480	int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
	481	return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
	482	int8x8_t())[0];
	483	}
	484	else if constexpr (sizeof(_Tp) == 2)
	485	{
	486	const auto __s16 = __vector_bitcast<short>(__k._M_data);
	487	int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
	488	return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
	489	}
	490	else if constexpr (sizeof(_Tp) == 4)
	491	{
	492	const auto __s32 = __vector_bitcast<int>(__k._M_data);
	493	int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
	494	return -vpadd_s32(__tmp, int32x2_t())[0];
	495	}
	496	else if constexpr (sizeof(_Tp) == 8)
	497	{
	498	static_assert(sizeof(__k) == 16);
	499	const auto __s64 = __vector_bitcast<long>(__k._M_data);
	500	return -(__s64[0] + __s64[1]);
	501	}
	502	}
	503
	504	// }}}
	505	// _S_find_first_set {{{
	506	template <typename _Tp>
	507	_GLIBCXX_SIMD_INTRINSIC static int
	508	_S_find_first_set(simd_mask<_Tp, _Abi> __k)
	509	{
	510	// TODO: the _Base implementation is not optimal for NEON
	511	return _Base::_S_find_first_set(__k);
	512	}
	513
	514	// }}}
	515	// _S_find_last_set {{{
	516	template <typename _Tp>
	517	_GLIBCXX_SIMD_INTRINSIC static int
	518	_S_find_last_set(simd_mask<_Tp, _Abi> __k)
	519	{
	520	// TODO: the _Base implementation is not optimal for NEON
	521	return _Base::_S_find_last_set(__k);
	522	}
	523
	524	// }}}
	525	}; // }}}
	526
	527	_GLIBCXX_SIMD_END_NAMESPACE
	528	#endif // __cplusplus >= 201703L
	529	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
	530	// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Daodan/MSYS2/mingw32/include/c++/11.2.0/experimental/bits/simd_neon.h

Download in other formats: