Context Navigation

simd_x86_conversions.h

Last change on this file was 1166, checked in by rossy, 3 years ago
Daodan: Replace MinGW build env with an up-to-date MSYS2 env
File size: 80.5 KB

Rev	Line
[1166]	1	// x86 specific conversion optimizations -- C++ --
	2
	3	// Copyright (C) 2020-2021 Free Software Foundation, Inc.
	4	//
	5	// This file is part of the GNU ISO C++ Library. This library is free
	6	// software; you can redistribute it and/or modify it under the
	7	// terms of the GNU General Public License as published by the
	8	// Free Software Foundation; either version 3, or (at your option)
	9	// any later version.
	10
	11	// This library is distributed in the hope that it will be useful,
	12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	// GNU General Public License for more details.
	15
	16	// Under Section 7 of GPL version 3, you are granted additional
	17	// permissions described in the GCC Runtime Library Exception, version
	18	// 3.1, as published by the Free Software Foundation.
	19
	20	// You should have received a copy of the GNU General Public License and
	21	// a copy of the GCC Runtime Library Exception along with this program;
	22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	23	// <http://www.gnu.org/licenses/>.
	24
	25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
	26	#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
	27
	28	#if __cplusplus >= 201703L
	29
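	30	// work around PR85827 (GCC bug; presumably the spurious unused-variable warnings with `if constexpr`, hence the [[maybe_unused]] attributes below -- confirm against the bug report)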
	31	// 1-arg __convert_x86 {{{1
	32	template <typename _To, typename _V, typename _Traits>
	33	_GLIBCXX_SIMD_INTRINSIC _To
	34	__convert_x86(_V __v)
	35	{
	36	static_assert(__is_vector_type_v<_V>);
	37	using _Tp = typename _Traits::value_type;
	38	constexpr size_t _Np = _Traits::_S_full_size;
	39	[[maybe_unused]] const auto __intrin = __to_intrin(__v);
	40	using _Up = typename _VectorTraits<_To>::value_type;
	41	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
	42
	43	// [xyz]_to_[xyz] {{{2
	44	[[maybe_unused]] constexpr bool __x_to_x
	45	= sizeof(__v) <= 16 && sizeof(_To) <= 16;
	46	[[maybe_unused]] constexpr bool __x_to_y
	47	= sizeof(__v) <= 16 && sizeof(_To) == 32;
	48	[[maybe_unused]] constexpr bool __x_to_z
	49	= sizeof(__v) <= 16 && sizeof(_To) == 64;
	50	[[maybe_unused]] constexpr bool __y_to_x
	51	= sizeof(__v) == 32 && sizeof(_To) <= 16;
	52	[[maybe_unused]] constexpr bool __y_to_y
	53	= sizeof(__v) == 32 && sizeof(_To) == 32;
	54	[[maybe_unused]] constexpr bool __y_to_z
	55	= sizeof(__v) == 32 && sizeof(_To) == 64;
	56	[[maybe_unused]] constexpr bool __z_to_x
	57	= sizeof(__v) == 64 && sizeof(_To) <= 16;
	58	[[maybe_unused]] constexpr bool __z_to_y
	59	= sizeof(__v) == 64 && sizeof(_To) == 32;
	60	[[maybe_unused]] constexpr bool __z_to_z
	61	= sizeof(__v) == 64 && sizeof(_To) == 64;
	62
	63	// iX_to_iX {{{2
	64	[[maybe_unused]] constexpr bool __i_to_i
	65	= is_integral_v<_Up> && is_integral_v<_Tp>;
	66	[[maybe_unused]] constexpr bool __i8_to_i16
	67	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
	68	[[maybe_unused]] constexpr bool __i8_to_i32
	69	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
	70	[[maybe_unused]] constexpr bool __i8_to_i64
	71	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
	72	[[maybe_unused]] constexpr bool __i16_to_i8
	73	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
	74	[[maybe_unused]] constexpr bool __i16_to_i32
	75	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
	76	[[maybe_unused]] constexpr bool __i16_to_i64
	77	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
	78	[[maybe_unused]] constexpr bool __i32_to_i8
	79	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
	80	[[maybe_unused]] constexpr bool __i32_to_i16
	81	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
	82	[[maybe_unused]] constexpr bool __i32_to_i64
	83	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
	84	[[maybe_unused]] constexpr bool __i64_to_i8
	85	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
	86	[[maybe_unused]] constexpr bool __i64_to_i16
	87	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
	88	[[maybe_unused]] constexpr bool __i64_to_i32
	89	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
	90
	91	// [fsu]X_to_[fsu]X {{{2
	92	// ibw = integral && byte or word, i.e. char and short with any signedness
	93	[[maybe_unused]] constexpr bool __s64_to_f32
	94	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
	95	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	96	[[maybe_unused]] constexpr bool __s32_to_f32
	97	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	98	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	99	[[maybe_unused]] constexpr bool __s16_to_f32
	100	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
	101	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	102	[[maybe_unused]] constexpr bool __s8_to_f32
	103	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
	104	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	105	[[maybe_unused]] constexpr bool __u64_to_f32
	106	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
	107	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	108	[[maybe_unused]] constexpr bool __u32_to_f32
	109	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	110	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	111	[[maybe_unused]] constexpr bool __u16_to_f32
	112	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
	113	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	114	[[maybe_unused]] constexpr bool __u8_to_f32
	115	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
	116	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	117	[[maybe_unused]] constexpr bool __s64_to_f64
	118	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
	119	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	120	[[maybe_unused]] constexpr bool __s32_to_f64
	121	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	122	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	123	[[maybe_unused]] constexpr bool __u64_to_f64
	124	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
	125	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	126	[[maybe_unused]] constexpr bool __u32_to_f64
	127	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	128	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	129	[[maybe_unused]] constexpr bool __f32_to_s64
	130	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	131	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	132	[[maybe_unused]] constexpr bool __f32_to_s32
	133	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	134	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	135	[[maybe_unused]] constexpr bool __f32_to_u64
	136	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	137	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	138	[[maybe_unused]] constexpr bool __f32_to_u32
	139	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	140	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	141	[[maybe_unused]] constexpr bool __f64_to_s64
	142	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	143	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	144	[[maybe_unused]] constexpr bool __f64_to_s32
	145	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	146	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	147	[[maybe_unused]] constexpr bool __f64_to_u64
	148	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	149	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	150	[[maybe_unused]] constexpr bool __f64_to_u32
	151	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	152	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	153	[[maybe_unused]] constexpr bool __ibw_to_f32
	154	= is_integral_v<_Tp> && sizeof(_Tp) <= 2
	155	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	156	[[maybe_unused]] constexpr bool __ibw_to_f64
	157	= is_integral_v<_Tp> && sizeof(_Tp) <= 2
	158	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	159	[[maybe_unused]] constexpr bool __f32_to_ibw
	160	= is_integral_v<_Up> && sizeof(_Up) <= 2
	161	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	162	[[maybe_unused]] constexpr bool __f64_to_ibw
	163	= is_integral_v<_Up> && sizeof(_Up) <= 2
	164	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	165	[[maybe_unused]] constexpr bool __f32_to_f64
	166	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
	167	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	168	[[maybe_unused]] constexpr bool __f64_to_f32
	169	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
	170	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	171
	172	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
	173	return __convert_x86<_To>(__lo128(__v), __hi128(__v));
	174	else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2
	175	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
	176	__convert_x86<__vector_type_t<_Up, _M / 2>>(
	177	__extract_part<1, _Np / _M * 2>(__v)));
	178	else if constexpr (__i_to_i) //{{{2
	179	{
	180	static_assert(__x_to_x \|\| __have_avx2,
	181	"integral conversions with ymm registers require AVX2");
	182	static_assert(__have_avx512bw
	183	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v) < 64)
	184	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
	185	"8/16-bit integers in zmm registers require AVX512BW");
	186	static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
	187	"integral conversions with ymm registers require AVX2");
	188	}
	189	if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2
	190	sizeof(_Tp) == sizeof(_Up))
	191	{
	192	// conversion uses simple bit reinterpretation (or no conversion at all)
	193	if constexpr (_Np >= _M)
	194	return __intrin_bitcast<_To>(__v);
	195	else
	196	return __zero_extend(__vector_bitcast<_Up>(__v));
	197	}
	198	else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2
	199	// zero extend (eg. xmm -> ymm)
	200	return __zero_extend(
	201	__convert_x86<__vector_type_t<
	202	_Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
	203	else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2
	204	// partial input (eg. ymm -> xmm)
	205	return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
	206	else if constexpr (__i64_to_i32) //{{{2
	207	{
	208	if constexpr (__x_to_x && __have_avx512vl)
	209	return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
	210	else if constexpr (__x_to_x)
	211	return __auto_bitcast(
	212	_mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
	213	else if constexpr (__y_to_x && __have_avx512vl)
	214	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
	215	else if constexpr (__y_to_x && __have_avx512f)
	216	return __intrin_bitcast<_To>(
	217	__lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
	218	else if constexpr (__y_to_x)
	219	return __intrin_bitcast<_To>(
	220	__lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
	221	0 + 4 * 2)));
	222	else if constexpr (__z_to_y)
	223	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
	224	}
	225	else if constexpr (__i64_to_i16) //{{{2
	226	{
	227	if constexpr (__x_to_x && __have_avx512vl)
	228	return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
	229	else if constexpr (__x_to_x && __have_avx512f)
	230	return __intrin_bitcast<_To>(
	231	__lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
	232	else if constexpr (__x_to_x && __have_ssse3)
	233	{
	234	return __intrin_bitcast<_To>(
	235	_mm_shuffle_epi8(__intrin,
	236	_mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
	237	-0x80, -0x80, -0x80, -0x80, -0x80,
	238	-0x80, -0x80, -0x80, -0x80)));
	239	// fallback without SSSE3
	240	}
	241	else if constexpr (__y_to_x && __have_avx512vl)
	242	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
	243	else if constexpr (__y_to_x && __have_avx512f)
	244	return __intrin_bitcast<_To>(
	245	__lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
	246	else if constexpr (__y_to_x)
	247	{
	248	const auto __a = _mm256_shuffle_epi8(
	249	__intrin,
	250	_mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
	251	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	252	-0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
	253	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	254	-0x80));
	255	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
	256	}
	257	else if constexpr (__z_to_x)
	258	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
	259	}
	260	else if constexpr (__i64_to_i8) //{{{2
	261	{
	262	if constexpr (__x_to_x && __have_avx512vl)
	263	return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
	264	else if constexpr (__x_to_x && __have_avx512f)
	265	return __intrin_bitcast<_To>(
	266	__lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
	267	else if constexpr (__y_to_x && __have_avx512vl)
	268	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
	269	else if constexpr (__y_to_x && __have_avx512f)
	270	return __intrin_bitcast<_To>(
	271	_mm512_cvtepi64_epi8(__zero_extend(__intrin)));
	272	else if constexpr (__z_to_x)
	273	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
	274	}
	275	else if constexpr (__i32_to_i64) //{{{2
	276	{
	277	if constexpr (__have_sse4_1 && __x_to_x)
	278	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	279	? _mm_cvtepi32_epi64(__intrin)
	280	: _mm_cvtepu32_epi64(__intrin));
	281	else if constexpr (__x_to_x)
	282	{
	283	return __intrin_bitcast<_To>(
	284	_mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
	285	? _mm_srai_epi32(__intrin, 31)
	286	: __m128i()));
	287	}
	288	else if constexpr (__x_to_y)
	289	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	290	? _mm256_cvtepi32_epi64(__intrin)
	291	: _mm256_cvtepu32_epi64(__intrin));
	292	else if constexpr (__y_to_z)
	293	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	294	? _mm512_cvtepi32_epi64(__intrin)
	295	: _mm512_cvtepu32_epi64(__intrin));
	296	}
	297	else if constexpr (__i32_to_i16) //{{{2
	298	{
	299	if constexpr (__x_to_x && __have_avx512vl)
	300	return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
	301	else if constexpr (__x_to_x && __have_avx512f)
	302	return __intrin_bitcast<_To>(
	303	__lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
	304	else if constexpr (__x_to_x && __have_ssse3)
	305	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	306	__intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
	307	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
	308	else if constexpr (__x_to_x)
	309	{
	310	auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o
	311	auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o
	312	auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo
	313	auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo
	314	return __intrin_bitcast<_To>(
	315	_mm_unpacklo_epi16(__c, __d)); // 0123 oooo
	316	}
	317	else if constexpr (__y_to_x && __have_avx512vl)
	318	return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
	319	else if constexpr (__y_to_x && __have_avx512f)
	320	return __intrin_bitcast<_To>(
	321	__lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
	322	else if constexpr (__y_to_x)
	323	{
	324	auto __a = _mm256_shuffle_epi8(
	325	__intrin,
	326	_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
	327	-0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
	328	9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
	329	-0x80, -0x80, -0x80));
	330	return __intrin_bitcast<_To>(__lo128(
	331	_mm256_permute4x64_epi64(__a,
	332	0xf8))); // __a[0] __a[2] \| __a[3] __a[3]
	333	}
	334	else if constexpr (__z_to_y)
	335	return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
	336	}
	337	else if constexpr (__i32_to_i8) //{{{2
	338	{
	339	if constexpr (__x_to_x && __have_avx512vl)
	340	return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
	341	else if constexpr (__x_to_x && __have_avx512f)
	342	return __intrin_bitcast<_To>(
	343	__lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
	344	else if constexpr (__x_to_x && __have_ssse3)
	345	{
	346	return __intrin_bitcast<_To>(
	347	_mm_shuffle_epi8(__intrin,
	348	_mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
	349	-0x80, -0x80, -0x80, -0x80, -0x80,
	350	-0x80, -0x80, -0x80, -0x80)));
	351	}
	352	else if constexpr (__x_to_x)
	353	{
	354	const auto __a
	355	= _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... ....
	356	const auto __b
	357	= _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... ....
	358	const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... ....
	359	const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... ....
	360	const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... ....
	361	return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
	362	}
	363	else if constexpr (__y_to_x && __have_avx512vl)
	364	return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
	365	else if constexpr (__y_to_x && __have_avx512f)
	366	return __intrin_bitcast<_To>(
	367	_mm512_cvtepi32_epi8(__zero_extend(__intrin)));
	368	else if constexpr (__z_to_x)
	369	return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
	370	}
	371	else if constexpr (__i16_to_i64) //{{{2
	372	{
	373	if constexpr (__x_to_x && __have_sse4_1)
	374	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	375	? _mm_cvtepi16_epi64(__intrin)
	376	: _mm_cvtepu16_epi64(__intrin));
	377	else if constexpr (__x_to_x && is_signed_v<_Tp>)
	378	{
	379	auto __x = _mm_srai_epi16(__intrin, 15);
	380	auto __y = _mm_unpacklo_epi16(__intrin, __x);
	381	__x = _mm_unpacklo_epi16(__x, __x);
	382	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
	383	}
	384	else if constexpr (__x_to_x)
	385	return __intrin_bitcast<_To>(
	386	_mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
	387	__m128i()));
	388	else if constexpr (__x_to_y)
	389	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	390	? _mm256_cvtepi16_epi64(__intrin)
	391	: _mm256_cvtepu16_epi64(__intrin));
	392	else if constexpr (__x_to_z)
	393	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	394	? _mm512_cvtepi16_epi64(__intrin)
	395	: _mm512_cvtepu16_epi64(__intrin));
	396	}
	397	else if constexpr (__i16_to_i32) //{{{2
	398	{
	399	if constexpr (__x_to_x && __have_sse4_1)
	400	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	401	? _mm_cvtepi16_epi32(__intrin)
	402	: _mm_cvtepu16_epi32(__intrin));
	403	else if constexpr (__x_to_x && is_signed_v<_Tp>)
	404	return __intrin_bitcast<_To>(
	405	_mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
	406	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
	407	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
	408	else if constexpr (__x_to_y)
	409	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	410	? _mm256_cvtepi16_epi32(__intrin)
	411	: _mm256_cvtepu16_epi32(__intrin));
	412	else if constexpr (__y_to_z)
	413	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	414	? _mm512_cvtepi16_epi32(__intrin)
	415	: _mm512_cvtepu16_epi32(__intrin));
	416	}
	417	else if constexpr (__i16_to_i8) //{{{2
	418	{
	419	if constexpr (__x_to_x && __have_avx512bw_vl)
	420	return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
	421	else if constexpr (__x_to_x && __have_avx512bw)
	422	return __intrin_bitcast<_To>(
	423	__lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
	424	else if constexpr (__x_to_x && __have_ssse3)
	425	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	426	__intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
	427	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
	428	else if constexpr (__x_to_x)
	429	{
	430	auto __a
	431	= _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33..
	432	auto __b
	433	= _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77..
	434	auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 ....
	435	auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 ....
	436	auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... ....
	437	auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... ....
	438	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
	439	}
	440	else if constexpr (__y_to_x && __have_avx512bw_vl)
	441	return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
	442	else if constexpr (__y_to_x && __have_avx512bw)
	443	return __intrin_bitcast<_To>(
	444	__lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
	445	else if constexpr (__y_to_x)
	446	{
	447	auto __a = _mm256_shuffle_epi8(
	448	__intrin,
	449	_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
	450	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	451	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
	452	4, 6, 8, 10, 12, 14));
	453	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
	454	}
	455	else if constexpr (__z_to_y && __have_avx512bw)
	456	return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
	457	else if constexpr (__z_to_y)
	458	__assert_unreachable<_Tp>();
	459	}
	460	else if constexpr (__i8_to_i64) //{{{2
	461	{
	462	if constexpr (__x_to_x && __have_sse4_1)
	463	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	464	? _mm_cvtepi8_epi64(__intrin)
	465	: _mm_cvtepu8_epi64(__intrin));
	466	else if constexpr (__x_to_x && is_signed_v<_Tp>)
	467	{
	468	if constexpr (__have_ssse3)
	469	{
	470	auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
	471	auto __epi16 = _mm_srai_epi16(__dup, 8);
	472	_mm_shuffle_epi8(__epi16,
	473	_mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
	474	3, 3, 3, 3, 3));
	475	}
	476	else
	477	{
	478	auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
	479	__x = _mm_unpacklo_epi16(__x, __x);
	480	return __intrin_bitcast<_To>(
	481	_mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
	482	_mm_srai_epi32(__x, 31)));
	483	}
	484	}
	485	else if constexpr (__x_to_x)
	486	{
	487	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
	488	_mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
	489	__m128i()),
	490	__m128i()));
	491	}
	492	else if constexpr (__x_to_y)
	493	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	494	? _mm256_cvtepi8_epi64(__intrin)
	495	: _mm256_cvtepu8_epi64(__intrin));
	496	else if constexpr (__x_to_z)
	497	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	498	? _mm512_cvtepi8_epi64(__intrin)
	499	: _mm512_cvtepu8_epi64(__intrin));
	500	}
	501	else if constexpr (__i8_to_i32) //{{{2
	502	{
	503	if constexpr (__x_to_x && __have_sse4_1)
	504	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	505	? _mm_cvtepi8_epi32(__intrin)
	506	: _mm_cvtepu8_epi32(__intrin));
	507	else if constexpr (__x_to_x && is_signed_v<_Tp>)
	508	{
	509	const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
	510	return __intrin_bitcast<_To>(
	511	_mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
	512	}
	513	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
	514	return __intrin_bitcast<_To>(
	515	_mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
	516	__m128i()));
	517	else if constexpr (__x_to_y)
	518	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	519	? _mm256_cvtepi8_epi32(__intrin)
	520	: _mm256_cvtepu8_epi32(__intrin));
	521	else if constexpr (__x_to_z)
	522	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	523	? _mm512_cvtepi8_epi32(__intrin)
	524	: _mm512_cvtepu8_epi32(__intrin));
	525	}
	526	else if constexpr (__i8_to_i16) //{{{2
	527	{
	528	if constexpr (__x_to_x && __have_sse4_1)
	529	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	530	? _mm_cvtepi8_epi16(__intrin)
	531	: _mm_cvtepu8_epi16(__intrin));
	532	else if constexpr (__x_to_x && is_signed_v<_Tp>)
	533	return __intrin_bitcast<_To>(
	534	_mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
	535	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
	536	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
	537	else if constexpr (__x_to_y)
	538	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	539	? _mm256_cvtepi8_epi16(__intrin)
	540	: _mm256_cvtepu8_epi16(__intrin));
	541	else if constexpr (__y_to_z && __have_avx512bw)
	542	return __intrin_bitcast<_To>(is_signed_v<_Tp>
	543	? _mm512_cvtepi8_epi16(__intrin)
	544	: _mm512_cvtepu8_epi16(__intrin));
	545	else if constexpr (__y_to_z)
	546	__assert_unreachable<_Tp>();
	547	}
	548	else if constexpr (__f32_to_s64) //{{{2
	549	{
	550	if constexpr (__have_avx512dq_vl && __x_to_x)
	551	return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
	552	else if constexpr (__have_avx512dq_vl && __x_to_y)
	553	return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
	554	else if constexpr (__have_avx512dq && __y_to_z)
	555	return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
	556	// else use scalar fallback
	557	}
	558	else if constexpr (__f32_to_u64) //{{{2
	559	{
	560	if constexpr (__have_avx512dq_vl && __x_to_x)
	561	return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
	562	else if constexpr (__have_avx512dq_vl && __x_to_y)
	563	return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
	564	else if constexpr (__have_avx512dq && __y_to_z)
	565	return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
	566	// else use scalar fallback
	567	}
	568	else if constexpr (__f32_to_s32) //{{{2
	569	{
	570	if constexpr (__x_to_x \|\| __y_to_y \|\| __z_to_z)
	571	{
	572	// go to fallback, it does the right thing
	573	}
	574	else
	575	__assert_unreachable<_Tp>();
	576	}
	577	else if constexpr (__f32_to_u32) //{{{2
	578	{
	579	if constexpr (__have_avx512vl && __x_to_x)
	580	return __auto_bitcast(_mm_cvttps_epu32(__intrin));
	581	else if constexpr (__have_avx512f && __x_to_x)
	582	return __auto_bitcast(
	583	__lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
	584	else if constexpr (__have_avx512vl && __y_to_y)
	585	return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
	586	else if constexpr (__have_avx512f && __y_to_y)
	587	return __vector_bitcast<_Up>(
	588	__lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
	589	else if constexpr (__x_to_x \|\| __y_to_y \|\| __z_to_z)
	590	{
	591	// go to fallback, it does the right thing. We can't use the
	592	// _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would
	593	// discard small input values (only 24 mantissa bits)
	594	}
	595	else
	596	__assert_unreachable<_Tp>();
	597	}
	598	else if constexpr (__f32_to_ibw) //{{{2
	599	return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
	600	else if constexpr (__f64_to_s64) //{{{2
	601	{
	602	if constexpr (__have_avx512dq_vl && __x_to_x)
	603	return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
	604	else if constexpr (__have_avx512dq_vl && __y_to_y)
	605	return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
	606	else if constexpr (__have_avx512dq && __z_to_z)
	607	return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
	608	// else use scalar fallback
	609	}
	610	else if constexpr (__f64_to_u64) //{{{2
	611	{
	612	if constexpr (__have_avx512dq_vl && __x_to_x)
	613	return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
	614	else if constexpr (__have_avx512dq_vl && __y_to_y)
	615	return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
	616	else if constexpr (__have_avx512dq && __z_to_z)
	617	return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
	618	// else use scalar fallback
	619	}
	620	else if constexpr (__f64_to_s32) //{{{2
	621	{
	622	if constexpr (__x_to_x)
	623	return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
	624	else if constexpr (__y_to_x)
	625	return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
	626	else if constexpr (__z_to_y)
	627	return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
	628	}
	629	else if constexpr (__f64_to_u32) //{{{2
	630	{
	631	if constexpr (__have_avx512vl && __x_to_x)
	632	return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
	633	else if constexpr (__have_sse4_1 && __x_to_x)
	634	return __vector_bitcast<_Up, _M>(
	635	_mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
	636	^ 0x8000'0000u;
	637	else if constexpr (__x_to_x)
	638	{
	639	// use scalar fallback: it's only 2 values to convert, can't get
	640	// much better than scalar decomposition
	641	}
	642	else if constexpr (__have_avx512vl && __y_to_x)
	643	return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
	644	else if constexpr (__y_to_x)
	645	{
	646	return __intrin_bitcast<_To>(
	647	__vector_bitcast<_Up>(
	648	_mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
	649	^ 0x8000'0000u);
	650	}
	651	else if constexpr (__z_to_y)
	652	return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
	653	}
	654	else if constexpr (__f64_to_ibw) //{{{2
	655	{
	656	return __convert_x86<_To>(
	657	__convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v));
	658	}
	659	else if constexpr (__s64_to_f32) //{{{2
	660	{
	661	if constexpr (__x_to_x && __have_avx512dq_vl)
	662	return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
	663	else if constexpr (__y_to_x && __have_avx512dq_vl)
	664	return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
	665	else if constexpr (__z_to_y && __have_avx512dq)
	666	return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
	667	else if constexpr (__z_to_y)
	668	return __intrin_bitcast<_To>(
	669	_mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
	670	}
	671	else if constexpr (__u64_to_f32) //{{{2
	672	{
	673	if constexpr (__x_to_x && __have_avx512dq_vl)
	674	return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
	675	else if constexpr (__y_to_x && __have_avx512dq_vl)
	676	return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
	677	else if constexpr (__z_to_y && __have_avx512dq)
	678	return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
	679	else if constexpr (__z_to_y)
	680	{
	681	return __intrin_bitcast<_To>(
	682	__lo256(_mm512_cvtepu32_ps(__auto_bitcast(
	683	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
	684	* 0x100000000LL
	685	+ __lo256(_mm512_cvtepu32_ps(
	686	__auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
	687	}
	688	}
	689	else if constexpr (__s32_to_f32) //{{{2
	690	{
	691	// use fallback (builtin conversion)
	692	}
	693	else if constexpr (__u32_to_f32) //{{{2
	694	{
	695	if constexpr (__x_to_x && __have_avx512vl)
	696	{
	697	// use fallback
	698	}
	699	else if constexpr (__x_to_x && __have_avx512f)
	700	return __intrin_bitcast<_To>(
	701	__lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
	702	else if constexpr (__x_to_x && (__have_fma \|\| __have_fma4))
	703	// work around PR85819
	704	return __auto_bitcast(0x10000
	705	* _mm_cvtepi32_ps(__to_intrin(__v >> 16))
	706	+ _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
	707	else if constexpr (__y_to_y && __have_avx512vl)
	708	{
	709	// use fallback
	710	}
	711	else if constexpr (__y_to_y && __have_avx512f)
	712	return __intrin_bitcast<_To>(
	713	__lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
	714	else if constexpr (__y_to_y)
	715	// work around PR85819
	716	return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
	717	+ _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
	718	// else use fallback (builtin conversion)
	719	}
	720	else if constexpr (__ibw_to_f32) //{{{2
	721	{
	722	if constexpr (_M <= 4 \|\| __have_avx2)
	723	return __convert_x86<_To>(
	724	__convert_x86<__vector_type_t<int, _M>>(__v));
	725	else
	726	{
	727	static_assert(__x_to_y);
	728	__m128i __a, __b;
	729	if constexpr (__have_sse4_1)
	730	{
	731	__a = sizeof(_Tp) == 2
	732	? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
	733	: _mm_cvtepu16_epi32(__intrin))
	734	: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
	735	: _mm_cvtepu8_epi32(__intrin));
	736	const auto __w
	737	= _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9);
	738	__b = sizeof(_Tp) == 2
	739	? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
	740	: _mm_cvtepu16_epi32(__w))
	741	: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
	742	: _mm_cvtepu8_epi32(__w));
	743	}
	744	else
	745	{
	746	__m128i __tmp;
	747	if constexpr (sizeof(_Tp) == 1)
	748	{
	749	__tmp = is_signed_v<_Tp>
	750	? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
	751	__intrin),
	752	8)
	753	: _mm_unpacklo_epi8(__intrin, __m128i());
	754	}
	755	else
	756	{
	757	static_assert(sizeof(_Tp) == 2);
	758	__tmp = __intrin;
	759	}
	760	__a = is_signed_v<_Tp>
	761	? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
	762	: _mm_unpacklo_epi16(__tmp, __m128i());
	763	__b = is_signed_v<_Tp>
	764	? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
	765	: _mm_unpackhi_epi16(__tmp, __m128i());
	766	}
	767	return __convert_x86<_To>(__vector_bitcast<int>(__a),
	768	__vector_bitcast<int>(__b));
	769	}
	770	}
	771	else if constexpr (__s64_to_f64) //{{{2
	772	{
	773	if constexpr (__x_to_x && __have_avx512dq_vl)
	774	return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
	775	else if constexpr (__y_to_y && __have_avx512dq_vl)
	776	return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
	777	else if constexpr (__z_to_z && __have_avx512dq)
	778	return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
	779	else if constexpr (__z_to_z)
	780	{
	781	return __intrin_bitcast<_To>(
	782	_mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
	783	* 0x100000000LL
	784	+ _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
	785	}
	786	}
	787	else if constexpr (__u64_to_f64) //{{{2
	788	{
	789	if constexpr (__x_to_x && __have_avx512dq_vl)
	790	return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
	791	else if constexpr (__y_to_y && __have_avx512dq_vl)
	792	return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
	793	else if constexpr (__z_to_z && __have_avx512dq)
	794	return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
	795	else if constexpr (__z_to_z)
	796	{
	797	return __intrin_bitcast<_To>(
	798	_mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
	799	* 0x100000000LL
	800	+ _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
	801	}
	802	}
	803	else if constexpr (__s32_to_f64) //{{{2
	804	{
	805	if constexpr (__x_to_x)
	806	return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
	807	else if constexpr (__x_to_y)
	808	return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
	809	else if constexpr (__y_to_z)
	810	return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
	811	}
	812	else if constexpr (__u32_to_f64) //{{{2
	813	{
	814	if constexpr (__x_to_x && __have_avx512vl)
	815	return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
	816	else if constexpr (__x_to_x && __have_avx512f)
	817	return __intrin_bitcast<_To>(
	818	__lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
	819	else if constexpr (__x_to_x)
	820	return __intrin_bitcast<_To>(
	821	_mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
	822	else if constexpr (__x_to_y && __have_avx512vl)
	823	return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
	824	else if constexpr (__x_to_y && __have_avx512f)
	825	return __intrin_bitcast<_To>(
	826	__lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
	827	else if constexpr (__x_to_y)
	828	return __intrin_bitcast<_To>(
	829	_mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
	830	else if constexpr (__y_to_z)
	831	return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
	832	}
	833	else if constexpr (__ibw_to_f64) //{{{2
	834	{
	835	return __convert_x86<_To>(
	836	__convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
	837	}
	838	else if constexpr (__f32_to_f64) //{{{2
	839	{
	840	if constexpr (__x_to_x)
	841	return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
	842	else if constexpr (__x_to_y)
	843	return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
	844	else if constexpr (__y_to_z)
	845	return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
	846	}
	847	else if constexpr (__f64_to_f32) //{{{2
	848	{
	849	if constexpr (__x_to_x)
	850	return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
	851	else if constexpr (__y_to_x)
	852	return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
	853	else if constexpr (__z_to_y)
	854	return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
	855	}
	856	else //{{{2
	857	__assert_unreachable<_Tp>();
	858
	859	// fallback:{{{2
	860	return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
	861	//}}}
	862	}
	863
	864	// }}}
	865	// 2-arg __convert_x86 {{{1
	866	template <typename _To, typename _V, typename _Traits>
	867	_GLIBCXX_SIMD_INTRINSIC _To
	868	__convert_x86(_V __v0, _V __v1)
	869	{
	870	static_assert(__is_vector_type_v<_V>);
	871	using _Tp = typename _Traits::value_type;
	872	constexpr size_t _Np = _Traits::_S_full_size;
	873	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
	874	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
	875	using _Up = typename _VectorTraits<_To>::value_type;
	876	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
	877
	878	static_assert(2 * _Np <= _M,
	879	"__v1 would be discarded; use the one-argument "
	880	"__convert_x86 overload instead");
	881
	882	// [xyz]_to_[xyz] {{{2
	883	[[maybe_unused]] constexpr bool __x_to_x
	884	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
	885	[[maybe_unused]] constexpr bool __x_to_y
	886	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
	887	[[maybe_unused]] constexpr bool __x_to_z
	888	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
	889	[[maybe_unused]] constexpr bool __y_to_x
	890	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
	891	[[maybe_unused]] constexpr bool __y_to_y
	892	= sizeof(__v0) == 32 && sizeof(_To) == 32;
	893	[[maybe_unused]] constexpr bool __y_to_z
	894	= sizeof(__v0) == 32 && sizeof(_To) == 64;
	895	[[maybe_unused]] constexpr bool __z_to_x
	896	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
	897	[[maybe_unused]] constexpr bool __z_to_y
	898	= sizeof(__v0) == 64 && sizeof(_To) == 32;
	899	[[maybe_unused]] constexpr bool __z_to_z
	900	= sizeof(__v0) == 64 && sizeof(_To) == 64;
	901
	902	// iX_to_iX {{{2
	903	[[maybe_unused]] constexpr bool __i_to_i
	904	= is_integral_v<_Up> && is_integral_v<_Tp>;
	905	[[maybe_unused]] constexpr bool __i8_to_i16
	906	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
	907	[[maybe_unused]] constexpr bool __i8_to_i32
	908	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
	909	[[maybe_unused]] constexpr bool __i8_to_i64
	910	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
	911	[[maybe_unused]] constexpr bool __i16_to_i8
	912	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
	913	[[maybe_unused]] constexpr bool __i16_to_i32
	914	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
	915	[[maybe_unused]] constexpr bool __i16_to_i64
	916	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
	917	[[maybe_unused]] constexpr bool __i32_to_i8
	918	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
	919	[[maybe_unused]] constexpr bool __i32_to_i16
	920	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
	921	[[maybe_unused]] constexpr bool __i32_to_i64
	922	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
	923	[[maybe_unused]] constexpr bool __i64_to_i8
	924	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
	925	[[maybe_unused]] constexpr bool __i64_to_i16
	926	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
	927	[[maybe_unused]] constexpr bool __i64_to_i32
	928	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
	929
	930	// [fsu]X_to_[fsu]X {{{2
	931	// ibw = integral && byte or word, i.e. char and short with any signedness
	932	[[maybe_unused]] constexpr bool __i64_to_f32
	933	= is_integral_v<_Tp> && sizeof(_Tp) == 8
	934	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	935	[[maybe_unused]] constexpr bool __s32_to_f32
	936	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	937	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	938	[[maybe_unused]] constexpr bool __s16_to_f32
	939	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
	940	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	941	[[maybe_unused]] constexpr bool __s8_to_f32
	942	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
	943	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	944	[[maybe_unused]] constexpr bool __u32_to_f32
	945	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	946	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	947	[[maybe_unused]] constexpr bool __u16_to_f32
	948	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
	949	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	950	[[maybe_unused]] constexpr bool __u8_to_f32
	951	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
	952	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	953	[[maybe_unused]] constexpr bool __s64_to_f64
	954	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
	955	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	956	[[maybe_unused]] constexpr bool __s32_to_f64
	957	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	958	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	959	[[maybe_unused]] constexpr bool __s16_to_f64
	960	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
	961	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	962	[[maybe_unused]] constexpr bool __s8_to_f64
	963	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
	964	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	965	[[maybe_unused]] constexpr bool __u64_to_f64
	966	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
	967	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	968	[[maybe_unused]] constexpr bool __u32_to_f64
	969	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	970	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	971	[[maybe_unused]] constexpr bool __u16_to_f64
	972	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
	973	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	974	[[maybe_unused]] constexpr bool __u8_to_f64
	975	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
	976	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	977	[[maybe_unused]] constexpr bool __f32_to_s64
	978	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	979	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	980	[[maybe_unused]] constexpr bool __f32_to_s32
	981	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	982	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	983	[[maybe_unused]] constexpr bool __f32_to_u64
	984	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	985	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	986	[[maybe_unused]] constexpr bool __f32_to_u32
	987	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	988	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	989	[[maybe_unused]] constexpr bool __f64_to_s64
	990	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	991	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	992	[[maybe_unused]] constexpr bool __f64_to_s32
	993	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	994	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	995	[[maybe_unused]] constexpr bool __f64_to_u64
	996	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	997	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	998	[[maybe_unused]] constexpr bool __f64_to_u32
	999	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	1000	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1001	[[maybe_unused]] constexpr bool __f32_to_ibw
	1002	= is_integral_v<_Up> && sizeof(_Up) <= 2
	1003	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1004	[[maybe_unused]] constexpr bool __f64_to_ibw
	1005	= is_integral_v<_Up> && sizeof(_Up) <= 2
	1006	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1007	[[maybe_unused]] constexpr bool __f32_to_f64
	1008	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
	1009	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1010	[[maybe_unused]] constexpr bool __f64_to_f32
	1011	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
	1012	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1013
	1014	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
	1015	// <double, 4>, <double, 4> => <short, 8>
	1016	return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
	1017	__hi128(__v1));
	1018	else if constexpr (__i_to_i) // assert ISA {{{2
	1019	{
	1020	static_assert(__x_to_x \|\| __have_avx2,
	1021	"integral conversions with ymm registers require AVX2");
	1022	static_assert(__have_avx512bw
	1023	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
	1024	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
	1025	"8/16-bit integers in zmm registers require AVX512BW");
	1026	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
	1027	"integral conversions with ymm registers require AVX2");
	1028	}
	1029	// concat => use 1-arg __convert_x86 {{{2
	1030	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
	1031	\|\| (sizeof(__v0) == 16 && __have_avx
	1032	&& is_floating_point_v<_Tp>)
	1033	\|\| (sizeof(__v0) == 32 && __have_avx512f
	1034	&& (sizeof(_Tp) >= 4 \|\| __have_avx512bw)))
	1035	{
	1036	// The ISA can handle wider input registers, so concat and use one-arg
	1037	// implementation. This reduces code duplication considerably.
	1038	return __convert_x86<_To>(__concat(__v0, __v1));
	1039	}
	1040	else //{{{2
	1041	{
	1042	// conversion using bit reinterpretation (or no conversion at all)
	1043	// should all go through the concat branch above:
	1044	static_assert(
	1045	!(is_floating_point_v<
	1046	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
	1047	// handle all zero extension{{{2
	1048	if constexpr (2 * _Np < _M && sizeof(_To) > 16)
	1049	{
	1050	constexpr size_t Min = 16 / sizeof(_Up);
	1051	return __zero_extend(
	1052	__convert_x86<
	1053	__vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
	1054	__v1));
	1055	}
	1056	else if constexpr (__i64_to_i32) //{{{2
	1057	{
	1058	if constexpr (__x_to_x)
	1059	return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
	1060	__auto_bitcast(__v1), 0x88));
	1061	else if constexpr (__y_to_y)
	1062	{
	1063	// AVX512F is not available (would concat otherwise)
	1064	return __auto_bitcast(
	1065	__xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
	1066	__auto_bitcast(__v1), 0x88)));
	1067	// alternative:
	1068	// const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8);
	1069	// const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8);
	1070	// const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx,
	1071	// v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh,
	1072	// 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh
	1073	}
	1074	else if constexpr (__z_to_z)
	1075	return __intrin_bitcast<_To>(
	1076	__concat(_mm512_cvtepi64_epi32(__i0),
	1077	_mm512_cvtepi64_epi32(__i1)));
	1078	}
	1079	else if constexpr (__i64_to_i16) //{{{2
	1080	{
	1081	if constexpr (__x_to_x)
	1082	{
	1083	// AVX2 is not available (would concat otherwise)
	1084	if constexpr (__have_sse4_1)
	1085	{
	1086	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	1087	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
	1088	_mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
	1089	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
	1090	}
	1091	else
	1092	{
	1093	return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
	1094	_Up(__v1[0]), _Up(__v1[1])};
	1095	}
	1096	}
	1097	else if constexpr (__y_to_x)
	1098	{
	1099	auto __a
	1100	= _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. ....
	1101	auto __b
	1102	= _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. ....
	1103	auto __c
	1104	= _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
	1105	return __intrin_bitcast<_To>(
	1106	_mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567
	1107	}
	1108	else if constexpr (__z_to_y)
	1109	return __intrin_bitcast<_To>(
	1110	__concat(_mm512_cvtepi64_epi16(__i0),
	1111	_mm512_cvtepi64_epi16(__i1)));
	1112	}
	1113	else if constexpr (__i64_to_i8) //{{{2
	1114	{
	1115	if constexpr (__x_to_x && __have_sse4_1)
	1116	{
	1117	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	1118	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
	1119	_mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
	1120	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	1121	-0x80)));
	1122	}
	1123	else if constexpr (__x_to_x && __have_ssse3)
	1124	{
	1125	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
	1126	_mm_shuffle_epi8(
	1127	__i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
	1128	-0x80, -0x80, -0x80, -0x80, -0x80,
	1129	-0x80, -0x80, -0x80, -0x80)),
	1130	_mm_shuffle_epi8(
	1131	__i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
	1132	-0x80, -0x80, -0x80, -0x80, -0x80,
	1133	-0x80, -0x80, -0x80, -0x80))));
	1134	}
	1135	else if constexpr (__x_to_x)
	1136	{
	1137	return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
	1138	_Up(__v1[0]), _Up(__v1[1])};
	1139	}
	1140	else if constexpr (__y_to_x)
	1141	{
	1142	const auto __a = _mm256_shuffle_epi8(
	1143	_mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
	1144	_mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
	1145	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	1146	-0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
	1147	-0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
	1148	-0x80, -0x80, -0x80, -0x80));
	1149	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
	1150	} // __z_to_x uses concat fallback
	1151	}
	1152	else if constexpr (__i32_to_i16) //{{{2
	1153	{
	1154	if constexpr (__x_to_x)
	1155	{
	1156	// AVX2 is not available (would concat otherwise)
	1157	if constexpr (__have_sse4_1)
	1158	{
	1159	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	1160	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
	1161	_mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
	1162	11, 14, 15)));
	1163	}
	1164	else if constexpr (__have_ssse3)
	1165	{
	1166	return __intrin_bitcast<_To>(
	1167	_mm_hadd_epi16(__to_intrin(__v0 << 16),
	1168	__to_intrin(__v1 << 16)));
	1169	/*
	1170	return _mm_unpacklo_epi64(
	1171	_mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9,
	1172	12, 13, 8, 9, 12, 13, 12, 13, 14, 15)),
	1173	_mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12,
	1174	13, 8, 9, 12, 13, 12, 13, 14, 15)));
	1175	*/
	1176	}
	1177	else
	1178	{
	1179	auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15..
	1180	auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37..
	1181	auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 ....
	1182	auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 ....
	1183	return __intrin_bitcast<_To>(
	1184	_mm_unpacklo_epi16(__c, __d)); // 0123 4567
	1185	}
	1186	}
	1187	else if constexpr (__y_to_y)
	1188	{
	1189	const auto __shuf
	1190	= _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
	1191	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	1192	0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
	1193	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
	1194	auto __a = _mm256_shuffle_epi8(__i0, __shuf);
	1195	auto __b = _mm256_shuffle_epi8(__i1, __shuf);
	1196	return __intrin_bitcast<_To>(
	1197	__xzyw(_mm256_unpacklo_epi64(__a, __b)));
	1198	} // __z_to_z uses concat fallback
	1199	}
	1200	else if constexpr (__i32_to_i8) //{{{2
	1201	{
	1202	if constexpr (__x_to_x && __have_ssse3)
	1203	{
	1204	const auto shufmask
	1205	= _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
	1206	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
	1207	-0x80, -0x80);
	1208	return __intrin_bitcast<_To>(
	1209	_mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
	1210	_mm_shuffle_epi8(__i1, shufmask)));
	1211	}
	1212	else if constexpr (__x_to_x)
	1213	{
	1214	auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. ....
	1215	auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. ....
	1216	auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... ....
	1217	auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... ....
	1218	auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... ....
	1219	return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
	1220	}
	1221	else if constexpr (__y_to_x)
	1222	{
	1223	const auto __a = _mm256_shuffle_epi8(
	1224	_mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
	1225	_mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
	1226	6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
	1227	-0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
	1228	-0x80, -0x80, -0x80, 2, 6, 10, 14));
	1229	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
	1230	} // __z_to_y uses concat fallback
	1231	}
	1232	else if constexpr (__i16_to_i8) //{{{2
	1233	{
	1234	if constexpr (__x_to_x && __have_ssse3)
	1235	{
	1236	const auto __shuf = reinterpret_cast<__m128i>(
	1237	__vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
	1238	0x80, 0x80, 0x80, 0x80, 0x80,
	1239	0x80, 0x80});
	1240	return __intrin_bitcast<_To>(
	1241	_mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
	1242	_mm_shuffle_epi8(__i1, __shuf)));
	1243	}
	1244	else if constexpr (__x_to_x)
	1245	{
	1246	auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B..
	1247	auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F..
	1248	auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D ....
	1249	auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF ....
	1250	auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... ....
	1251	auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... ....
	1252	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
	1253	}
	1254	else if constexpr (__y_to_y)
	1255	{
	1256	return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
	1257	(__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
	1258	\| _mm256_slli_epi16(__i1, 8),
	1259	_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
	1260	13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
	1261	7, 9, 11, 13, 15))));
	1262	} // __z_to_z uses concat fallback
	1263	}
	1264	else if constexpr (__i64_to_f32) //{{{2
	1265	{
	1266	if constexpr (__x_to_x)
	1267	return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
	1268	else if constexpr (__y_to_y)
	1269	{
	1270	static_assert(__y_to_y && __have_avx2);
	1271	const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG
	1272	const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH
	1273	const auto __lo32
	1274	= _mm256_unpacklo_epi32(__a, __b); // abef cdgh
	1275	const auto __hi32 = __vector_bitcast<
	1276	conditional_t<is_signed_v<_Tp>, int, _UInt>>(
	1277	_mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH
	1278	const auto __hi
	1279	= 0x100000000LL
	1280	* __convert_x86<__vector_type_t<float, 8>>(__hi32);
	1281	const auto __mid
	1282	= 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
	1283	const auto __lo
	1284	= _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
	1285	return __xzyw((__hi + __mid) + __lo);
	1286	}
	1287	else if constexpr (__z_to_z && __have_avx512dq)
	1288	{
	1289	return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
	1290	_mm512_cvtepi64_ps(__i1))
	1291	: __concat(_mm512_cvtepu64_ps(__i0),
	1292	_mm512_cvtepu64_ps(__i1));
	1293	}
	1294	else if constexpr (__z_to_z && is_signed_v<_Tp>)
	1295	{
	1296	const __m512 __hi32 = _mm512_cvtepi32_ps(
	1297	__concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
	1298	_mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
	1299	const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
	1300	_mm512_cvtepi64_epi32(__i1));
	1301	// split low 32-bits, because if __hi32 is a small negative
	1302	// number, the 24-bit mantissa may lose important information if
	1303	// any of the high 8 bits of __lo32 is set, leading to
	1304	// catastrophic cancelation in the FMA
	1305	const __m512 __hi16
	1306	= _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
	1307	const __m512 __lo16
	1308	= _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
	1309	return (__hi32 * 0x100000000LL + __hi16) + __lo16;
	1310	}
	1311	else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
	1312	{
	1313	return __intrin_bitcast<_To>(
	1314	_mm512_cvtepu32_ps(__concat(
	1315	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
	1316	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
	1317	* 0x100000000LL
	1318	+ _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
	1319	_mm512_cvtepi64_epi32(__i1))));
	1320	}
	1321	}
	1322	else if constexpr (__f64_to_s32) //{{{2
	1323	{
	1324	// use concat fallback
	1325	}
	1326	else if constexpr (__f64_to_u32) //{{{2
	1327	{
	1328	if constexpr (__x_to_x && __have_sse4_1)
	1329	{
	1330	return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
	1331	_mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
	1332	_mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
	1333	^ 0x8000'0000u;
	1334	// without SSE4.1 just use the scalar fallback, it's only four
	1335	// values
	1336	}
	1337	else if constexpr (__y_to_y)
	1338	{
	1339	return __vector_bitcast<_Up>(
	1340	__concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
	1341	- 0x8000'0000u),
	1342	_mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
	1343	- 0x8000'0000u)))
	1344	^ 0x8000'0000u;
	1345	} // __z_to_z uses fallback
	1346	}
	1347	else if constexpr (__f64_to_ibw) //{{{2
	1348	{
	1349	// one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
	1350	// would go via two independet conversions to _SimdWrapper<_To> and
	1351	// subsequent interleaving. This is better, because f64->__i32
	1352	// allows to combine __v0 and __v1 into one register: if constexpr
	1353	// (__z_to_x \|\| __y_to_x) {
	1354	return __convert_x86<_To>(
	1355	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
	1356	//}
	1357	}
	1358	else if constexpr (__f32_to_ibw) //{{{2
	1359	{
	1360	return __convert_x86<_To>(
	1361	__convert_x86<__vector_type_t<int, _Np>>(__v0),
	1362	__convert_x86<__vector_type_t<int, _Np>>(__v1));
	1363	} //}}}
	1364
	1365	// fallback: {{{2
	1366	if constexpr (sizeof(_To) >= 32)
	1367	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
	1368	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
	1369	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
	1370	else if constexpr (sizeof(_To) == 16)
	1371	{
	1372	const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
	1373	const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
	1374	if constexpr (sizeof(_Up) * _Np == 8)
	1375	{
	1376	if constexpr (is_floating_point_v<_Up>)
	1377	return __auto_bitcast(
	1378	_mm_unpacklo_pd(__vector_bitcast<double>(__lo),
	1379	__vector_bitcast<double>(__hi)));
	1380	else
	1381	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
	1382	}
	1383	else if constexpr (sizeof(_Up) * _Np == 4)
	1384	{
	1385	if constexpr (is_floating_point_v<_Up>)
	1386	return __auto_bitcast(
	1387	_mm_unpacklo_ps(__vector_bitcast<float>(__lo),
	1388	__vector_bitcast<float>(__hi)));
	1389	else
	1390	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
	1391	}
	1392	else if constexpr (sizeof(_Up) * _Np == 2)
	1393	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
	1394	else
	1395	__assert_unreachable<_Tp>();
	1396	}
	1397	else
	1398	return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
	1399	//}}}
	1400	}
	1401	}
	1402
	1403	//}}}1
	1404	// 4-arg __convert_x86 {{{1
	1405	template <typename _To, typename _V, typename _Traits>
	1406	_GLIBCXX_SIMD_INTRINSIC _To
	1407	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
	1408	{
	1409	static_assert(__is_vector_type_v<_V>);
	1410	using _Tp = typename _Traits::value_type;
	1411	constexpr size_t _Np = _Traits::_S_full_size;
	1412	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
	1413	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
	1414	[[maybe_unused]] const auto __i2 = __to_intrin(__v2);
	1415	[[maybe_unused]] const auto __i3 = __to_intrin(__v3);
	1416	using _Up = typename _VectorTraits<_To>::value_type;
	1417	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
	1418
	1419	static_assert(4 * _Np <= _M,
	1420	"__v2/__v3 would be discarded; use the two/one-argument "
	1421	"__convert_x86 overload instead");
	1422
	1423	// [xyz]_to_[xyz] {{{2
	1424	[[maybe_unused]] constexpr bool __x_to_x
	1425	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
	1426	[[maybe_unused]] constexpr bool __x_to_y
	1427	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
	1428	[[maybe_unused]] constexpr bool __x_to_z
	1429	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
	1430	[[maybe_unused]] constexpr bool __y_to_x
	1431	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
	1432	[[maybe_unused]] constexpr bool __y_to_y
	1433	= sizeof(__v0) == 32 && sizeof(_To) == 32;
	1434	[[maybe_unused]] constexpr bool __y_to_z
	1435	= sizeof(__v0) == 32 && sizeof(_To) == 64;
	1436	[[maybe_unused]] constexpr bool __z_to_x
	1437	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
	1438	[[maybe_unused]] constexpr bool __z_to_y
	1439	= sizeof(__v0) == 64 && sizeof(_To) == 32;
	1440	[[maybe_unused]] constexpr bool __z_to_z
	1441	= sizeof(__v0) == 64 && sizeof(_To) == 64;
	1442
	1443	// iX_to_iX {{{2
	1444	[[maybe_unused]] constexpr bool __i_to_i
	1445	= is_integral_v<_Up> && is_integral_v<_Tp>;
	1446	[[maybe_unused]] constexpr bool __i8_to_i16
	1447	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
	1448	[[maybe_unused]] constexpr bool __i8_to_i32
	1449	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
	1450	[[maybe_unused]] constexpr bool __i8_to_i64
	1451	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
	1452	[[maybe_unused]] constexpr bool __i16_to_i8
	1453	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
	1454	[[maybe_unused]] constexpr bool __i16_to_i32
	1455	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
	1456	[[maybe_unused]] constexpr bool __i16_to_i64
	1457	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
	1458	[[maybe_unused]] constexpr bool __i32_to_i8
	1459	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
	1460	[[maybe_unused]] constexpr bool __i32_to_i16
	1461	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
	1462	[[maybe_unused]] constexpr bool __i32_to_i64
	1463	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
	1464	[[maybe_unused]] constexpr bool __i64_to_i8
	1465	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
	1466	[[maybe_unused]] constexpr bool __i64_to_i16
	1467	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
	1468	[[maybe_unused]] constexpr bool __i64_to_i32
	1469	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
	1470
	1471	// [fsu]X_to_[fsu]X {{{2
	1472	// ibw = integral && byte or word, i.e. char and short with any signedness
	1473	[[maybe_unused]] constexpr bool __i64_to_f32
	1474	= is_integral_v<_Tp> && sizeof(_Tp) == 8
	1475	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1476	[[maybe_unused]] constexpr bool __s32_to_f32
	1477	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	1478	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1479	[[maybe_unused]] constexpr bool __s16_to_f32
	1480	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
	1481	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1482	[[maybe_unused]] constexpr bool __s8_to_f32
	1483	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
	1484	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1485	[[maybe_unused]] constexpr bool __u32_to_f32
	1486	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	1487	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1488	[[maybe_unused]] constexpr bool __u16_to_f32
	1489	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
	1490	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1491	[[maybe_unused]] constexpr bool __u8_to_f32
	1492	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
	1493	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1494	[[maybe_unused]] constexpr bool __s64_to_f64
	1495	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
	1496	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1497	[[maybe_unused]] constexpr bool __s32_to_f64
	1498	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
	1499	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1500	[[maybe_unused]] constexpr bool __s16_to_f64
	1501	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
	1502	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1503	[[maybe_unused]] constexpr bool __s8_to_f64
	1504	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
	1505	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1506	[[maybe_unused]] constexpr bool __u64_to_f64
	1507	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
	1508	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1509	[[maybe_unused]] constexpr bool __u32_to_f64
	1510	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
	1511	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1512	[[maybe_unused]] constexpr bool __u16_to_f64
	1513	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
	1514	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1515	[[maybe_unused]] constexpr bool __u8_to_f64
	1516	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
	1517	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1518	[[maybe_unused]] constexpr bool __f32_to_s64
	1519	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	1520	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1521	[[maybe_unused]] constexpr bool __f32_to_s32
	1522	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	1523	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1524	[[maybe_unused]] constexpr bool __f32_to_u64
	1525	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	1526	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1527	[[maybe_unused]] constexpr bool __f32_to_u32
	1528	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	1529	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1530	[[maybe_unused]] constexpr bool __f64_to_s64
	1531	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
	1532	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1533	[[maybe_unused]] constexpr bool __f64_to_s32
	1534	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
	1535	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1536	[[maybe_unused]] constexpr bool __f64_to_u64
	1537	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
	1538	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1539	[[maybe_unused]] constexpr bool __f64_to_u32
	1540	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
	1541	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1542	[[maybe_unused]] constexpr bool __f32_to_ibw
	1543	= is_integral_v<_Up> && sizeof(_Up) <= 2
	1544	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
	1545	[[maybe_unused]] constexpr bool __f64_to_ibw
	1546	= is_integral_v<_Up> && sizeof(_Up) <= 2
	1547	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1548	[[maybe_unused]] constexpr bool __f32_to_f64
	1549	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
	1550	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
	1551	[[maybe_unused]] constexpr bool __f64_to_f32
	1552	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
	1553	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
	1554
	1555	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
	1556	{
	1557	// <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16>
	1558	return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
	1559	__hi128(__v1), __lo128(__v2), __hi128(__v2),
	1560	__lo128(__v3), __hi128(__v3));
	1561	}
	1562	else if constexpr (__i_to_i) // assert ISA {{{2
	1563	{
	1564	static_assert(__x_to_x \|\| __have_avx2,
	1565	"integral conversions with ymm registers require AVX2");
	1566	static_assert(__have_avx512bw
	1567	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
	1568	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
	1569	"8/16-bit integers in zmm registers require AVX512BW");
	1570	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
	1571	"integral conversions with ymm registers require AVX2");
	1572	}
	1573	// concat => use 2-arg __convert_x86 {{{2
	1574	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
	1575	\|\| (sizeof(__v0) == 16 && __have_avx
	1576	&& is_floating_point_v<_Tp>)
	1577	\|\| (sizeof(__v0) == 32 && __have_avx512f))
	1578	{
	1579	// The ISA can handle wider input registers, so concat and use two-arg
	1580	// implementation. This reduces code duplication considerably.
	1581	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3));
	1582	}
	1583	else //{{{2
	1584	{
	1585	// conversion using bit reinterpretation (or no conversion at all)
	1586	// should all go through the concat branch above:
	1587	static_assert(
	1588	!(is_floating_point_v<
	1589	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
	1590	// handle all zero extension{{{2
	1591	if constexpr (4 * _Np < _M && sizeof(_To) > 16)
	1592	{
	1593	constexpr size_t Min = 16 / sizeof(_Up);
	1594	return __zero_extend(
	1595	__convert_x86<
	1596	__vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>(
	1597	__v0, __v1, __v2, __v3));
	1598	}
	1599	else if constexpr (__i64_to_i16) //{{{2
	1600	{
	1601	if constexpr (__x_to_x && __have_sse4_1)
	1602	{
	1603	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	1604	_mm_blend_epi16(
	1605	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22),
	1606	_mm_blend_epi16(_mm_slli_si128(__i2, 4),
	1607	_mm_slli_si128(__i3, 6), 0x88),
	1608	0xcc),
	1609	_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
	1610	14, 15)));
	1611	}
	1612	else if constexpr (__y_to_y && __have_avx2)
	1613	{
	1614	return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
	1615	__xzyw(_mm256_blend_epi16(
	1616	__auto_bitcast(
	1617	_mm256_shuffle_ps(__vector_bitcast<float>(__v0),
	1618	__vector_bitcast<float>(__v2),
	1619	0x88)), // 0.1. 8.9. 2.3. A.B.
	1620	__to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps(
	1621	__vector_bitcast<float>(__v1),
	1622	__vector_bitcast<float>(__v3), 0x88))
	1623	<< 16), // .4.5 .C.D .6.7 .E.F
	1624	0xaa) // 0415 8C9D 2637 AEBF
	1625	), // 0415 2637 8C9D AEBF
	1626	_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
	1627	14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
	1628	10, 11, 14, 15)));
	/*
	auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. ....
	auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. .... 37.. ....
	auto __c = _mm256_unpacklo_epi16(__v2, __v3); // 8C.. .... AE.. ....
	auto __d = _mm256_unpackhi_epi16(__v2, __v3); // 9D.. .... BF.. ....
	auto __e = _mm256_unpacklo_epi16(__a, __b);   // 0145 .... 2367 ....
	auto __f = _mm256_unpacklo_epi16(__c, __d);   // 89CD .... ABEF ....
	auto __g = _mm256_unpacklo_epi64(__e, __f);   // 0145 89CD 2367 ABEF
	return __concat(
	  _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
	  _mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 4567 89AB CDEF
	*/
	1646	} // else use fallback
	1647	}
	1648	else if constexpr (__i64_to_i8) //{{{2
	1649	{
	1650	if constexpr (__x_to_x)
	1651	{
	1652	// TODO: use fallback for now
	1653	}
	1654	else if constexpr (__y_to_x)
	1655	{
	1656	auto __a
	1657	= _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24)
	1658	\| _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16)
	1659	\| _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8)
	1660	\| _mm256_slli_epi32(
	1661	__i3, 24); // 048C .... 159D .... 26AE .... 37BF ....
	1662	/*return _mm_shuffle_epi8(
	1663	_mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5),
	1664	_mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15,
	1665	3, 11));*/
	1666	auto __b = _mm256_unpackhi_epi64(
	1667	__a, __a); // 159D .... 159D .... 37BF .... 37BF ....
	1668	auto __c = _mm256_unpacklo_epi8(
	1669	__a, __b); // 0145 89CD .... .... 2367 ABEF .... ....
	1670	return __intrin_bitcast<_To>(
	1671	_mm_unpacklo_epi16(__lo128(__c),
	1672	__hi128(__c))); // 0123 4567 89AB CDEF
	1673	}
	1674	}
	1675	else if constexpr (__i32_to_i8) //{{{2
	1676	{
	1677	if constexpr (__x_to_x)
	1678	{
	1679	if constexpr (__have_ssse3)
	1680	{
	1681	const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff;
	1682	const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff)
	1683	<< 8;
	1684	const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff)
	1685	<< 16;
	1686	const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24;
	1687	return __intrin_bitcast<_To>(
	1688	_mm_shuffle_epi8(__to_intrin(__x0 \| __x1 \| __x2 \| __x3),
	1689	_mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
	1690	2, 6, 10, 14, 3, 7, 11,
	1691	15)));
	1692	}
	1693	else
	1694	{
	1695	auto __a
	1696	= _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. ....
	1697	auto __b
	1698	= _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. ....
	1699	auto __c
	1700	= _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. ....
	1701	auto __d
	1702	= _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. ....
	1703	auto __e
	1704	= _mm_unpacklo_epi8(__a, __c); // 048C .... .... ....
	1705	auto __f
	1706	= _mm_unpackhi_epi8(__a, __c); // 159D .... .... ....
	1707	auto __g
	1708	= _mm_unpacklo_epi8(__b, __d); // 26AE .... .... ....
	1709	auto __h
	1710	= _mm_unpackhi_epi8(__b, __d); // 37BF .... .... ....
	1711	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(
	1712	_mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... ....
	1713	_mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... ....
	1714	)); // 0123 4567 89AB CDEF
	1715	}
	1716	}
	1717	else if constexpr (__y_to_y)
	1718	{
	1719	const auto __a = _mm256_shuffle_epi8(
	1720	__to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
	1721	__i0, _mm256_slli_epi32(__i1, 16), 0xAA))
	1722	& 0xff)
	1723	\| (__vector_bitcast<_UShort>(_mm256_blend_epi16(
	1724	__i2, _mm256_slli_epi32(__i3, 16), 0xAA))
	1725	<< 8)),
	1726	_mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
	1727	11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
	1728	13, 3, 7, 11, 15));
	1729	return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32(
	1730	__a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
	1731	}
	1732	}
	1733	else if constexpr (__i64_to_f32) //{{{2
	1734	{
	1735	// this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm
	1736	// integers)
	1737	if constexpr (__x_to_y)
	1738	{
	1739	return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1],
	1740	__v2[0], __v2[1], __v3[0],
	1741	__v3[1]);
	1742
	1743	const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC
	1744	const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD
	1745	const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG
	1746	const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH
	1747	const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd
	1748	const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh
	1749	const auto __hi32 = __vector_bitcast<
	1750	conditional_t<is_signed_v<_Tp>, int, _UInt>>(
	1751	__concat(_mm_unpackhi_epi32(__a, __b),
	1752	_mm_unpackhi_epi32(__c, __d))); // ABCD EFGH
	1753	const auto __hi
	1754	= 0x100000000LL
	1755	* __convert_x86<__vector_type_t<float, 8>>(__hi32);
	1756	const auto __mid
	1757	= 0x10000
	1758	* _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16),
	1759	_mm_srli_epi32(__lo32b, 16)));
	1760	const auto __lo = _mm256_cvtepi32_ps(
	1761	__concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
	1762	_mm_set1_epi32(0x0000ffffu) & __lo32b));
	1763	return (__hi + __mid) + __lo;
	1764	}
	1765	}
	1766	else if constexpr (__f64_to_ibw) //{{{2
	1767	{
	1768	return __convert_x86<_To>(
	1769	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
	1770	__convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3));
	1771	}
	1772	else if constexpr (__f32_to_ibw) //{{{2
	1773	{
	1774	return __convert_x86<_To>(
	1775	__convert_x86<__vector_type_t<int, _Np>>(__v0),
	1776	__convert_x86<__vector_type_t<int, _Np>>(__v1),
	1777	__convert_x86<__vector_type_t<int, _Np>>(__v2),
	1778	__convert_x86<__vector_type_t<int, _Np>>(__v3));
	1779	} //}}}
	1780
	1781	// fallback: {{{2
	1782	if constexpr (sizeof(_To) >= 32)
	1783	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
	1784	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0,
	1785	__v1),
	1786	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v2,
	1787	__v3));
	1788	else if constexpr (sizeof(_To) == 16)
	1789	{
	1790	const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1));
	1791	const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3));
	1792	if constexpr (sizeof(_Up) * _Np * 2 == 8)
	1793	{
	1794	if constexpr (is_floating_point_v<_Up>)
	1795	return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi));
	1796	else
	1797	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
	1798	}
	1799	else if constexpr (sizeof(_Up) * _Np * 2 == 4)
	1800	{
	1801	if constexpr (is_floating_point_v<_Up>)
	1802	return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi));
	1803	else
	1804	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
	1805	}
	1806	else
	1807	__assert_unreachable<_Tp>();
	1808	}
	1809	else
	1810	return __vector_convert<_To>(__v0, __v1, __v2, __v3,
	1811	make_index_sequence<_Np>());
	1812	//}}}2
	1813	}
	1814	}
	1815
	1816	//}}}
	1817	// 8-arg __convert_x86 {{{1
	1818	template <typename _To, typename _V, typename _Traits>
	1819	_GLIBCXX_SIMD_INTRINSIC _To
	1820	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
	1821	_V __v7)
	1822	{
	1823	static_assert(__is_vector_type_v<_V>);
	1824	using _Tp = typename _Traits::value_type;
	1825	constexpr size_t _Np = _Traits::_S_full_size;
	1826	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
	1827	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
	1828	[[maybe_unused]] const auto __i2 = __to_intrin(__v2);
	1829	[[maybe_unused]] const auto __i3 = __to_intrin(__v3);
	1830	[[maybe_unused]] const auto __i4 = __to_intrin(__v4);
	1831	[[maybe_unused]] const auto __i5 = __to_intrin(__v5);
	1832	[[maybe_unused]] const auto __i6 = __to_intrin(__v6);
	1833	[[maybe_unused]] const auto __i7 = __to_intrin(__v7);
	1834	using _Up = typename _VectorTraits<_To>::value_type;
	1835	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
	1836
	1837	static_assert(8 * _Np <= _M,
	1838	"__v4-__v7 would be discarded; use the four/two/one-argument "
	1839	"__convert_x86 overload instead");
	1840
	1841	// [xyz]_to_[xyz] {{{2
	1842	[[maybe_unused]] constexpr bool __x_to_x
	1843	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
	1844	[[maybe_unused]] constexpr bool __x_to_y
	1845	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
	1846	[[maybe_unused]] constexpr bool __x_to_z
	1847	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
	1848	[[maybe_unused]] constexpr bool __y_to_x
	1849	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
	1850	[[maybe_unused]] constexpr bool __y_to_y
	1851	= sizeof(__v0) == 32 && sizeof(_To) == 32;
	1852	[[maybe_unused]] constexpr bool __y_to_z
	1853	= sizeof(__v0) == 32 && sizeof(_To) == 64;
	1854	[[maybe_unused]] constexpr bool __z_to_x
	1855	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
	1856	[[maybe_unused]] constexpr bool __z_to_y
	1857	= sizeof(__v0) == 64 && sizeof(_To) == 32;
	1858	[[maybe_unused]] constexpr bool __z_to_z
	1859	= sizeof(__v0) == 64 && sizeof(_To) == 64;
	1860
	1861	// [if]X_to_i8 {{{2
	1862	[[maybe_unused]] constexpr bool __i_to_i
	1863	= is_integral_v<_Up> && is_integral_v<_Tp>;
	1864	[[maybe_unused]] constexpr bool __i64_to_i8
	1865	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
	1866	[[maybe_unused]] constexpr bool __f64_to_i8
	1867	= is_integral_v<_Up> && sizeof(_Up) == 1
	1868	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
	1869
	1870	if constexpr (__i_to_i) // assert ISA {{{2
	1871	{
	1872	static_assert(__x_to_x \|\| __have_avx2,
	1873	"integral conversions with ymm registers require AVX2");
	1874	static_assert(__have_avx512bw
	1875	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
	1876	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
	1877	"8/16-bit integers in zmm registers require AVX512BW");
	1878	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
	1879	"integral conversions with ymm registers require AVX2");
	1880	}
	1881	// concat => use 4-arg __convert_x86 {{{2
	1882	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
	1883	\|\| (sizeof(__v0) == 16 && __have_avx
	1884	&& is_floating_point_v<_Tp>)
	1885	\|\| (sizeof(__v0) == 32 && __have_avx512f))
	1886	{
	1887	// The ISA can handle wider input registers, so concat and use two-arg
	1888	// implementation. This reduces code duplication considerably.
	1889	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
	1890	__concat(__v4, __v5), __concat(__v6, __v7));
	1891	}
	1892	else //{{{2
	1893	{
	1894	// conversion using bit reinterpretation (or no conversion at all)
	1895	// should all go through the concat branch above:
	1896	static_assert(
	1897	!(is_floating_point_v<
	1898	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
	1899	static_assert(!(8 * _Np < _M && sizeof(_To) > 16),
	1900	"zero extension should be impossible");
	1901	if constexpr (__i64_to_i8) //{{{2
	1902	{
	1903	if constexpr (__x_to_x && __have_ssse3)
	1904	{
	1905	// unsure whether this is better than the variant below
	1906	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
	1907	__to_intrin(
	1908	(((__v0 & 0xff) \| ((__v1 & 0xff) << 8))
	1909	\| (((__v2 & 0xff) << 16) \| ((__v3 & 0xff) << 24)))
	1910	\| ((((__v4 & 0xff) << 32) \| ((__v5 & 0xff) << 40))
	1911	\| (((__v6 & 0xff) << 48) \| (__v7 << 56)))),
	1912	_mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
	1913	7, 15)));
	1914	}
	1915	else if constexpr (__x_to_x)
	1916	{
	1917	const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac
	1918	const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd
	1919	const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg
	1920	const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh
	1921	const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik
	1922	const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl
	1923	const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo
	1924	const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np
	1925	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(
	1926	_mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd
	1927	_mm_unpacklo_epi8(__c, __d)), // efgh
	1928	_mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl
	1929	_mm_unpacklo_epi8(__g, __h)) // mnop
	1930	));
	1931	}
	1932	else if constexpr (__y_to_y)
	1933	{
	1934	auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV
	1935	__to_intrin(
	1936	(((__v0 & 0xff) \| ((__v1 & 0xff) << 8))
	1937	\| (((__v2 & 0xff) << 16) \| ((__v3 & 0xff) << 24)))
	1938	\| ((((__v4 & 0xff) << 32) \| ((__v5 & 0xff) << 40))
	1939	\| (((__v6 & 0xff) << 48) \| ((__v7 << 56)))));
	1940	/*
	1941	auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D
	1942	HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a,
	1943	__b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d =
	1944	__xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return
	1945	_mm256_shuffle_epi8(
	1946	__d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
	1947	13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
	1948	14, 15));
	1949	*/
	1950	auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF
	1951	// IJMN QRUV
	1952	__a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
	1953	6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
	1954	4, 12, 5, 13, 6, 14, 7, 15));
	1955	auto __c
	1956	= __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
	1957	return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
	1958	__c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
	1959	6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
	1960	4, 5, 12, 13, 6, 7, 14, 15)));
	1961	}
	1962	else if constexpr (__z_to_z)
	1963	{
	1964	return __concat(
	1965	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2,
	1966	__v3),
	1967	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
	1968	__v7));
	1969	}
	1970	}
	1971	else if constexpr (__f64_to_i8) //{{{2
	1972	{
	1973	return __convert_x86<_To>(
	1974	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
	1975	__convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3),
	1976	__convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5),
	1977	__convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7));
	1978	}
	1979	else // unreachable {{{2
	1980	__assert_unreachable<_Tp>();
	1981	//}}}
	1982
	1983	// fallback: {{{2
	1984	if constexpr (sizeof(_To) >= 32)
	1985	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
	1986	return __concat(
	1987	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3),
	1988	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
	1989	__v7));
	1990	else if constexpr (sizeof(_To) == 16)
	1991	{
	1992	const auto __lo
	1993	= __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3));
	1994	const auto __hi
	1995	= __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7));
	1996	static_assert(sizeof(_Up) == 1 && _Np == 2);
	1997	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
	1998	}
	1999	else
	2000	{
	2001	__assert_unreachable<_Tp>();
	2002	// return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5,
	2003	// __v6, __v7,
	2004	// make_index_sequence<_Np>());
	2005	} //}}}2
	2006	}
	2007	}
	2008
	2009	//}}}
	2010	// 16-arg __convert_x86 {{{1
	2011	template <typename _To, typename _V, typename _Traits>
	2012	_GLIBCXX_SIMD_INTRINSIC _To
	2013	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
	2014	_V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12,
	2015	_V __v13, _V __v14, _V __v15)
	2016	{
	2017	// concat => use 8-arg __convert_x86
	2018	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
	2019	__concat(__v4, __v5), __concat(__v6, __v7),
	2020	__concat(__v8, __v9), __concat(__v10, __v11),
	2021	__concat(__v12, __v13), __concat(__v14, __v15));
	2022	}
	2023
	2024	//}}}
	2025
	2026	#endif // __cplusplus >= 201703L
	2027	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
	2028
	2029	// vim: foldmethod=marker

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Daodan/MSYS2/mingw32/include/c++/11.2.0/experimental/bits/simd_x86_conversions.h

Download in other formats: