Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log
Repository URL

simd_x86_conversions.h@ 1181

Last change on this file since 1181 was 1166, checked in by rossy, 3 years ago
Daodan: Replace MinGW build env with an up-to-date MSYS2 env
File size: 80.5 KB

Line
1	// x86 specific conversion optimizations -*- C++ -*-
2
3	// Copyright (C) 2020-2021 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
26	#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
27
28	#if __cplusplus >= 201703L
29
30	// work around PR85827
31	// 1-arg __convert_x86 {{{1
32	template <typename _To, typename _V, typename _Traits>
33	_GLIBCXX_SIMD_INTRINSIC _To
34	__convert_x86(_V __v)
35	{
36	static_assert(__is_vector_type_v<_V>);
37	using _Tp = typename _Traits::value_type;
38	constexpr size_t _Np = _Traits::_S_full_size;
39	[[maybe_unused]] const auto __intrin = __to_intrin(__v);
40	using _Up = typename _VectorTraits<_To>::value_type;
41	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
42
43	// [xyz]_to_[xyz] {{{2
44	[[maybe_unused]] constexpr bool __x_to_x
45	= sizeof(__v) <= 16 && sizeof(_To) <= 16;
46	[[maybe_unused]] constexpr bool __x_to_y
47	= sizeof(__v) <= 16 && sizeof(_To) == 32;
48	[[maybe_unused]] constexpr bool __x_to_z
49	= sizeof(__v) <= 16 && sizeof(_To) == 64;
50	[[maybe_unused]] constexpr bool __y_to_x
51	= sizeof(__v) == 32 && sizeof(_To) <= 16;
52	[[maybe_unused]] constexpr bool __y_to_y
53	= sizeof(__v) == 32 && sizeof(_To) == 32;
54	[[maybe_unused]] constexpr bool __y_to_z
55	= sizeof(__v) == 32 && sizeof(_To) == 64;
56	[[maybe_unused]] constexpr bool __z_to_x
57	= sizeof(__v) == 64 && sizeof(_To) <= 16;
58	[[maybe_unused]] constexpr bool __z_to_y
59	= sizeof(__v) == 64 && sizeof(_To) == 32;
60	[[maybe_unused]] constexpr bool __z_to_z
61	= sizeof(__v) == 64 && sizeof(_To) == 64;
62
63	// iX_to_iX {{{2
64	[[maybe_unused]] constexpr bool __i_to_i
65	= is_integral_v<_Up> && is_integral_v<_Tp>;
66	[[maybe_unused]] constexpr bool __i8_to_i16
67	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
68	[[maybe_unused]] constexpr bool __i8_to_i32
69	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
70	[[maybe_unused]] constexpr bool __i8_to_i64
71	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
72	[[maybe_unused]] constexpr bool __i16_to_i8
73	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
74	[[maybe_unused]] constexpr bool __i16_to_i32
75	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
76	[[maybe_unused]] constexpr bool __i16_to_i64
77	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
78	[[maybe_unused]] constexpr bool __i32_to_i8
79	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
80	[[maybe_unused]] constexpr bool __i32_to_i16
81	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
82	[[maybe_unused]] constexpr bool __i32_to_i64
83	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
84	[[maybe_unused]] constexpr bool __i64_to_i8
85	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
86	[[maybe_unused]] constexpr bool __i64_to_i16
87	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
88	[[maybe_unused]] constexpr bool __i64_to_i32
89	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
90
91	// [fsu]X_to_[fsu]X {{{2
92	// ibw = integral && byte or word, i.e. char and short with any signedness
93	[[maybe_unused]] constexpr bool __s64_to_f32
94	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
95	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
96	[[maybe_unused]] constexpr bool __s32_to_f32
97	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
98	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
99	[[maybe_unused]] constexpr bool __s16_to_f32
100	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
101	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
102	[[maybe_unused]] constexpr bool __s8_to_f32
103	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
104	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
105	[[maybe_unused]] constexpr bool __u64_to_f32
106	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
107	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
108	[[maybe_unused]] constexpr bool __u32_to_f32
109	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
110	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
111	[[maybe_unused]] constexpr bool __u16_to_f32
112	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
113	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
114	[[maybe_unused]] constexpr bool __u8_to_f32
115	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
116	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
117	[[maybe_unused]] constexpr bool __s64_to_f64
118	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
119	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
120	[[maybe_unused]] constexpr bool __s32_to_f64
121	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
122	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
123	[[maybe_unused]] constexpr bool __u64_to_f64
124	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
125	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
126	[[maybe_unused]] constexpr bool __u32_to_f64
127	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
128	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
129	[[maybe_unused]] constexpr bool __f32_to_s64
130	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
131	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
132	[[maybe_unused]] constexpr bool __f32_to_s32
133	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
134	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
135	[[maybe_unused]] constexpr bool __f32_to_u64
136	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
137	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
138	[[maybe_unused]] constexpr bool __f32_to_u32
139	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
140	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
141	[[maybe_unused]] constexpr bool __f64_to_s64
142	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
143	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
144	[[maybe_unused]] constexpr bool __f64_to_s32
145	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
146	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
147	[[maybe_unused]] constexpr bool __f64_to_u64
148	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
149	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
150	[[maybe_unused]] constexpr bool __f64_to_u32
151	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
152	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
153	[[maybe_unused]] constexpr bool __ibw_to_f32
154	= is_integral_v<_Tp> && sizeof(_Tp) <= 2
155	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
156	[[maybe_unused]] constexpr bool __ibw_to_f64
157	= is_integral_v<_Tp> && sizeof(_Tp) <= 2
158	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
159	[[maybe_unused]] constexpr bool __f32_to_ibw
160	= is_integral_v<_Up> && sizeof(_Up) <= 2
161	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
162	[[maybe_unused]] constexpr bool __f64_to_ibw
163	= is_integral_v<_Up> && sizeof(_Up) <= 2
164	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
165	[[maybe_unused]] constexpr bool __f32_to_f64
166	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
167	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
168	[[maybe_unused]] constexpr bool __f64_to_f32
169	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
170	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
171
172	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
173	return __convert_x86<_To>(__lo128(__v), __hi128(__v));
174	else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2
175	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
176	__convert_x86<__vector_type_t<_Up, _M / 2>>(
177	__extract_part<1, _Np / _M * 2>(__v)));
178	else if constexpr (__i_to_i) //{{{2
179	{
180	static_assert(__x_to_x \|\| __have_avx2,
181	"integral conversions with ymm registers require AVX2");
182	static_assert(__have_avx512bw
183	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v) < 64)
184	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
185	"8/16-bit integers in zmm registers require AVX512BW");
186	static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
187	"integral conversions with ymm registers require AVX2");
188	}
189	if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2
190	sizeof(_Tp) == sizeof(_Up))
191	{
192	// conversion uses simple bit reinterpretation (or no conversion at all)
193	if constexpr (_Np >= _M)
194	return __intrin_bitcast<_To>(__v);
195	else
196	return __zero_extend(__vector_bitcast<_Up>(__v));
197	}
198	else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2
199	// zero extend (eg. xmm -> ymm)
200	return __zero_extend(
201	__convert_x86<__vector_type_t<
202	_Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
203	else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2
204	// partial input (eg. ymm -> xmm)
205	return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
206	else if constexpr (__i64_to_i32) //{{{2
207	{
208	if constexpr (__x_to_x && __have_avx512vl)
209	return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
210	else if constexpr (__x_to_x)
211	return __auto_bitcast(
212	_mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
213	else if constexpr (__y_to_x && __have_avx512vl)
214	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
215	else if constexpr (__y_to_x && __have_avx512f)
216	return __intrin_bitcast<_To>(
217	__lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
218	else if constexpr (__y_to_x)
219	return __intrin_bitcast<_To>(
220	__lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
221	0 + 4 * 2)));
222	else if constexpr (__z_to_y)
223	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
224	}
225	else if constexpr (__i64_to_i16) //{{{2
226	{
227	if constexpr (__x_to_x && __have_avx512vl)
228	return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
229	else if constexpr (__x_to_x && __have_avx512f)
230	return __intrin_bitcast<_To>(
231	__lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
232	else if constexpr (__x_to_x && __have_ssse3)
233	{
234	return __intrin_bitcast<_To>(
235	_mm_shuffle_epi8(__intrin,
236	_mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
237	-0x80, -0x80, -0x80, -0x80, -0x80,
238	-0x80, -0x80, -0x80, -0x80)));
239	// fallback without SSSE3
240	}
241	else if constexpr (__y_to_x && __have_avx512vl)
242	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
243	else if constexpr (__y_to_x && __have_avx512f)
244	return __intrin_bitcast<_To>(
245	__lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
246	else if constexpr (__y_to_x)
247	{
248	const auto __a = _mm256_shuffle_epi8(
249	__intrin,
250	_mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
251	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
252	-0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
253	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
254	-0x80));
255	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
256	}
257	else if constexpr (__z_to_x)
258	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
259	}
260	else if constexpr (__i64_to_i8) //{{{2
261	{
262	if constexpr (__x_to_x && __have_avx512vl)
263	return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
264	else if constexpr (__x_to_x && __have_avx512f)
265	return __intrin_bitcast<_To>(
266	__lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
267	else if constexpr (__y_to_x && __have_avx512vl)
268	return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
269	else if constexpr (__y_to_x && __have_avx512f)
270	return __intrin_bitcast<_To>(
271	_mm512_cvtepi64_epi8(__zero_extend(__intrin)));
272	else if constexpr (__z_to_x)
273	return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
274	}
275	else if constexpr (__i32_to_i64) //{{{2
276	{
277	if constexpr (__have_sse4_1 && __x_to_x)
278	return __intrin_bitcast<_To>(is_signed_v<_Tp>
279	? _mm_cvtepi32_epi64(__intrin)
280	: _mm_cvtepu32_epi64(__intrin));
281	else if constexpr (__x_to_x)
282	{
283	return __intrin_bitcast<_To>(
284	_mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
285	? _mm_srai_epi32(__intrin, 31)
286	: __m128i()));
287	}
288	else if constexpr (__x_to_y)
289	return __intrin_bitcast<_To>(is_signed_v<_Tp>
290	? _mm256_cvtepi32_epi64(__intrin)
291	: _mm256_cvtepu32_epi64(__intrin));
292	else if constexpr (__y_to_z)
293	return __intrin_bitcast<_To>(is_signed_v<_Tp>
294	? _mm512_cvtepi32_epi64(__intrin)
295	: _mm512_cvtepu32_epi64(__intrin));
296	}
297	else if constexpr (__i32_to_i16) //{{{2
298	{
299	if constexpr (__x_to_x && __have_avx512vl)
300	return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
301	else if constexpr (__x_to_x && __have_avx512f)
302	return __intrin_bitcast<_To>(
303	__lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
304	else if constexpr (__x_to_x && __have_ssse3)
305	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
306	__intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
307	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
308	else if constexpr (__x_to_x)
309	{
310	auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o
311	auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o
312	auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo
313	auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo
314	return __intrin_bitcast<_To>(
315	_mm_unpacklo_epi16(__c, __d)); // 0123 oooo
316	}
317	else if constexpr (__y_to_x && __have_avx512vl)
318	return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
319	else if constexpr (__y_to_x && __have_avx512f)
320	return __intrin_bitcast<_To>(
321	__lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
322	else if constexpr (__y_to_x)
323	{
324	auto __a = _mm256_shuffle_epi8(
325	__intrin,
326	_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
327	-0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
328	9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
329	-0x80, -0x80, -0x80));
330	return __intrin_bitcast<_To>(__lo128(
331	_mm256_permute4x64_epi64(__a,
332	0xf8))); // __a[0] __a[2] \| __a[3] __a[3]
333	}
334	else if constexpr (__z_to_y)
335	return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
336	}
337	else if constexpr (__i32_to_i8) //{{{2
338	{
339	if constexpr (__x_to_x && __have_avx512vl)
340	return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
341	else if constexpr (__x_to_x && __have_avx512f)
342	return __intrin_bitcast<_To>(
343	__lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
344	else if constexpr (__x_to_x && __have_ssse3)
345	{
346	return __intrin_bitcast<_To>(
347	_mm_shuffle_epi8(__intrin,
348	_mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
349	-0x80, -0x80, -0x80, -0x80, -0x80,
350	-0x80, -0x80, -0x80, -0x80)));
351	}
352	else if constexpr (__x_to_x)
353	{
354	const auto __a
355	= _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... ....
356	const auto __b
357	= _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... ....
358	const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... ....
359	const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... ....
360	const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... ....
361	return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
362	}
363	else if constexpr (__y_to_x && __have_avx512vl)
364	return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
365	else if constexpr (__y_to_x && __have_avx512f)
366	return __intrin_bitcast<_To>(
367	_mm512_cvtepi32_epi8(__zero_extend(__intrin)));
368	else if constexpr (__z_to_x)
369	return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
370	}
371	else if constexpr (__i16_to_i64) //{{{2
372	{
373	if constexpr (__x_to_x && __have_sse4_1)
374	return __intrin_bitcast<_To>(is_signed_v<_Tp>
375	? _mm_cvtepi16_epi64(__intrin)
376	: _mm_cvtepu16_epi64(__intrin));
377	else if constexpr (__x_to_x && is_signed_v<_Tp>)
378	{
379	auto __x = _mm_srai_epi16(__intrin, 15);
380	auto __y = _mm_unpacklo_epi16(__intrin, __x);
381	__x = _mm_unpacklo_epi16(__x, __x);
382	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
383	}
384	else if constexpr (__x_to_x)
385	return __intrin_bitcast<_To>(
386	_mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
387	__m128i()));
388	else if constexpr (__x_to_y)
389	return __intrin_bitcast<_To>(is_signed_v<_Tp>
390	? _mm256_cvtepi16_epi64(__intrin)
391	: _mm256_cvtepu16_epi64(__intrin));
392	else if constexpr (__x_to_z)
393	return __intrin_bitcast<_To>(is_signed_v<_Tp>
394	? _mm512_cvtepi16_epi64(__intrin)
395	: _mm512_cvtepu16_epi64(__intrin));
396	}
397	else if constexpr (__i16_to_i32) //{{{2
398	{
399	if constexpr (__x_to_x && __have_sse4_1)
400	return __intrin_bitcast<_To>(is_signed_v<_Tp>
401	? _mm_cvtepi16_epi32(__intrin)
402	: _mm_cvtepu16_epi32(__intrin));
403	else if constexpr (__x_to_x && is_signed_v<_Tp>)
404	return __intrin_bitcast<_To>(
405	_mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
406	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
407	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
408	else if constexpr (__x_to_y)
409	return __intrin_bitcast<_To>(is_signed_v<_Tp>
410	? _mm256_cvtepi16_epi32(__intrin)
411	: _mm256_cvtepu16_epi32(__intrin));
412	else if constexpr (__y_to_z)
413	return __intrin_bitcast<_To>(is_signed_v<_Tp>
414	? _mm512_cvtepi16_epi32(__intrin)
415	: _mm512_cvtepu16_epi32(__intrin));
416	}
417	else if constexpr (__i16_to_i8) //{{{2
418	{
419	if constexpr (__x_to_x && __have_avx512bw_vl)
420	return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
421	else if constexpr (__x_to_x && __have_avx512bw)
422	return __intrin_bitcast<_To>(
423	__lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
424	else if constexpr (__x_to_x && __have_ssse3)
425	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
426	__intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
427	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428	else if constexpr (__x_to_x)
429	{
430	auto __a
431	= _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33..
432	auto __b
433	= _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77..
434	auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 ....
435	auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 ....
436	auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... ....
437	auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... ....
438	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
439	}
440	else if constexpr (__y_to_x && __have_avx512bw_vl)
441	return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
442	else if constexpr (__y_to_x && __have_avx512bw)
443	return __intrin_bitcast<_To>(
444	__lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
445	else if constexpr (__y_to_x)
446	{
447	auto __a = _mm256_shuffle_epi8(
448	__intrin,
449	_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
450	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
451	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
452	4, 6, 8, 10, 12, 14));
453	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
454	}
455	else if constexpr (__z_to_y && __have_avx512bw)
456	return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
457	else if constexpr (__z_to_y)
458	__assert_unreachable<_Tp>();
459	}
460	else if constexpr (__i8_to_i64) //{{{2
461	{
462	if constexpr (__x_to_x && __have_sse4_1)
463	return __intrin_bitcast<_To>(is_signed_v<_Tp>
464	? _mm_cvtepi8_epi64(__intrin)
465	: _mm_cvtepu8_epi64(__intrin));
466	else if constexpr (__x_to_x && is_signed_v<_Tp>)
467	{
468	if constexpr (__have_ssse3)
469	{
470	auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
471	auto __epi16 = _mm_srai_epi16(__dup, 8);
472	_mm_shuffle_epi8(__epi16,
473	_mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
474	3, 3, 3, 3, 3));
475	}
476	else
477	{
478	auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
479	__x = _mm_unpacklo_epi16(__x, __x);
480	return __intrin_bitcast<_To>(
481	_mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
482	_mm_srai_epi32(__x, 31)));
483	}
484	}
485	else if constexpr (__x_to_x)
486	{
487	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
488	_mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
489	__m128i()),
490	__m128i()));
491	}
492	else if constexpr (__x_to_y)
493	return __intrin_bitcast<_To>(is_signed_v<_Tp>
494	? _mm256_cvtepi8_epi64(__intrin)
495	: _mm256_cvtepu8_epi64(__intrin));
496	else if constexpr (__x_to_z)
497	return __intrin_bitcast<_To>(is_signed_v<_Tp>
498	? _mm512_cvtepi8_epi64(__intrin)
499	: _mm512_cvtepu8_epi64(__intrin));
500	}
501	else if constexpr (__i8_to_i32) //{{{2
502	{
503	if constexpr (__x_to_x && __have_sse4_1)
504	return __intrin_bitcast<_To>(is_signed_v<_Tp>
505	? _mm_cvtepi8_epi32(__intrin)
506	: _mm_cvtepu8_epi32(__intrin));
507	else if constexpr (__x_to_x && is_signed_v<_Tp>)
508	{
509	const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
510	return __intrin_bitcast<_To>(
511	_mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
512	}
513	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
514	return __intrin_bitcast<_To>(
515	_mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
516	__m128i()));
517	else if constexpr (__x_to_y)
518	return __intrin_bitcast<_To>(is_signed_v<_Tp>
519	? _mm256_cvtepi8_epi32(__intrin)
520	: _mm256_cvtepu8_epi32(__intrin));
521	else if constexpr (__x_to_z)
522	return __intrin_bitcast<_To>(is_signed_v<_Tp>
523	? _mm512_cvtepi8_epi32(__intrin)
524	: _mm512_cvtepu8_epi32(__intrin));
525	}
526	else if constexpr (__i8_to_i16) //{{{2
527	{
528	if constexpr (__x_to_x && __have_sse4_1)
529	return __intrin_bitcast<_To>(is_signed_v<_Tp>
530	? _mm_cvtepi8_epi16(__intrin)
531	: _mm_cvtepu8_epi16(__intrin));
532	else if constexpr (__x_to_x && is_signed_v<_Tp>)
533	return __intrin_bitcast<_To>(
534	_mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
535	else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
536	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
537	else if constexpr (__x_to_y)
538	return __intrin_bitcast<_To>(is_signed_v<_Tp>
539	? _mm256_cvtepi8_epi16(__intrin)
540	: _mm256_cvtepu8_epi16(__intrin));
541	else if constexpr (__y_to_z && __have_avx512bw)
542	return __intrin_bitcast<_To>(is_signed_v<_Tp>
543	? _mm512_cvtepi8_epi16(__intrin)
544	: _mm512_cvtepu8_epi16(__intrin));
545	else if constexpr (__y_to_z)
546	__assert_unreachable<_Tp>();
547	}
548	else if constexpr (__f32_to_s64) //{{{2
549	{
550	if constexpr (__have_avx512dq_vl && __x_to_x)
551	return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
552	else if constexpr (__have_avx512dq_vl && __x_to_y)
553	return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
554	else if constexpr (__have_avx512dq && __y_to_z)
555	return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
556	// else use scalar fallback
557	}
558	else if constexpr (__f32_to_u64) //{{{2
559	{
560	if constexpr (__have_avx512dq_vl && __x_to_x)
561	return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
562	else if constexpr (__have_avx512dq_vl && __x_to_y)
563	return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
564	else if constexpr (__have_avx512dq && __y_to_z)
565	return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
566	// else use scalar fallback
567	}
568	else if constexpr (__f32_to_s32) //{{{2
569	{
570	if constexpr (__x_to_x \|\| __y_to_y \|\| __z_to_z)
571	{
572	// go to fallback, it does the right thing
573	}
574	else
575	__assert_unreachable<_Tp>();
576	}
577	else if constexpr (__f32_to_u32) //{{{2
578	{
579	if constexpr (__have_avx512vl && __x_to_x)
580	return __auto_bitcast(_mm_cvttps_epu32(__intrin));
581	else if constexpr (__have_avx512f && __x_to_x)
582	return __auto_bitcast(
583	__lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
584	else if constexpr (__have_avx512vl && __y_to_y)
585	return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
586	else if constexpr (__have_avx512f && __y_to_y)
587	return __vector_bitcast<_Up>(
588	__lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
589	else if constexpr (__x_to_x \|\| __y_to_y \|\| __z_to_z)
590	{
591	// go to fallback, it does the right thing. We can't use the
592	// _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would
593	// discard small input values (only 24 mantissa bits)
594	}
595	else
596	__assert_unreachable<_Tp>();
597	}
598	else if constexpr (__f32_to_ibw) //{{{2
599	return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
600	else if constexpr (__f64_to_s64) //{{{2
601	{
602	if constexpr (__have_avx512dq_vl && __x_to_x)
603	return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
604	else if constexpr (__have_avx512dq_vl && __y_to_y)
605	return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
606	else if constexpr (__have_avx512dq && __z_to_z)
607	return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
608	// else use scalar fallback
609	}
610	else if constexpr (__f64_to_u64) //{{{2
611	{
612	if constexpr (__have_avx512dq_vl && __x_to_x)
613	return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
614	else if constexpr (__have_avx512dq_vl && __y_to_y)
615	return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
616	else if constexpr (__have_avx512dq && __z_to_z)
617	return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
618	// else use scalar fallback
619	}
620	else if constexpr (__f64_to_s32) //{{{2
621	{
622	if constexpr (__x_to_x)
623	return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
624	else if constexpr (__y_to_x)
625	return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
626	else if constexpr (__z_to_y)
627	return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
628	}
629	else if constexpr (__f64_to_u32) //{{{2
630	{
631	if constexpr (__have_avx512vl && __x_to_x)
632	return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
633	else if constexpr (__have_sse4_1 && __x_to_x)
634	return __vector_bitcast<_Up, _M>(
635	_mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
636	^ 0x8000'0000u;
637	else if constexpr (__x_to_x)
638	{
639	// use scalar fallback: it's only 2 values to convert, can't get
640	// much better than scalar decomposition
641	}
642	else if constexpr (__have_avx512vl && __y_to_x)
643	return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
644	else if constexpr (__y_to_x)
645	{
646	return __intrin_bitcast<_To>(
647	__vector_bitcast<_Up>(
648	_mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
649	^ 0x8000'0000u);
650	}
651	else if constexpr (__z_to_y)
652	return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
653	}
654	else if constexpr (__f64_to_ibw) //{{{2
655	{
656	return __convert_x86<_To>(
657	__convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v));
658	}
659	else if constexpr (__s64_to_f32) //{{{2
660	{
661	if constexpr (__x_to_x && __have_avx512dq_vl)
662	return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
663	else if constexpr (__y_to_x && __have_avx512dq_vl)
664	return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
665	else if constexpr (__z_to_y && __have_avx512dq)
666	return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
667	else if constexpr (__z_to_y)
668	return __intrin_bitcast<_To>(
669	_mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
670	}
671	else if constexpr (__u64_to_f32) //{{{2
672	{
673	if constexpr (__x_to_x && __have_avx512dq_vl)
674	return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
675	else if constexpr (__y_to_x && __have_avx512dq_vl)
676	return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
677	else if constexpr (__z_to_y && __have_avx512dq)
678	return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
679	else if constexpr (__z_to_y)
680	{
681	return __intrin_bitcast<_To>(
682	__lo256(_mm512_cvtepu32_ps(__auto_bitcast(
683	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
684	* 0x100000000LL
685	+ __lo256(_mm512_cvtepu32_ps(
686	__auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
687	}
688	}
689	else if constexpr (__s32_to_f32) //{{{2
690	{
691	// use fallback (builtin conversion)
692	}
693	else if constexpr (__u32_to_f32) //{{{2
694	{
695	if constexpr (__x_to_x && __have_avx512vl)
696	{
697	// use fallback
698	}
699	else if constexpr (__x_to_x && __have_avx512f)
700	return __intrin_bitcast<_To>(
701	__lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
702	else if constexpr (__x_to_x && (__have_fma \|\| __have_fma4))
703	// work around PR85819
704	return __auto_bitcast(0x10000
705	* _mm_cvtepi32_ps(__to_intrin(__v >> 16))
706	+ _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
707	else if constexpr (__y_to_y && __have_avx512vl)
708	{
709	// use fallback
710	}
711	else if constexpr (__y_to_y && __have_avx512f)
712	return __intrin_bitcast<_To>(
713	__lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
714	else if constexpr (__y_to_y)
715	// work around PR85819
716	return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
717	+ _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
718	// else use fallback (builtin conversion)
719	}
720	else if constexpr (__ibw_to_f32) //{{{2
721	{
722	if constexpr (_M <= 4 \|\| __have_avx2)
723	return __convert_x86<_To>(
724	__convert_x86<__vector_type_t<int, _M>>(__v));
725	else
726	{
727	static_assert(__x_to_y);
728	__m128i __a, __b;
729	if constexpr (__have_sse4_1)
730	{
731	__a = sizeof(_Tp) == 2
732	? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
733	: _mm_cvtepu16_epi32(__intrin))
734	: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
735	: _mm_cvtepu8_epi32(__intrin));
736	const auto __w
737	= _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9);
738	__b = sizeof(_Tp) == 2
739	? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
740	: _mm_cvtepu16_epi32(__w))
741	: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
742	: _mm_cvtepu8_epi32(__w));
743	}
744	else
745	{
746	__m128i __tmp;
747	if constexpr (sizeof(_Tp) == 1)
748	{
749	__tmp = is_signed_v<_Tp>
750	? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
751	__intrin),
752	8)
753	: _mm_unpacklo_epi8(__intrin, __m128i());
754	}
755	else
756	{
757	static_assert(sizeof(_Tp) == 2);
758	__tmp = __intrin;
759	}
760	__a = is_signed_v<_Tp>
761	? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
762	: _mm_unpacklo_epi16(__tmp, __m128i());
763	__b = is_signed_v<_Tp>
764	? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
765	: _mm_unpackhi_epi16(__tmp, __m128i());
766	}
767	return __convert_x86<_To>(__vector_bitcast<int>(__a),
768	__vector_bitcast<int>(__b));
769	}
770	}
771	else if constexpr (__s64_to_f64) //{{{2
772	{
773	if constexpr (__x_to_x && __have_avx512dq_vl)
774	return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
775	else if constexpr (__y_to_y && __have_avx512dq_vl)
776	return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
777	else if constexpr (__z_to_z && __have_avx512dq)
778	return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
779	else if constexpr (__z_to_z)
780	{
781	return __intrin_bitcast<_To>(
782	_mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
783	* 0x100000000LL
784	+ _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
785	}
786	}
787	else if constexpr (__u64_to_f64) //{{{2
788	{
789	if constexpr (__x_to_x && __have_avx512dq_vl)
790	return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
791	else if constexpr (__y_to_y && __have_avx512dq_vl)
792	return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
793	else if constexpr (__z_to_z && __have_avx512dq)
794	return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
795	else if constexpr (__z_to_z)
796	{
797	return __intrin_bitcast<_To>(
798	_mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
799	* 0x100000000LL
800	+ _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
801	}
802	}
803	else if constexpr (__s32_to_f64) //{{{2
804	{
805	if constexpr (__x_to_x)
806	return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
807	else if constexpr (__x_to_y)
808	return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
809	else if constexpr (__y_to_z)
810	return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
811	}
812	else if constexpr (__u32_to_f64) //{{{2
813	{
814	if constexpr (__x_to_x && __have_avx512vl)
815	return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
816	else if constexpr (__x_to_x && __have_avx512f)
817	return __intrin_bitcast<_To>(
818	__lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
819	else if constexpr (__x_to_x)
820	return __intrin_bitcast<_To>(
821	_mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
822	else if constexpr (__x_to_y && __have_avx512vl)
823	return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
824	else if constexpr (__x_to_y && __have_avx512f)
825	return __intrin_bitcast<_To>(
826	__lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
827	else if constexpr (__x_to_y)
828	return __intrin_bitcast<_To>(
829	_mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
830	else if constexpr (__y_to_z)
831	return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
832	}
833	else if constexpr (__ibw_to_f64) //{{{2
834	{
835	return __convert_x86<_To>(
836	__convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
837	}
838	else if constexpr (__f32_to_f64) //{{{2
839	{
840	if constexpr (__x_to_x)
841	return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
842	else if constexpr (__x_to_y)
843	return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
844	else if constexpr (__y_to_z)
845	return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
846	}
847	else if constexpr (__f64_to_f32) //{{{2
848	{
849	if constexpr (__x_to_x)
850	return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
851	else if constexpr (__y_to_x)
852	return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
853	else if constexpr (__z_to_y)
854	return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
855	}
856	else //{{{2
857	__assert_unreachable<_Tp>();
858
859	// fallback:{{{2
860	return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
861	//}}}
862	}
863
864	// }}}
865	// 2-arg __convert_x86 {{{1
866	template <typename _To, typename _V, typename _Traits>
867	_GLIBCXX_SIMD_INTRINSIC _To
868	__convert_x86(_V __v0, _V __v1)
869	{
870	static_assert(__is_vector_type_v<_V>);
871	using _Tp = typename _Traits::value_type;
872	constexpr size_t _Np = _Traits::_S_full_size;
873	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
874	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
875	using _Up = typename _VectorTraits<_To>::value_type;
876	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
877
878	static_assert(2 * _Np <= _M,
879	"__v1 would be discarded; use the one-argument "
880	"__convert_x86 overload instead");
881
882	// [xyz]_to_[xyz] {{{2
883	[[maybe_unused]] constexpr bool __x_to_x
884	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
885	[[maybe_unused]] constexpr bool __x_to_y
886	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
887	[[maybe_unused]] constexpr bool __x_to_z
888	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
889	[[maybe_unused]] constexpr bool __y_to_x
890	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
891	[[maybe_unused]] constexpr bool __y_to_y
892	= sizeof(__v0) == 32 && sizeof(_To) == 32;
893	[[maybe_unused]] constexpr bool __y_to_z
894	= sizeof(__v0) == 32 && sizeof(_To) == 64;
895	[[maybe_unused]] constexpr bool __z_to_x
896	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
897	[[maybe_unused]] constexpr bool __z_to_y
898	= sizeof(__v0) == 64 && sizeof(_To) == 32;
899	[[maybe_unused]] constexpr bool __z_to_z
900	= sizeof(__v0) == 64 && sizeof(_To) == 64;
901
902	// iX_to_iX {{{2
903	[[maybe_unused]] constexpr bool __i_to_i
904	= is_integral_v<_Up> && is_integral_v<_Tp>;
905	[[maybe_unused]] constexpr bool __i8_to_i16
906	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
907	[[maybe_unused]] constexpr bool __i8_to_i32
908	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
909	[[maybe_unused]] constexpr bool __i8_to_i64
910	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
911	[[maybe_unused]] constexpr bool __i16_to_i8
912	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
913	[[maybe_unused]] constexpr bool __i16_to_i32
914	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
915	[[maybe_unused]] constexpr bool __i16_to_i64
916	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
917	[[maybe_unused]] constexpr bool __i32_to_i8
918	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
919	[[maybe_unused]] constexpr bool __i32_to_i16
920	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
921	[[maybe_unused]] constexpr bool __i32_to_i64
922	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
923	[[maybe_unused]] constexpr bool __i64_to_i8
924	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
925	[[maybe_unused]] constexpr bool __i64_to_i16
926	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
927	[[maybe_unused]] constexpr bool __i64_to_i32
928	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
929
930	// [fsu]X_to_[fsu]X {{{2
931	// ibw = integral && byte or word, i.e. char and short with any signedness
932	[[maybe_unused]] constexpr bool __i64_to_f32
933	= is_integral_v<_Tp> && sizeof(_Tp) == 8
934	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
935	[[maybe_unused]] constexpr bool __s32_to_f32
936	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
937	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
938	[[maybe_unused]] constexpr bool __s16_to_f32
939	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
940	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
941	[[maybe_unused]] constexpr bool __s8_to_f32
942	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
943	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
944	[[maybe_unused]] constexpr bool __u32_to_f32
945	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
946	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
947	[[maybe_unused]] constexpr bool __u16_to_f32
948	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
949	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
950	[[maybe_unused]] constexpr bool __u8_to_f32
951	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
952	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
953	[[maybe_unused]] constexpr bool __s64_to_f64
954	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
955	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
956	[[maybe_unused]] constexpr bool __s32_to_f64
957	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
958	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
959	[[maybe_unused]] constexpr bool __s16_to_f64
960	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
961	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
962	[[maybe_unused]] constexpr bool __s8_to_f64
963	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
964	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
965	[[maybe_unused]] constexpr bool __u64_to_f64
966	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
967	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
968	[[maybe_unused]] constexpr bool __u32_to_f64
969	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
970	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
971	[[maybe_unused]] constexpr bool __u16_to_f64
972	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
973	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
974	[[maybe_unused]] constexpr bool __u8_to_f64
975	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
976	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
977	[[maybe_unused]] constexpr bool __f32_to_s64
978	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
979	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
980	[[maybe_unused]] constexpr bool __f32_to_s32
981	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
982	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
983	[[maybe_unused]] constexpr bool __f32_to_u64
984	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
985	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
986	[[maybe_unused]] constexpr bool __f32_to_u32
987	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
988	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
989	[[maybe_unused]] constexpr bool __f64_to_s64
990	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
991	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
992	[[maybe_unused]] constexpr bool __f64_to_s32
993	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
994	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
995	[[maybe_unused]] constexpr bool __f64_to_u64
996	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
997	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
998	[[maybe_unused]] constexpr bool __f64_to_u32
999	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1000	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1001	[[maybe_unused]] constexpr bool __f32_to_ibw
1002	= is_integral_v<_Up> && sizeof(_Up) <= 2
1003	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1004	[[maybe_unused]] constexpr bool __f64_to_ibw
1005	= is_integral_v<_Up> && sizeof(_Up) <= 2
1006	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1007	[[maybe_unused]] constexpr bool __f32_to_f64
1008	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
1009	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1010	[[maybe_unused]] constexpr bool __f64_to_f32
1011	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
1012	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1013
1014	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
1015	// <double, 4>, <double, 4> => <short, 8>
1016	return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1017	__hi128(__v1));
1018	else if constexpr (__i_to_i) // assert ISA {{{2
1019	{
1020	static_assert(__x_to_x \|\| __have_avx2,
1021	"integral conversions with ymm registers require AVX2");
1022	static_assert(__have_avx512bw
1023	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
1024	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
1025	"8/16-bit integers in zmm registers require AVX512BW");
1026	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
1027	"integral conversions with ymm registers require AVX2");
1028	}
1029	// concat => use 1-arg __convert_x86 {{{2
1030	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
1031	\|\| (sizeof(__v0) == 16 && __have_avx
1032	&& is_floating_point_v<_Tp>)
1033	\|\| (sizeof(__v0) == 32 && __have_avx512f
1034	&& (sizeof(_Tp) >= 4 \|\| __have_avx512bw)))
1035	{
1036	// The ISA can handle wider input registers, so concat and use one-arg
1037	// implementation. This reduces code duplication considerably.
1038	return __convert_x86<_To>(__concat(__v0, __v1));
1039	}
1040	else //{{{2
1041	{
1042	// conversion using bit reinterpretation (or no conversion at all)
1043	// should all go through the concat branch above:
1044	static_assert(
1045	!(is_floating_point_v<
1046	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1047	// handle all zero extension{{{2
1048	if constexpr (2 * _Np < _M && sizeof(_To) > 16)
1049	{
1050	constexpr size_t Min = 16 / sizeof(_Up);
1051	return __zero_extend(
1052	__convert_x86<
1053	__vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
1054	__v1));
1055	}
1056	else if constexpr (__i64_to_i32) //{{{2
1057	{
1058	if constexpr (__x_to_x)
1059	return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
1060	__auto_bitcast(__v1), 0x88));
1061	else if constexpr (__y_to_y)
1062	{
1063	// AVX512F is not available (would concat otherwise)
1064	return __auto_bitcast(
1065	__xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066	__auto_bitcast(__v1), 0x88)));
1067	// alternative:
1068	// const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8);
1069	// const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8);
1070	// const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx,
1071	// v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh,
1072	// 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh
1073	}
1074	else if constexpr (__z_to_z)
1075	return __intrin_bitcast<_To>(
1076	__concat(_mm512_cvtepi64_epi32(__i0),
1077	_mm512_cvtepi64_epi32(__i1)));
1078	}
1079	else if constexpr (__i64_to_i16) //{{{2
1080	{
1081	if constexpr (__x_to_x)
1082	{
1083	// AVX2 is not available (would concat otherwise)
1084	if constexpr (__have_sse4_1)
1085	{
1086	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1087	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1088	_mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
1089	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1090	}
1091	else
1092	{
1093	return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1094	_Up(__v1[0]), _Up(__v1[1])};
1095	}
1096	}
1097	else if constexpr (__y_to_x)
1098	{
1099	auto __a
1100	= _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. ....
1101	auto __b
1102	= _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. ....
1103	auto __c
1104	= _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
1105	return __intrin_bitcast<_To>(
1106	_mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567
1107	}
1108	else if constexpr (__z_to_y)
1109	return __intrin_bitcast<_To>(
1110	__concat(_mm512_cvtepi64_epi16(__i0),
1111	_mm512_cvtepi64_epi16(__i1)));
1112	}
1113	else if constexpr (__i64_to_i8) //{{{2
1114	{
1115	if constexpr (__x_to_x && __have_sse4_1)
1116	{
1117	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1118	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1119	_mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
1120	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1121	-0x80)));
1122	}
1123	else if constexpr (__x_to_x && __have_ssse3)
1124	{
1125	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
1126	_mm_shuffle_epi8(
1127	__i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1128	-0x80, -0x80, -0x80, -0x80, -0x80,
1129	-0x80, -0x80, -0x80, -0x80)),
1130	_mm_shuffle_epi8(
1131	__i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1132	-0x80, -0x80, -0x80, -0x80, -0x80,
1133	-0x80, -0x80, -0x80, -0x80))));
1134	}
1135	else if constexpr (__x_to_x)
1136	{
1137	return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1138	_Up(__v1[0]), _Up(__v1[1])};
1139	}
1140	else if constexpr (__y_to_x)
1141	{
1142	const auto __a = _mm256_shuffle_epi8(
1143	_mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
1144	_mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
1145	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1146	-0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
1147	-0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
1148	-0x80, -0x80, -0x80, -0x80));
1149	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
1150	} // __z_to_x uses concat fallback
1151	}
1152	else if constexpr (__i32_to_i16) //{{{2
1153	{
1154	if constexpr (__x_to_x)
1155	{
1156	// AVX2 is not available (would concat otherwise)
1157	if constexpr (__have_sse4_1)
1158	{
1159	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1160	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
1161	_mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
1162	11, 14, 15)));
1163	}
1164	else if constexpr (__have_ssse3)
1165	{
1166	return __intrin_bitcast<_To>(
1167	_mm_hadd_epi16(__to_intrin(__v0 << 16),
1168	__to_intrin(__v1 << 16)));
1169	/*
1170	return _mm_unpacklo_epi64(
1171	_mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9,
1172	12, 13, 8, 9, 12, 13, 12, 13, 14, 15)),
1173	_mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12,
1174	13, 8, 9, 12, 13, 12, 13, 14, 15)));
1175	*/
1176	}
1177	else
1178	{
1179	auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15..
1180	auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37..
1181	auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 ....
1182	auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 ....
1183	return __intrin_bitcast<_To>(
1184	_mm_unpacklo_epi16(__c, __d)); // 0123 4567
1185	}
1186	}
1187	else if constexpr (__y_to_y)
1188	{
1189	const auto __shuf
1190	= _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1191	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1192	0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1193	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194	auto __a = _mm256_shuffle_epi8(__i0, __shuf);
1195	auto __b = _mm256_shuffle_epi8(__i1, __shuf);
1196	return __intrin_bitcast<_To>(
1197	__xzyw(_mm256_unpacklo_epi64(__a, __b)));
1198	} // __z_to_z uses concat fallback
1199	}
1200	else if constexpr (__i32_to_i8) //{{{2
1201	{
1202	if constexpr (__x_to_x && __have_ssse3)
1203	{
1204	const auto shufmask
1205	= _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
1206	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1207	-0x80, -0x80);
1208	return __intrin_bitcast<_To>(
1209	_mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
1210	_mm_shuffle_epi8(__i1, shufmask)));
1211	}
1212	else if constexpr (__x_to_x)
1213	{
1214	auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. ....
1215	auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. ....
1216	auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... ....
1217	auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... ....
1218	auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... ....
1219	return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
1220	}
1221	else if constexpr (__y_to_x)
1222	{
1223	const auto __a = _mm256_shuffle_epi8(
1224	_mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
1225	_mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
1226	6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
1227	-0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
1228	-0x80, -0x80, -0x80, 2, 6, 10, 14));
1229	return __intrin_bitcast<_To>(__lo128(__a) \| __hi128(__a));
1230	} // __z_to_y uses concat fallback
1231	}
1232	else if constexpr (__i16_to_i8) //{{{2
1233	{
1234	if constexpr (__x_to_x && __have_ssse3)
1235	{
1236	const auto __shuf = reinterpret_cast<__m128i>(
1237	__vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
1238	0x80, 0x80, 0x80, 0x80, 0x80,
1239	0x80, 0x80});
1240	return __intrin_bitcast<_To>(
1241	_mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
1242	_mm_shuffle_epi8(__i1, __shuf)));
1243	}
1244	else if constexpr (__x_to_x)
1245	{
1246	auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B..
1247	auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F..
1248	auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D ....
1249	auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF ....
1250	auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... ....
1251	auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... ....
1252	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
1253	}
1254	else if constexpr (__y_to_y)
1255	{
1256	return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
1257	(__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258	\| _mm256_slli_epi16(__i1, 8),
1259	_mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
1260	13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
1261	7, 9, 11, 13, 15))));
1262	} // __z_to_z uses concat fallback
1263	}
1264	else if constexpr (__i64_to_f32) //{{{2
1265	{
1266	if constexpr (__x_to_x)
1267	return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
1268	else if constexpr (__y_to_y)
1269	{
1270	static_assert(__y_to_y && __have_avx2);
1271	const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG
1272	const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH
1273	const auto __lo32
1274	= _mm256_unpacklo_epi32(__a, __b); // abef cdgh
1275	const auto __hi32 = __vector_bitcast<
1276	conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1277	_mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH
1278	const auto __hi
1279	= 0x100000000LL
1280	* __convert_x86<__vector_type_t<float, 8>>(__hi32);
1281	const auto __mid
1282	= 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
1283	const auto __lo
1284	= _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285	return __xzyw((__hi + __mid) + __lo);
1286	}
1287	else if constexpr (__z_to_z && __have_avx512dq)
1288	{
1289	return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
1290	_mm512_cvtepi64_ps(__i1))
1291	: __concat(_mm512_cvtepu64_ps(__i0),
1292	_mm512_cvtepu64_ps(__i1));
1293	}
1294	else if constexpr (__z_to_z && is_signed_v<_Tp>)
1295	{
1296	const __m512 __hi32 = _mm512_cvtepi32_ps(
1297	__concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
1298	_mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
1299	const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
1300	_mm512_cvtepi64_epi32(__i1));
1301	// split low 32-bits, because if __hi32 is a small negative
1302	// number, the 24-bit mantissa may lose important information if
1303	// any of the high 8 bits of __lo32 is set, leading to
1304	// catastrophic cancelation in the FMA
1305	const __m512 __hi16
1306	= _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1307	const __m512 __lo16
1308	= _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309	return (__hi32 * 0x100000000LL + __hi16) + __lo16;
1310	}
1311	else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
1312	{
1313	return __intrin_bitcast<_To>(
1314	_mm512_cvtepu32_ps(__concat(
1315	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
1316	_mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
1317	* 0x100000000LL
1318	+ _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
1319	_mm512_cvtepi64_epi32(__i1))));
1320	}
1321	}
1322	else if constexpr (__f64_to_s32) //{{{2
1323	{
1324	// use concat fallback
1325	}
1326	else if constexpr (__f64_to_u32) //{{{2
1327	{
1328	if constexpr (__x_to_x && __have_sse4_1)
1329	{
1330	return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
1331	_mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
1332	_mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1333	^ 0x8000'0000u;
1334	// without SSE4.1 just use the scalar fallback, it's only four
1335	// values
1336	}
1337	else if constexpr (__y_to_y)
1338	{
1339	return __vector_bitcast<_Up>(
1340	__concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
1341	- 0x8000'0000u),
1342	_mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
1343	- 0x8000'0000u)))
1344	^ 0x8000'0000u;
1345	} // __z_to_z uses fallback
1346	}
1347	else if constexpr (__f64_to_ibw) //{{{2
1348	{
1349	// one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
1350	// would go via two independet conversions to _SimdWrapper<_To> and
1351	// subsequent interleaving. This is better, because f64->__i32
1352	// allows to combine __v0 and __v1 into one register: if constexpr
1353	// (__z_to_x \|\| __y_to_x) {
1354	return __convert_x86<_To>(
1355	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
1356	//}
1357	}
1358	else if constexpr (__f32_to_ibw) //{{{2
1359	{
1360	return __convert_x86<_To>(
1361	__convert_x86<__vector_type_t<int, _Np>>(__v0),
1362	__convert_x86<__vector_type_t<int, _Np>>(__v1));
1363	} //}}}
1364
1365	// fallback: {{{2
1366	if constexpr (sizeof(_To) >= 32)
1367	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1368	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
1369	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
1370	else if constexpr (sizeof(_To) == 16)
1371	{
1372	const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
1373	const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
1374	if constexpr (sizeof(_Up) * _Np == 8)
1375	{
1376	if constexpr (is_floating_point_v<_Up>)
1377	return __auto_bitcast(
1378	_mm_unpacklo_pd(__vector_bitcast<double>(__lo),
1379	__vector_bitcast<double>(__hi)));
1380	else
1381	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1382	}
1383	else if constexpr (sizeof(_Up) * _Np == 4)
1384	{
1385	if constexpr (is_floating_point_v<_Up>)
1386	return __auto_bitcast(
1387	_mm_unpacklo_ps(__vector_bitcast<float>(__lo),
1388	__vector_bitcast<float>(__hi)));
1389	else
1390	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1391	}
1392	else if constexpr (sizeof(_Up) * _Np == 2)
1393	return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
1394	else
1395	__assert_unreachable<_Tp>();
1396	}
1397	else
1398	return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
1399	//}}}
1400	}
1401	}
1402
1403	//}}}1
1404	// 4-arg __convert_x86 {{{1
1405	template <typename _To, typename _V, typename _Traits>
1406	_GLIBCXX_SIMD_INTRINSIC _To
1407	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
1408	{
1409	static_assert(__is_vector_type_v<_V>);
1410	using _Tp = typename _Traits::value_type;
1411	constexpr size_t _Np = _Traits::_S_full_size;
1412	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
1413	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
1414	[[maybe_unused]] const auto __i2 = __to_intrin(__v2);
1415	[[maybe_unused]] const auto __i3 = __to_intrin(__v3);
1416	using _Up = typename _VectorTraits<_To>::value_type;
1417	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1418
1419	static_assert(4 * _Np <= _M,
1420	"__v2/__v3 would be discarded; use the two/one-argument "
1421	"__convert_x86 overload instead");
1422
1423	// [xyz]_to_[xyz] {{{2
1424	[[maybe_unused]] constexpr bool __x_to_x
1425	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
1426	[[maybe_unused]] constexpr bool __x_to_y
1427	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
1428	[[maybe_unused]] constexpr bool __x_to_z
1429	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
1430	[[maybe_unused]] constexpr bool __y_to_x
1431	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
1432	[[maybe_unused]] constexpr bool __y_to_y
1433	= sizeof(__v0) == 32 && sizeof(_To) == 32;
1434	[[maybe_unused]] constexpr bool __y_to_z
1435	= sizeof(__v0) == 32 && sizeof(_To) == 64;
1436	[[maybe_unused]] constexpr bool __z_to_x
1437	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
1438	[[maybe_unused]] constexpr bool __z_to_y
1439	= sizeof(__v0) == 64 && sizeof(_To) == 32;
1440	[[maybe_unused]] constexpr bool __z_to_z
1441	= sizeof(__v0) == 64 && sizeof(_To) == 64;
1442
1443	// iX_to_iX {{{2
1444	[[maybe_unused]] constexpr bool __i_to_i
1445	= is_integral_v<_Up> && is_integral_v<_Tp>;
1446	[[maybe_unused]] constexpr bool __i8_to_i16
1447	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
1448	[[maybe_unused]] constexpr bool __i8_to_i32
1449	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
1450	[[maybe_unused]] constexpr bool __i8_to_i64
1451	= __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
1452	[[maybe_unused]] constexpr bool __i16_to_i8
1453	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
1454	[[maybe_unused]] constexpr bool __i16_to_i32
1455	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
1456	[[maybe_unused]] constexpr bool __i16_to_i64
1457	= __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
1458	[[maybe_unused]] constexpr bool __i32_to_i8
1459	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
1460	[[maybe_unused]] constexpr bool __i32_to_i16
1461	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
1462	[[maybe_unused]] constexpr bool __i32_to_i64
1463	= __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
1464	[[maybe_unused]] constexpr bool __i64_to_i8
1465	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
1466	[[maybe_unused]] constexpr bool __i64_to_i16
1467	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
1468	[[maybe_unused]] constexpr bool __i64_to_i32
1469	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
1470
1471	// [fsu]X_to_[fsu]X {{{2
1472	// ibw = integral && byte or word, i.e. char and short with any signedness
1473	[[maybe_unused]] constexpr bool __i64_to_f32
1474	= is_integral_v<_Tp> && sizeof(_Tp) == 8
1475	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1476	[[maybe_unused]] constexpr bool __s32_to_f32
1477	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
1478	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1479	[[maybe_unused]] constexpr bool __s16_to_f32
1480	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
1481	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1482	[[maybe_unused]] constexpr bool __s8_to_f32
1483	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
1484	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1485	[[maybe_unused]] constexpr bool __u32_to_f32
1486	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
1487	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1488	[[maybe_unused]] constexpr bool __u16_to_f32
1489	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
1490	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1491	[[maybe_unused]] constexpr bool __u8_to_f32
1492	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
1493	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1494	[[maybe_unused]] constexpr bool __s64_to_f64
1495	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
1496	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1497	[[maybe_unused]] constexpr bool __s32_to_f64
1498	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
1499	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1500	[[maybe_unused]] constexpr bool __s16_to_f64
1501	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
1502	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1503	[[maybe_unused]] constexpr bool __s8_to_f64
1504	= is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
1505	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1506	[[maybe_unused]] constexpr bool __u64_to_f64
1507	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
1508	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1509	[[maybe_unused]] constexpr bool __u32_to_f64
1510	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
1511	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1512	[[maybe_unused]] constexpr bool __u16_to_f64
1513	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
1514	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1515	[[maybe_unused]] constexpr bool __u8_to_f64
1516	= is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
1517	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1518	[[maybe_unused]] constexpr bool __f32_to_s64
1519	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
1520	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1521	[[maybe_unused]] constexpr bool __f32_to_s32
1522	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
1523	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1524	[[maybe_unused]] constexpr bool __f32_to_u64
1525	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
1526	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1527	[[maybe_unused]] constexpr bool __f32_to_u32
1528	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1529	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1530	[[maybe_unused]] constexpr bool __f64_to_s64
1531	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
1532	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1533	[[maybe_unused]] constexpr bool __f64_to_s32
1534	= is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
1535	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1536	[[maybe_unused]] constexpr bool __f64_to_u64
1537	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
1538	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1539	[[maybe_unused]] constexpr bool __f64_to_u32
1540	= is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1541	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1542	[[maybe_unused]] constexpr bool __f32_to_ibw
1543	= is_integral_v<_Up> && sizeof(_Up) <= 2
1544	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1545	[[maybe_unused]] constexpr bool __f64_to_ibw
1546	= is_integral_v<_Up> && sizeof(_Up) <= 2
1547	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1548	[[maybe_unused]] constexpr bool __f32_to_f64
1549	= is_floating_point_v<_Tp> && sizeof(_Tp) == 4
1550	&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
1551	[[maybe_unused]] constexpr bool __f64_to_f32
1552	= is_floating_point_v<_Tp> && sizeof(_Tp) == 8
1553	&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
1554
1555	if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
1556	{
1557	// <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16>
1558	return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1559	__hi128(__v1), __lo128(__v2), __hi128(__v2),
1560	__lo128(__v3), __hi128(__v3));
1561	}
1562	else if constexpr (__i_to_i) // assert ISA {{{2
1563	{
1564	static_assert(__x_to_x \|\| __have_avx2,
1565	"integral conversions with ymm registers require AVX2");
1566	static_assert(__have_avx512bw
1567	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
1568	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
1569	"8/16-bit integers in zmm registers require AVX512BW");
1570	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
1571	"integral conversions with ymm registers require AVX2");
1572	}
1573	// concat => use 2-arg __convert_x86 {{{2
1574	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
1575	\|\| (sizeof(__v0) == 16 && __have_avx
1576	&& is_floating_point_v<_Tp>)
1577	\|\| (sizeof(__v0) == 32 && __have_avx512f))
1578	{
1579	// The ISA can handle wider input registers, so concat and use two-arg
1580	// implementation. This reduces code duplication considerably.
1581	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3));
1582	}
1583	else //{{{2
1584	{
1585	// conversion using bit reinterpretation (or no conversion at all)
1586	// should all go through the concat branch above:
1587	static_assert(
1588	!(is_floating_point_v<
1589	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1590	// handle all zero extension{{{2
1591	if constexpr (4 * _Np < _M && sizeof(_To) > 16)
1592	{
1593	constexpr size_t Min = 16 / sizeof(_Up);
1594	return __zero_extend(
1595	__convert_x86<
1596	__vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>(
1597	__v0, __v1, __v2, __v3));
1598	}
1599	else if constexpr (__i64_to_i16) //{{{2
1600	{
1601	if constexpr (__x_to_x && __have_sse4_1)
1602	{
1603	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1604	_mm_blend_epi16(
1605	_mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22),
1606	_mm_blend_epi16(_mm_slli_si128(__i2, 4),
1607	_mm_slli_si128(__i3, 6), 0x88),
1608	0xcc),
1609	_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
1610	14, 15)));
1611	}
1612	else if constexpr (__y_to_y && __have_avx2)
1613	{
1614	return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1615	__xzyw(_mm256_blend_epi16(
1616	__auto_bitcast(
1617	_mm256_shuffle_ps(__vector_bitcast<float>(__v0),
1618	__vector_bitcast<float>(__v2),
1619	0x88)), // 0.1. 8.9. 2.3. A.B.
1620	__to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps(
1621	__vector_bitcast<float>(__v1),
1622	__vector_bitcast<float>(__v3), 0x88))
1623	<< 16), // .4.5 .C.D .6.7 .E.F
1624	0xaa) // 0415 8C9D 2637 AEBF
1625	), // 0415 2637 8C9D AEBF
1626	_mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
1627	14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
1628	10, 11, 14, 15)));
1629	/*
1630	auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26..
1631	.... auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15..
1632	.... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3); //
1633	8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2,
1634	__v3);
1635	// 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a,
1636	__b);
1637	// 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c,
1638	__d);
1639	// 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e,
1640	__f);
1641	// 0145 89CD 2367 ABEF return __concat(
1642	_mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
1643	_mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123
1644	4567 89AB CDEF
1645	*/
1646	} // else use fallback
1647	}
1648	else if constexpr (__i64_to_i8) //{{{2
1649	{
1650	if constexpr (__x_to_x)
1651	{
1652	// TODO: use fallback for now
1653	}
1654	else if constexpr (__y_to_x)
1655	{
1656	auto __a
1657	= _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24)
1658	\| _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16)
1659	\| _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8)
1660	\| _mm256_slli_epi32(
1661	__i3, 24); // 048C .... 159D .... 26AE .... 37BF ....
1662	/*return _mm_shuffle_epi8(
1663	_mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5),
1664	_mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15,
1665	3, 11));*/
1666	auto __b = _mm256_unpackhi_epi64(
1667	__a, __a); // 159D .... 159D .... 37BF .... 37BF ....
1668	auto __c = _mm256_unpacklo_epi8(
1669	__a, __b); // 0145 89CD .... .... 2367 ABEF .... ....
1670	return __intrin_bitcast<_To>(
1671	_mm_unpacklo_epi16(__lo128(__c),
1672	__hi128(__c))); // 0123 4567 89AB CDEF
1673	}
1674	}
1675	else if constexpr (__i32_to_i8) //{{{2
1676	{
1677	if constexpr (__x_to_x)
1678	{
1679	if constexpr (__have_ssse3)
1680	{
1681	const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff;
1682	const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff)
1683	<< 8;
1684	const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff)
1685	<< 16;
1686	const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24;
1687	return __intrin_bitcast<_To>(
1688	_mm_shuffle_epi8(__to_intrin(__x0 \| __x1 \| __x2 \| __x3),
1689	_mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
1690	2, 6, 10, 14, 3, 7, 11,
1691	15)));
1692	}
1693	else
1694	{
1695	auto __a
1696	= _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. ....
1697	auto __b
1698	= _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. ....
1699	auto __c
1700	= _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. ....
1701	auto __d
1702	= _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. ....
1703	auto __e
1704	= _mm_unpacklo_epi8(__a, __c); // 048C .... .... ....
1705	auto __f
1706	= _mm_unpackhi_epi8(__a, __c); // 159D .... .... ....
1707	auto __g
1708	= _mm_unpacklo_epi8(__b, __d); // 26AE .... .... ....
1709	auto __h
1710	= _mm_unpackhi_epi8(__b, __d); // 37BF .... .... ....
1711	return __intrin_bitcast<_To>(_mm_unpacklo_epi8(
1712	_mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... ....
1713	_mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... ....
1714	)); // 0123 4567 89AB CDEF
1715	}
1716	}
1717	else if constexpr (__y_to_y)
1718	{
1719	const auto __a = _mm256_shuffle_epi8(
1720	__to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
1721	__i0, _mm256_slli_epi32(__i1, 16), 0xAA))
1722	& 0xff)
1723	\| (__vector_bitcast<_UShort>(_mm256_blend_epi16(
1724	__i2, _mm256_slli_epi32(__i3, 16), 0xAA))
1725	<< 8)),
1726	_mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
1727	11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
1728	13, 3, 7, 11, 15));
1729	return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32(
1730	__a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
1731	}
1732	}
1733	else if constexpr (__i64_to_f32) //{{{2
1734	{
1735	// this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm
1736	// integers)
1737	if constexpr (__x_to_y)
1738	{
1739	return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1],
1740	__v2[0], __v2[1], __v3[0],
1741	__v3[1]);
1742
1743	const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC
1744	const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD
1745	const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG
1746	const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH
1747	const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd
1748	const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh
1749	const auto __hi32 = __vector_bitcast<
1750	conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1751	__concat(_mm_unpackhi_epi32(__a, __b),
1752	_mm_unpackhi_epi32(__c, __d))); // ABCD EFGH
1753	const auto __hi
1754	= 0x100000000LL
1755	* __convert_x86<__vector_type_t<float, 8>>(__hi32);
1756	const auto __mid
1757	= 0x10000
1758	* _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16),
1759	_mm_srli_epi32(__lo32b, 16)));
1760	const auto __lo = _mm256_cvtepi32_ps(
1761	__concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
1762	_mm_set1_epi32(0x0000ffffu) & __lo32b));
1763	return (__hi + __mid) + __lo;
1764	}
1765	}
1766	else if constexpr (__f64_to_ibw) //{{{2
1767	{
1768	return __convert_x86<_To>(
1769	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1770	__convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3));
1771	}
1772	else if constexpr (__f32_to_ibw) //{{{2
1773	{
1774	return __convert_x86<_To>(
1775	__convert_x86<__vector_type_t<int, _Np>>(__v0),
1776	__convert_x86<__vector_type_t<int, _Np>>(__v1),
1777	__convert_x86<__vector_type_t<int, _Np>>(__v2),
1778	__convert_x86<__vector_type_t<int, _Np>>(__v3));
1779	} //}}}
1780
1781	// fallback: {{{2
1782	if constexpr (sizeof(_To) >= 32)
1783	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1784	return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0,
1785	__v1),
1786	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v2,
1787	__v3));
1788	else if constexpr (sizeof(_To) == 16)
1789	{
1790	const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1));
1791	const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3));
1792	if constexpr (sizeof(_Up) * _Np * 2 == 8)
1793	{
1794	if constexpr (is_floating_point_v<_Up>)
1795	return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi));
1796	else
1797	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1798	}
1799	else if constexpr (sizeof(_Up) * _Np * 2 == 4)
1800	{
1801	if constexpr (is_floating_point_v<_Up>)
1802	return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi));
1803	else
1804	return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1805	}
1806	else
1807	__assert_unreachable<_Tp>();
1808	}
1809	else
1810	return __vector_convert<_To>(__v0, __v1, __v2, __v3,
1811	make_index_sequence<_Np>());
1812	//}}}2
1813	}
1814	}
1815
1816	//}}}
1817	// 8-arg __convert_x86 {{{1
1818	template <typename _To, typename _V, typename _Traits>
1819	_GLIBCXX_SIMD_INTRINSIC _To
1820	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
1821	_V __v7)
1822	{
1823	static_assert(__is_vector_type_v<_V>);
1824	using _Tp = typename _Traits::value_type;
1825	constexpr size_t _Np = _Traits::_S_full_size;
1826	[[maybe_unused]] const auto __i0 = __to_intrin(__v0);
1827	[[maybe_unused]] const auto __i1 = __to_intrin(__v1);
1828	[[maybe_unused]] const auto __i2 = __to_intrin(__v2);
1829	[[maybe_unused]] const auto __i3 = __to_intrin(__v3);
1830	[[maybe_unused]] const auto __i4 = __to_intrin(__v4);
1831	[[maybe_unused]] const auto __i5 = __to_intrin(__v5);
1832	[[maybe_unused]] const auto __i6 = __to_intrin(__v6);
1833	[[maybe_unused]] const auto __i7 = __to_intrin(__v7);
1834	using _Up = typename _VectorTraits<_To>::value_type;
1835	constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1836
1837	static_assert(8 * _Np <= _M,
1838	"__v4-__v7 would be discarded; use the four/two/one-argument "
1839	"__convert_x86 overload instead");
1840
1841	// [xyz]_to_[xyz] {{{2
1842	[[maybe_unused]] constexpr bool __x_to_x
1843	= sizeof(__v0) <= 16 && sizeof(_To) <= 16;
1844	[[maybe_unused]] constexpr bool __x_to_y
1845	= sizeof(__v0) <= 16 && sizeof(_To) == 32;
1846	[[maybe_unused]] constexpr bool __x_to_z
1847	= sizeof(__v0) <= 16 && sizeof(_To) == 64;
1848	[[maybe_unused]] constexpr bool __y_to_x
1849	= sizeof(__v0) == 32 && sizeof(_To) <= 16;
1850	[[maybe_unused]] constexpr bool __y_to_y
1851	= sizeof(__v0) == 32 && sizeof(_To) == 32;
1852	[[maybe_unused]] constexpr bool __y_to_z
1853	= sizeof(__v0) == 32 && sizeof(_To) == 64;
1854	[[maybe_unused]] constexpr bool __z_to_x
1855	= sizeof(__v0) == 64 && sizeof(_To) <= 16;
1856	[[maybe_unused]] constexpr bool __z_to_y
1857	= sizeof(__v0) == 64 && sizeof(_To) == 32;
1858	[[maybe_unused]] constexpr bool __z_to_z
1859	= sizeof(__v0) == 64 && sizeof(_To) == 64;
1860
1861	// [if]X_to_i8 {{{2
1862	[[maybe_unused]] constexpr bool __i_to_i
1863	= is_integral_v<_Up> && is_integral_v<_Tp>;
1864	[[maybe_unused]] constexpr bool __i64_to_i8
1865	= __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
1866	[[maybe_unused]] constexpr bool __f64_to_i8
1867	= is_integral_v<_Up> && sizeof(_Up) == 1
1868	&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1869
1870	if constexpr (__i_to_i) // assert ISA {{{2
1871	{
1872	static_assert(__x_to_x \|\| __have_avx2,
1873	"integral conversions with ymm registers require AVX2");
1874	static_assert(__have_avx512bw
1875	\|\| ((sizeof(_Tp) >= 4 \|\| sizeof(__v0) < 64)
1876	&& (sizeof(_Up) >= 4 \|\| sizeof(_To) < 64)),
1877	"8/16-bit integers in zmm registers require AVX512BW");
1878	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) \|\| __have_avx512f,
1879	"integral conversions with ymm registers require AVX2");
1880	}
1881	// concat => use 4-arg __convert_x86 {{{2
1882	if constexpr (sizeof(__v0) < 16 \|\| (sizeof(__v0) == 16 && __have_avx2)
1883	\|\| (sizeof(__v0) == 16 && __have_avx
1884	&& is_floating_point_v<_Tp>)
1885	\|\| (sizeof(__v0) == 32 && __have_avx512f))
1886	{
1887	// The ISA can handle wider input registers, so concat and use two-arg
1888	// implementation. This reduces code duplication considerably.
1889	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
1890	__concat(__v4, __v5), __concat(__v6, __v7));
1891	}
1892	else //{{{2
1893	{
1894	// conversion using bit reinterpretation (or no conversion at all)
1895	// should all go through the concat branch above:
1896	static_assert(
1897	!(is_floating_point_v<
1898	_Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1899	static_assert(!(8 * _Np < _M && sizeof(_To) > 16),
1900	"zero extension should be impossible");
1901	if constexpr (__i64_to_i8) //{{{2
1902	{
1903	if constexpr (__x_to_x && __have_ssse3)
1904	{
1905	// unsure whether this is better than the variant below
1906	return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1907	__to_intrin(
1908	(((__v0 & 0xff) \| ((__v1 & 0xff) << 8))
1909	\| (((__v2 & 0xff) << 16) \| ((__v3 & 0xff) << 24)))
1910	\| ((((__v4 & 0xff) << 32) \| ((__v5 & 0xff) << 40))
1911	\| (((__v6 & 0xff) << 48) \| (__v7 << 56)))),
1912	_mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
1913	7, 15)));
1914	}
1915	else if constexpr (__x_to_x)
1916	{
1917	const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac
1918	const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd
1919	const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg
1920	const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh
1921	const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik
1922	const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl
1923	const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo
1924	const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np
1925	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(
1926	_mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd
1927	_mm_unpacklo_epi8(__c, __d)), // efgh
1928	_mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl
1929	_mm_unpacklo_epi8(__g, __h)) // mnop
1930	));
1931	}
1932	else if constexpr (__y_to_y)
1933	{
1934	auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV
1935	__to_intrin(
1936	(((__v0 & 0xff) \| ((__v1 & 0xff) << 8))
1937	\| (((__v2 & 0xff) << 16) \| ((__v3 & 0xff) << 24)))
1938	\| ((((__v4 & 0xff) << 32) \| ((__v5 & 0xff) << 40))
1939	\| (((__v6 & 0xff) << 48) \| ((__v7 << 56)))));
1940	/*
1941	auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D
1942	HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a,
1943	__b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d =
1944	__xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return
1945	_mm256_shuffle_epi8(
1946	__d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
1947	13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
1948	14, 15));
1949	*/
1950	auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF
1951	// IJMN QRUV
1952	__a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
1953	6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
1954	4, 12, 5, 13, 6, 14, 7, 15));
1955	auto __c
1956	= __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
1957	return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1958	__c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
1959	6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
1960	4, 5, 12, 13, 6, 7, 14, 15)));
1961	}
1962	else if constexpr (__z_to_z)
1963	{
1964	return __concat(
1965	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2,
1966	__v3),
1967	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1968	__v7));
1969	}
1970	}
1971	else if constexpr (__f64_to_i8) //{{{2
1972	{
1973	return __convert_x86<_To>(
1974	__convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1975	__convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3),
1976	__convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5),
1977	__convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7));
1978	}
1979	else // unreachable {{{2
1980	__assert_unreachable<_Tp>();
1981	//}}}
1982
1983	// fallback: {{{2
1984	if constexpr (sizeof(_To) >= 32)
1985	// if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1986	return __concat(
1987	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3),
1988	__convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1989	__v7));
1990	else if constexpr (sizeof(_To) == 16)
1991	{
1992	const auto __lo
1993	= __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3));
1994	const auto __hi
1995	= __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7));
1996	static_assert(sizeof(_Up) == 1 && _Np == 2);
1997	return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1998	}
1999	else
2000	{
2001	__assert_unreachable<_Tp>();
2002	// return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5,
2003	// __v6, __v7,
2004	// make_index_sequence<_Np>());
2005	} //}}}2
2006	}
2007	}
2008
2009	//}}}
2010	// 16-arg __convert_x86 {{{1
2011	template <typename _To, typename _V, typename _Traits>
2012	_GLIBCXX_SIMD_INTRINSIC _To
2013	__convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
2014	_V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12,
2015	_V __v13, _V __v14, _V __v15)
2016	{
2017	// concat => use 8-arg __convert_x86
2018	return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
2019	__concat(__v4, __v5), __concat(__v6, __v7),
2020	__concat(__v8, __v9), __concat(__v10, __v11),
2021	__concat(__v12, __v13), __concat(__v14, __v15));
2022	}
2023
2024	//}}}
2025
2026	#endif // __cplusplus >= 201703L
2027	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
2028
2029	// vim: foldmethod=marker

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Daodan/MSYS2/mingw32/include/c++/11.2.0/experimental/bits/simd_x86_conversions.h@ 1181

Download in other formats: