source: Daodan/MSYS2/mingw32/include/c++/11.2.0/experimental/bits/simd_x86_conversions.h@ 1181

Last change on this file since 1181 was 1166, checked in by rossy, 3 years ago

Daodan: Replace MinGW build env with an up-to-date MSYS2 env

File size: 80.5 KB
Line 
1// x86 specific conversion optimizations -*- C++ -*-
2
3// Copyright (C) 2020-2021 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
27
28#if __cplusplus >= 201703L
29
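30// work around GCC PR85827 (presumably the -Wunused-but-set-variable false positive with constexpr-if; confirm against GCC Bugzilla)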
31// 1-arg __convert_x86 {{{1
32template <typename _To, typename _V, typename _Traits>
33 _GLIBCXX_SIMD_INTRINSIC _To
34 __convert_x86(_V __v)
35 {
36 static_assert(__is_vector_type_v<_V>);
37 using _Tp = typename _Traits::value_type;
38 constexpr size_t _Np = _Traits::_S_full_size;
39 [[maybe_unused]] const auto __intrin = __to_intrin(__v);
40 using _Up = typename _VectorTraits<_To>::value_type;
41 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
42
43 // [xyz]_to_[xyz] {{{2
44 [[maybe_unused]] constexpr bool __x_to_x
45 = sizeof(__v) <= 16 && sizeof(_To) <= 16;
46 [[maybe_unused]] constexpr bool __x_to_y
47 = sizeof(__v) <= 16 && sizeof(_To) == 32;
48 [[maybe_unused]] constexpr bool __x_to_z
49 = sizeof(__v) <= 16 && sizeof(_To) == 64;
50 [[maybe_unused]] constexpr bool __y_to_x
51 = sizeof(__v) == 32 && sizeof(_To) <= 16;
52 [[maybe_unused]] constexpr bool __y_to_y
53 = sizeof(__v) == 32 && sizeof(_To) == 32;
54 [[maybe_unused]] constexpr bool __y_to_z
55 = sizeof(__v) == 32 && sizeof(_To) == 64;
56 [[maybe_unused]] constexpr bool __z_to_x
57 = sizeof(__v) == 64 && sizeof(_To) <= 16;
58 [[maybe_unused]] constexpr bool __z_to_y
59 = sizeof(__v) == 64 && sizeof(_To) == 32;
60 [[maybe_unused]] constexpr bool __z_to_z
61 = sizeof(__v) == 64 && sizeof(_To) == 64;
62
63 // iX_to_iX {{{2
64 [[maybe_unused]] constexpr bool __i_to_i
65 = is_integral_v<_Up> && is_integral_v<_Tp>;
66 [[maybe_unused]] constexpr bool __i8_to_i16
67 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
68 [[maybe_unused]] constexpr bool __i8_to_i32
69 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
70 [[maybe_unused]] constexpr bool __i8_to_i64
71 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
72 [[maybe_unused]] constexpr bool __i16_to_i8
73 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
74 [[maybe_unused]] constexpr bool __i16_to_i32
75 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
76 [[maybe_unused]] constexpr bool __i16_to_i64
77 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
78 [[maybe_unused]] constexpr bool __i32_to_i8
79 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
80 [[maybe_unused]] constexpr bool __i32_to_i16
81 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
82 [[maybe_unused]] constexpr bool __i32_to_i64
83 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
84 [[maybe_unused]] constexpr bool __i64_to_i8
85 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
86 [[maybe_unused]] constexpr bool __i64_to_i16
87 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
88 [[maybe_unused]] constexpr bool __i64_to_i32
89 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
90
91 // [fsu]X_to_[fsu]X {{{2
92 // ibw = integral && byte or word, i.e. char and short with any signedness
93 [[maybe_unused]] constexpr bool __s64_to_f32
94 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
95 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
96 [[maybe_unused]] constexpr bool __s32_to_f32
97 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
98 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
99 [[maybe_unused]] constexpr bool __s16_to_f32
100 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
101 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
102 [[maybe_unused]] constexpr bool __s8_to_f32
103 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
104 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
105 [[maybe_unused]] constexpr bool __u64_to_f32
106 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
107 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
108 [[maybe_unused]] constexpr bool __u32_to_f32
109 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
110 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
111 [[maybe_unused]] constexpr bool __u16_to_f32
112 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
113 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
114 [[maybe_unused]] constexpr bool __u8_to_f32
115 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
116 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
117 [[maybe_unused]] constexpr bool __s64_to_f64
118 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
119 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
120 [[maybe_unused]] constexpr bool __s32_to_f64
121 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
122 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
123 [[maybe_unused]] constexpr bool __u64_to_f64
124 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
125 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
126 [[maybe_unused]] constexpr bool __u32_to_f64
127 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
128 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
129 [[maybe_unused]] constexpr bool __f32_to_s64
130 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
131 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
132 [[maybe_unused]] constexpr bool __f32_to_s32
133 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
134 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
135 [[maybe_unused]] constexpr bool __f32_to_u64
136 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
137 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
138 [[maybe_unused]] constexpr bool __f32_to_u32
139 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
140 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
141 [[maybe_unused]] constexpr bool __f64_to_s64
142 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
143 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
144 [[maybe_unused]] constexpr bool __f64_to_s32
145 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
146 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
147 [[maybe_unused]] constexpr bool __f64_to_u64
148 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
149 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
150 [[maybe_unused]] constexpr bool __f64_to_u32
151 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
152 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
153 [[maybe_unused]] constexpr bool __ibw_to_f32
154 = is_integral_v<_Tp> && sizeof(_Tp) <= 2
155 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
156 [[maybe_unused]] constexpr bool __ibw_to_f64
157 = is_integral_v<_Tp> && sizeof(_Tp) <= 2
158 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
159 [[maybe_unused]] constexpr bool __f32_to_ibw
160 = is_integral_v<_Up> && sizeof(_Up) <= 2
161 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
162 [[maybe_unused]] constexpr bool __f64_to_ibw
163 = is_integral_v<_Up> && sizeof(_Up) <= 2
164 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
165 [[maybe_unused]] constexpr bool __f32_to_f64
166 = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
167 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
168 [[maybe_unused]] constexpr bool __f64_to_f32
169 = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
170 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
171
172 if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
173 return __convert_x86<_To>(__lo128(__v), __hi128(__v));
174 else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2
175 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
176 __convert_x86<__vector_type_t<_Up, _M / 2>>(
177 __extract_part<1, _Np / _M * 2>(__v)));
178 else if constexpr (__i_to_i) //{{{2
179 {
180 static_assert(__x_to_x || __have_avx2,
181 "integral conversions with ymm registers require AVX2");
182 static_assert(__have_avx512bw
183 || ((sizeof(_Tp) >= 4 || sizeof(__v) < 64)
184 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
185 "8/16-bit integers in zmm registers require AVX512BW");
186 static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f,
187 "integral conversions with ymm registers require AVX2");
188 }
189 if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2
190 sizeof(_Tp) == sizeof(_Up))
191 {
192 // conversion uses simple bit reinterpretation (or no conversion at all)
193 if constexpr (_Np >= _M)
194 return __intrin_bitcast<_To>(__v);
195 else
196 return __zero_extend(__vector_bitcast<_Up>(__v));
197 }
198 else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2
199 // zero extend (eg. xmm -> ymm)
200 return __zero_extend(
201 __convert_x86<__vector_type_t<
202 _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
203 else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2
204 // partial input (eg. ymm -> xmm)
205 return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
206 else if constexpr (__i64_to_i32) //{{{2
207 {
208 if constexpr (__x_to_x && __have_avx512vl)
209 return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
210 else if constexpr (__x_to_x)
211 return __auto_bitcast(
212 _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
213 else if constexpr (__y_to_x && __have_avx512vl)
214 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
215 else if constexpr (__y_to_x && __have_avx512f)
216 return __intrin_bitcast<_To>(
217 __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
218 else if constexpr (__y_to_x)
219 return __intrin_bitcast<_To>(
220 __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
221 0 + 4 * 2)));
222 else if constexpr (__z_to_y)
223 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
224 }
225 else if constexpr (__i64_to_i16) //{{{2
226 {
227 if constexpr (__x_to_x && __have_avx512vl)
228 return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
229 else if constexpr (__x_to_x && __have_avx512f)
230 return __intrin_bitcast<_To>(
231 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
232 else if constexpr (__x_to_x && __have_ssse3)
233 {
234 return __intrin_bitcast<_To>(
235 _mm_shuffle_epi8(__intrin,
236 _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
237 -0x80, -0x80, -0x80, -0x80, -0x80,
238 -0x80, -0x80, -0x80, -0x80)));
239 // fallback without SSSE3
240 }
241 else if constexpr (__y_to_x && __have_avx512vl)
242 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
243 else if constexpr (__y_to_x && __have_avx512f)
244 return __intrin_bitcast<_To>(
245 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
246 else if constexpr (__y_to_x)
247 {
248 const auto __a = _mm256_shuffle_epi8(
249 __intrin,
250 _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
251 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
252 -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
253 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
254 -0x80));
255 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
256 }
257 else if constexpr (__z_to_x)
258 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
259 }
260 else if constexpr (__i64_to_i8) //{{{2
261 {
262 if constexpr (__x_to_x && __have_avx512vl)
263 return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
264 else if constexpr (__x_to_x && __have_avx512f)
265 return __intrin_bitcast<_To>(
266 __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
267 else if constexpr (__y_to_x && __have_avx512vl)
268 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
269 else if constexpr (__y_to_x && __have_avx512f)
270 return __intrin_bitcast<_To>(
271 _mm512_cvtepi64_epi8(__zero_extend(__intrin)));
272 else if constexpr (__z_to_x)
273 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
274 }
275 else if constexpr (__i32_to_i64) //{{{2
276 {
277 if constexpr (__have_sse4_1 && __x_to_x)
278 return __intrin_bitcast<_To>(is_signed_v<_Tp>
279 ? _mm_cvtepi32_epi64(__intrin)
280 : _mm_cvtepu32_epi64(__intrin));
281 else if constexpr (__x_to_x)
282 {
283 return __intrin_bitcast<_To>(
284 _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
285 ? _mm_srai_epi32(__intrin, 31)
286 : __m128i()));
287 }
288 else if constexpr (__x_to_y)
289 return __intrin_bitcast<_To>(is_signed_v<_Tp>
290 ? _mm256_cvtepi32_epi64(__intrin)
291 : _mm256_cvtepu32_epi64(__intrin));
292 else if constexpr (__y_to_z)
293 return __intrin_bitcast<_To>(is_signed_v<_Tp>
294 ? _mm512_cvtepi32_epi64(__intrin)
295 : _mm512_cvtepu32_epi64(__intrin));
296 }
297 else if constexpr (__i32_to_i16) //{{{2
298 {
299 if constexpr (__x_to_x && __have_avx512vl)
300 return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
301 else if constexpr (__x_to_x && __have_avx512f)
302 return __intrin_bitcast<_To>(
303 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
304 else if constexpr (__x_to_x && __have_ssse3)
305 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
306 __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
307 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
308 else if constexpr (__x_to_x)
309 {
310 auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o
311 auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o
312 auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo
313 auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo
314 return __intrin_bitcast<_To>(
315 _mm_unpacklo_epi16(__c, __d)); // 0123 oooo
316 }
317 else if constexpr (__y_to_x && __have_avx512vl)
318 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
319 else if constexpr (__y_to_x && __have_avx512f)
320 return __intrin_bitcast<_To>(
321 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
322 else if constexpr (__y_to_x)
323 {
324 auto __a = _mm256_shuffle_epi8(
325 __intrin,
326 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
327 -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
328 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
329 -0x80, -0x80, -0x80));
330 return __intrin_bitcast<_To>(__lo128(
331 _mm256_permute4x64_epi64(__a,
332 0xf8))); // __a[0] __a[2] | __a[3] __a[3]
333 }
334 else if constexpr (__z_to_y)
335 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
336 }
337 else if constexpr (__i32_to_i8) //{{{2
338 {
339 if constexpr (__x_to_x && __have_avx512vl)
340 return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
341 else if constexpr (__x_to_x && __have_avx512f)
342 return __intrin_bitcast<_To>(
343 __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
344 else if constexpr (__x_to_x && __have_ssse3)
345 {
346 return __intrin_bitcast<_To>(
347 _mm_shuffle_epi8(__intrin,
348 _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
349 -0x80, -0x80, -0x80, -0x80, -0x80,
350 -0x80, -0x80, -0x80, -0x80)));
351 }
352 else if constexpr (__x_to_x)
353 {
354 const auto __a
355 = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... ....
356 const auto __b
357 = _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... ....
358 const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... ....
359 const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... ....
360 const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... ....
361 return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
362 }
363 else if constexpr (__y_to_x && __have_avx512vl)
364 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
365 else if constexpr (__y_to_x && __have_avx512f)
366 return __intrin_bitcast<_To>(
367 _mm512_cvtepi32_epi8(__zero_extend(__intrin)));
368 else if constexpr (__z_to_x)
369 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
370 }
371 else if constexpr (__i16_to_i64) //{{{2
372 {
373 if constexpr (__x_to_x && __have_sse4_1)
374 return __intrin_bitcast<_To>(is_signed_v<_Tp>
375 ? _mm_cvtepi16_epi64(__intrin)
376 : _mm_cvtepu16_epi64(__intrin));
377 else if constexpr (__x_to_x && is_signed_v<_Tp>)
378 {
379 auto __x = _mm_srai_epi16(__intrin, 15);
380 auto __y = _mm_unpacklo_epi16(__intrin, __x);
381 __x = _mm_unpacklo_epi16(__x, __x);
382 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
383 }
384 else if constexpr (__x_to_x)
385 return __intrin_bitcast<_To>(
386 _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
387 __m128i()));
388 else if constexpr (__x_to_y)
389 return __intrin_bitcast<_To>(is_signed_v<_Tp>
390 ? _mm256_cvtepi16_epi64(__intrin)
391 : _mm256_cvtepu16_epi64(__intrin));
392 else if constexpr (__x_to_z)
393 return __intrin_bitcast<_To>(is_signed_v<_Tp>
394 ? _mm512_cvtepi16_epi64(__intrin)
395 : _mm512_cvtepu16_epi64(__intrin));
396 }
397 else if constexpr (__i16_to_i32) //{{{2
398 {
399 if constexpr (__x_to_x && __have_sse4_1)
400 return __intrin_bitcast<_To>(is_signed_v<_Tp>
401 ? _mm_cvtepi16_epi32(__intrin)
402 : _mm_cvtepu16_epi32(__intrin));
403 else if constexpr (__x_to_x && is_signed_v<_Tp>)
404 return __intrin_bitcast<_To>(
405 _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
406 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
407 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
408 else if constexpr (__x_to_y)
409 return __intrin_bitcast<_To>(is_signed_v<_Tp>
410 ? _mm256_cvtepi16_epi32(__intrin)
411 : _mm256_cvtepu16_epi32(__intrin));
412 else if constexpr (__y_to_z)
413 return __intrin_bitcast<_To>(is_signed_v<_Tp>
414 ? _mm512_cvtepi16_epi32(__intrin)
415 : _mm512_cvtepu16_epi32(__intrin));
416 }
417 else if constexpr (__i16_to_i8) //{{{2
418 {
419 if constexpr (__x_to_x && __have_avx512bw_vl)
420 return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
421 else if constexpr (__x_to_x && __have_avx512bw)
422 return __intrin_bitcast<_To>(
423 __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
424 else if constexpr (__x_to_x && __have_ssse3)
425 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
426 __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
427 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428 else if constexpr (__x_to_x)
429 {
430 auto __a
431 = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33..
432 auto __b
433 = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77..
434 auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 ....
435 auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 ....
436 auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... ....
437 auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... ....
438 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
439 }
440 else if constexpr (__y_to_x && __have_avx512bw_vl)
441 return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
442 else if constexpr (__y_to_x && __have_avx512bw)
443 return __intrin_bitcast<_To>(
444 __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
445 else if constexpr (__y_to_x)
446 {
447 auto __a = _mm256_shuffle_epi8(
448 __intrin,
449 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
450 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
451 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
452 4, 6, 8, 10, 12, 14));
453 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
454 }
455 else if constexpr (__z_to_y && __have_avx512bw)
456 return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
457 else if constexpr (__z_to_y)
458 __assert_unreachable<_Tp>();
459 }
460 else if constexpr (__i8_to_i64) //{{{2
461 {
462 if constexpr (__x_to_x && __have_sse4_1)
463 return __intrin_bitcast<_To>(is_signed_v<_Tp>
464 ? _mm_cvtepi8_epi64(__intrin)
465 : _mm_cvtepu8_epi64(__intrin));
466 else if constexpr (__x_to_x && is_signed_v<_Tp>)
467 {
468 if constexpr (__have_ssse3)
469 {
470 auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
471 auto __epi16 = _mm_srai_epi16(__dup, 8);
472 _mm_shuffle_epi8(__epi16,
473 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
474 3, 3, 3, 3, 3));
475 }
476 else
477 {
478 auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
479 __x = _mm_unpacklo_epi16(__x, __x);
480 return __intrin_bitcast<_To>(
481 _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
482 _mm_srai_epi32(__x, 31)));
483 }
484 }
485 else if constexpr (__x_to_x)
486 {
487 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
488 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
489 __m128i()),
490 __m128i()));
491 }
492 else if constexpr (__x_to_y)
493 return __intrin_bitcast<_To>(is_signed_v<_Tp>
494 ? _mm256_cvtepi8_epi64(__intrin)
495 : _mm256_cvtepu8_epi64(__intrin));
496 else if constexpr (__x_to_z)
497 return __intrin_bitcast<_To>(is_signed_v<_Tp>
498 ? _mm512_cvtepi8_epi64(__intrin)
499 : _mm512_cvtepu8_epi64(__intrin));
500 }
501 else if constexpr (__i8_to_i32) //{{{2
502 {
503 if constexpr (__x_to_x && __have_sse4_1)
504 return __intrin_bitcast<_To>(is_signed_v<_Tp>
505 ? _mm_cvtepi8_epi32(__intrin)
506 : _mm_cvtepu8_epi32(__intrin));
507 else if constexpr (__x_to_x && is_signed_v<_Tp>)
508 {
509 const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
510 return __intrin_bitcast<_To>(
511 _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
512 }
513 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
514 return __intrin_bitcast<_To>(
515 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
516 __m128i()));
517 else if constexpr (__x_to_y)
518 return __intrin_bitcast<_To>(is_signed_v<_Tp>
519 ? _mm256_cvtepi8_epi32(__intrin)
520 : _mm256_cvtepu8_epi32(__intrin));
521 else if constexpr (__x_to_z)
522 return __intrin_bitcast<_To>(is_signed_v<_Tp>
523 ? _mm512_cvtepi8_epi32(__intrin)
524 : _mm512_cvtepu8_epi32(__intrin));
525 }
526 else if constexpr (__i8_to_i16) //{{{2
527 {
528 if constexpr (__x_to_x && __have_sse4_1)
529 return __intrin_bitcast<_To>(is_signed_v<_Tp>
530 ? _mm_cvtepi8_epi16(__intrin)
531 : _mm_cvtepu8_epi16(__intrin));
532 else if constexpr (__x_to_x && is_signed_v<_Tp>)
533 return __intrin_bitcast<_To>(
534 _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
535 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
536 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
537 else if constexpr (__x_to_y)
538 return __intrin_bitcast<_To>(is_signed_v<_Tp>
539 ? _mm256_cvtepi8_epi16(__intrin)
540 : _mm256_cvtepu8_epi16(__intrin));
541 else if constexpr (__y_to_z && __have_avx512bw)
542 return __intrin_bitcast<_To>(is_signed_v<_Tp>
543 ? _mm512_cvtepi8_epi16(__intrin)
544 : _mm512_cvtepu8_epi16(__intrin));
545 else if constexpr (__y_to_z)
546 __assert_unreachable<_Tp>();
547 }
548 else if constexpr (__f32_to_s64) //{{{2
549 {
550 if constexpr (__have_avx512dq_vl && __x_to_x)
551 return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
552 else if constexpr (__have_avx512dq_vl && __x_to_y)
553 return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
554 else if constexpr (__have_avx512dq && __y_to_z)
555 return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
556 // else use scalar fallback
557 }
558 else if constexpr (__f32_to_u64) //{{{2
559 {
560 if constexpr (__have_avx512dq_vl && __x_to_x)
561 return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
562 else if constexpr (__have_avx512dq_vl && __x_to_y)
563 return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
564 else if constexpr (__have_avx512dq && __y_to_z)
565 return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
566 // else use scalar fallback
567 }
568 else if constexpr (__f32_to_s32) //{{{2
569 {
570 if constexpr (__x_to_x || __y_to_y || __z_to_z)
571 {
572 // go to fallback, it does the right thing
573 }
574 else
575 __assert_unreachable<_Tp>();
576 }
577 else if constexpr (__f32_to_u32) //{{{2
578 {
579 if constexpr (__have_avx512vl && __x_to_x)
580 return __auto_bitcast(_mm_cvttps_epu32(__intrin));
581 else if constexpr (__have_avx512f && __x_to_x)
582 return __auto_bitcast(
583 __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
584 else if constexpr (__have_avx512vl && __y_to_y)
585 return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
586 else if constexpr (__have_avx512f && __y_to_y)
587 return __vector_bitcast<_Up>(
588 __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
589 else if constexpr (__x_to_x || __y_to_y || __z_to_z)
590 {
591 // go to fallback, it does the right thing. We can't use the
592 // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would
593 // discard small input values (only 24 mantissa bits)
594 }
595 else
596 __assert_unreachable<_Tp>();
597 }
598 else if constexpr (__f32_to_ibw) //{{{2
599 return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
600 else if constexpr (__f64_to_s64) //{{{2
601 {
602 if constexpr (__have_avx512dq_vl && __x_to_x)
603 return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
604 else if constexpr (__have_avx512dq_vl && __y_to_y)
605 return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
606 else if constexpr (__have_avx512dq && __z_to_z)
607 return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
608 // else use scalar fallback
609 }
610 else if constexpr (__f64_to_u64) //{{{2
611 {
612 if constexpr (__have_avx512dq_vl && __x_to_x)
613 return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
614 else if constexpr (__have_avx512dq_vl && __y_to_y)
615 return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
616 else if constexpr (__have_avx512dq && __z_to_z)
617 return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
618 // else use scalar fallback
619 }
620 else if constexpr (__f64_to_s32) //{{{2
621 {
622 if constexpr (__x_to_x)
623 return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
624 else if constexpr (__y_to_x)
625 return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
626 else if constexpr (__z_to_y)
627 return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
628 }
629 else if constexpr (__f64_to_u32) //{{{2
630 {
631 if constexpr (__have_avx512vl && __x_to_x)
632 return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
633 else if constexpr (__have_sse4_1 && __x_to_x)
634 return __vector_bitcast<_Up, _M>(
635 _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
636 ^ 0x8000'0000u;
637 else if constexpr (__x_to_x)
638 {
639 // use scalar fallback: it's only 2 values to convert, can't get
640 // much better than scalar decomposition
641 }
642 else if constexpr (__have_avx512vl && __y_to_x)
643 return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
644 else if constexpr (__y_to_x)
645 {
646 return __intrin_bitcast<_To>(
647 __vector_bitcast<_Up>(
648 _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
649 ^ 0x8000'0000u);
650 }
651 else if constexpr (__z_to_y)
652 return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
653 }
654 else if constexpr (__f64_to_ibw) //{{{2
655 {
656 return __convert_x86<_To>(
657 __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v));
658 }
659 else if constexpr (__s64_to_f32) //{{{2
660 {
661 if constexpr (__x_to_x && __have_avx512dq_vl)
662 return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
663 else if constexpr (__y_to_x && __have_avx512dq_vl)
664 return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
665 else if constexpr (__z_to_y && __have_avx512dq)
666 return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
667 else if constexpr (__z_to_y)
668 return __intrin_bitcast<_To>(
669 _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
670 }
671 else if constexpr (__u64_to_f32) //{{{2
672 {
673 if constexpr (__x_to_x && __have_avx512dq_vl)
674 return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
675 else if constexpr (__y_to_x && __have_avx512dq_vl)
676 return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
677 else if constexpr (__z_to_y && __have_avx512dq)
678 return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
679 else if constexpr (__z_to_y)
680 {
681 return __intrin_bitcast<_To>(
682 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(
683 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
684 * 0x100000000LL
685 + __lo256(_mm512_cvtepu32_ps(
686 __auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
687 }
688 }
689 else if constexpr (__s32_to_f32) //{{{2
690 {
691 // use fallback (builtin conversion)
692 }
693 else if constexpr (__u32_to_f32) //{{{2
694 {
695 if constexpr (__x_to_x && __have_avx512vl)
696 {
697 // use fallback
698 }
699 else if constexpr (__x_to_x && __have_avx512f)
700 return __intrin_bitcast<_To>(
701 __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
702 else if constexpr (__x_to_x && (__have_fma || __have_fma4))
703 // work around PR85819
704 return __auto_bitcast(0x10000
705 * _mm_cvtepi32_ps(__to_intrin(__v >> 16))
706 + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
707 else if constexpr (__y_to_y && __have_avx512vl)
708 {
709 // use fallback
710 }
711 else if constexpr (__y_to_y && __have_avx512f)
712 return __intrin_bitcast<_To>(
713 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
714 else if constexpr (__y_to_y)
715 // work around PR85819
716 return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
717 + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
718 // else use fallback (builtin conversion)
719 }
720 else if constexpr (__ibw_to_f32) //{{{2
721 {
722 if constexpr (_M <= 4 || __have_avx2)
723 return __convert_x86<_To>(
724 __convert_x86<__vector_type_t<int, _M>>(__v));
725 else
726 {
727 static_assert(__x_to_y);
728 __m128i __a, __b;
729 if constexpr (__have_sse4_1)
730 {
731 __a = sizeof(_Tp) == 2
732 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
733 : _mm_cvtepu16_epi32(__intrin))
734 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
735 : _mm_cvtepu8_epi32(__intrin));
736 const auto __w
737 = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9);
738 __b = sizeof(_Tp) == 2
739 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
740 : _mm_cvtepu16_epi32(__w))
741 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
742 : _mm_cvtepu8_epi32(__w));
743 }
744 else
745 {
746 __m128i __tmp;
747 if constexpr (sizeof(_Tp) == 1)
748 {
749 __tmp = is_signed_v<_Tp>
750 ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
751 __intrin),
752 8)
753 : _mm_unpacklo_epi8(__intrin, __m128i());
754 }
755 else
756 {
757 static_assert(sizeof(_Tp) == 2);
758 __tmp = __intrin;
759 }
760 __a = is_signed_v<_Tp>
761 ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
762 : _mm_unpacklo_epi16(__tmp, __m128i());
763 __b = is_signed_v<_Tp>
764 ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
765 : _mm_unpackhi_epi16(__tmp, __m128i());
766 }
767 return __convert_x86<_To>(__vector_bitcast<int>(__a),
768 __vector_bitcast<int>(__b));
769 }
770 }
771 else if constexpr (__s64_to_f64) //{{{2
772 {
773 if constexpr (__x_to_x && __have_avx512dq_vl)
774 return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
775 else if constexpr (__y_to_y && __have_avx512dq_vl)
776 return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
777 else if constexpr (__z_to_z && __have_avx512dq)
778 return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
779 else if constexpr (__z_to_z)
780 {
781 return __intrin_bitcast<_To>(
782 _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
783 * 0x100000000LL
784 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
785 }
786 }
787 else if constexpr (__u64_to_f64) //{{{2
788 {
789 if constexpr (__x_to_x && __have_avx512dq_vl)
790 return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
791 else if constexpr (__y_to_y && __have_avx512dq_vl)
792 return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
793 else if constexpr (__z_to_z && __have_avx512dq)
794 return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
795 else if constexpr (__z_to_z)
796 {
797 return __intrin_bitcast<_To>(
798 _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
799 * 0x100000000LL
800 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
801 }
802 }
803 else if constexpr (__s32_to_f64) //{{{2
804 {
805 if constexpr (__x_to_x)
806 return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
807 else if constexpr (__x_to_y)
808 return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
809 else if constexpr (__y_to_z)
810 return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
811 }
812 else if constexpr (__u32_to_f64) //{{{2
813 {
814 if constexpr (__x_to_x && __have_avx512vl)
815 return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
816 else if constexpr (__x_to_x && __have_avx512f)
817 return __intrin_bitcast<_To>(
818 __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
819 else if constexpr (__x_to_x)
820 return __intrin_bitcast<_To>(
821 _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
822 else if constexpr (__x_to_y && __have_avx512vl)
823 return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
824 else if constexpr (__x_to_y && __have_avx512f)
825 return __intrin_bitcast<_To>(
826 __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
827 else if constexpr (__x_to_y)
828 return __intrin_bitcast<_To>(
829 _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
830 else if constexpr (__y_to_z)
831 return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
832 }
833 else if constexpr (__ibw_to_f64) //{{{2
834 {
835 return __convert_x86<_To>(
836 __convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
837 }
838 else if constexpr (__f32_to_f64) //{{{2
839 {
840 if constexpr (__x_to_x)
841 return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
842 else if constexpr (__x_to_y)
843 return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
844 else if constexpr (__y_to_z)
845 return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
846 }
847 else if constexpr (__f64_to_f32) //{{{2
848 {
849 if constexpr (__x_to_x)
850 return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
851 else if constexpr (__y_to_x)
852 return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
853 else if constexpr (__z_to_y)
854 return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
855 }
856 else //{{{2
857 __assert_unreachable<_Tp>();
858
859 // fallback:{{{2
860 return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
861 //}}}
862 }
863
864// }}}
865// 2-arg __convert_x86 {{{1
866template <typename _To, typename _V, typename _Traits>
867 _GLIBCXX_SIMD_INTRINSIC _To
868 __convert_x86(_V __v0, _V __v1)
869 {
870 static_assert(__is_vector_type_v<_V>);
871 using _Tp = typename _Traits::value_type;
872 constexpr size_t _Np = _Traits::_S_full_size;
873 [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
874 [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
875 using _Up = typename _VectorTraits<_To>::value_type;
876 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
877
878 static_assert(2 * _Np <= _M,
879 "__v1 would be discarded; use the one-argument "
880 "__convert_x86 overload instead");
881
882 // [xyz]_to_[xyz] {{{2
883 [[maybe_unused]] constexpr bool __x_to_x
884 = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
885 [[maybe_unused]] constexpr bool __x_to_y
886 = sizeof(__v0) <= 16 && sizeof(_To) == 32;
887 [[maybe_unused]] constexpr bool __x_to_z
888 = sizeof(__v0) <= 16 && sizeof(_To) == 64;
889 [[maybe_unused]] constexpr bool __y_to_x
890 = sizeof(__v0) == 32 && sizeof(_To) <= 16;
891 [[maybe_unused]] constexpr bool __y_to_y
892 = sizeof(__v0) == 32 && sizeof(_To) == 32;
893 [[maybe_unused]] constexpr bool __y_to_z
894 = sizeof(__v0) == 32 && sizeof(_To) == 64;
895 [[maybe_unused]] constexpr bool __z_to_x
896 = sizeof(__v0) == 64 && sizeof(_To) <= 16;
897 [[maybe_unused]] constexpr bool __z_to_y
898 = sizeof(__v0) == 64 && sizeof(_To) == 32;
899 [[maybe_unused]] constexpr bool __z_to_z
900 = sizeof(__v0) == 64 && sizeof(_To) == 64;
901
902 // iX_to_iX {{{2
903 [[maybe_unused]] constexpr bool __i_to_i
904 = is_integral_v<_Up> && is_integral_v<_Tp>;
905 [[maybe_unused]] constexpr bool __i8_to_i16
906 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
907 [[maybe_unused]] constexpr bool __i8_to_i32
908 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
909 [[maybe_unused]] constexpr bool __i8_to_i64
910 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
911 [[maybe_unused]] constexpr bool __i16_to_i8
912 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
913 [[maybe_unused]] constexpr bool __i16_to_i32
914 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
915 [[maybe_unused]] constexpr bool __i16_to_i64
916 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
917 [[maybe_unused]] constexpr bool __i32_to_i8
918 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
919 [[maybe_unused]] constexpr bool __i32_to_i16
920 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
921 [[maybe_unused]] constexpr bool __i32_to_i64
922 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
923 [[maybe_unused]] constexpr bool __i64_to_i8
924 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
925 [[maybe_unused]] constexpr bool __i64_to_i16
926 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
927 [[maybe_unused]] constexpr bool __i64_to_i32
928 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
929
930 // [fsu]X_to_[fsu]X {{{2
931 // ibw = integral && byte or word, i.e. char and short with any signedness
932 [[maybe_unused]] constexpr bool __i64_to_f32
933 = is_integral_v<_Tp> && sizeof(_Tp) == 8
934 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
935 [[maybe_unused]] constexpr bool __s32_to_f32
936 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
937 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
938 [[maybe_unused]] constexpr bool __s16_to_f32
939 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
940 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
941 [[maybe_unused]] constexpr bool __s8_to_f32
942 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
943 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
944 [[maybe_unused]] constexpr bool __u32_to_f32
945 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
946 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
947 [[maybe_unused]] constexpr bool __u16_to_f32
948 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
949 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
950 [[maybe_unused]] constexpr bool __u8_to_f32
951 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
952 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
953 [[maybe_unused]] constexpr bool __s64_to_f64
954 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
955 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
956 [[maybe_unused]] constexpr bool __s32_to_f64
957 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
958 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
959 [[maybe_unused]] constexpr bool __s16_to_f64
960 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
961 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
962 [[maybe_unused]] constexpr bool __s8_to_f64
963 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
964 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
965 [[maybe_unused]] constexpr bool __u64_to_f64
966 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
967 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
968 [[maybe_unused]] constexpr bool __u32_to_f64
969 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
970 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
971 [[maybe_unused]] constexpr bool __u16_to_f64
972 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
973 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
974 [[maybe_unused]] constexpr bool __u8_to_f64
975 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
976 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
977 [[maybe_unused]] constexpr bool __f32_to_s64
978 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
979 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
980 [[maybe_unused]] constexpr bool __f32_to_s32
981 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
982 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
983 [[maybe_unused]] constexpr bool __f32_to_u64
984 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
985 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
986 [[maybe_unused]] constexpr bool __f32_to_u32
987 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
988 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
989 [[maybe_unused]] constexpr bool __f64_to_s64
990 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
991 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
992 [[maybe_unused]] constexpr bool __f64_to_s32
993 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
994 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
995 [[maybe_unused]] constexpr bool __f64_to_u64
996 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
997 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
998 [[maybe_unused]] constexpr bool __f64_to_u32
999 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1000 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1001 [[maybe_unused]] constexpr bool __f32_to_ibw
1002 = is_integral_v<_Up> && sizeof(_Up) <= 2
1003 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1004 [[maybe_unused]] constexpr bool __f64_to_ibw
1005 = is_integral_v<_Up> && sizeof(_Up) <= 2
1006 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1007 [[maybe_unused]] constexpr bool __f32_to_f64
1008 = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
1009 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1010 [[maybe_unused]] constexpr bool __f64_to_f32
1011 = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
1012 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1013
1014 if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
1015 // <double, 4>, <double, 4> => <short, 8>
1016 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1017 __hi128(__v1));
1018 else if constexpr (__i_to_i) // assert ISA {{{2
1019 {
1020 static_assert(__x_to_x || __have_avx2,
1021 "integral conversions with ymm registers require AVX2");
1022 static_assert(__have_avx512bw
1023 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
1024 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
1025 "8/16-bit integers in zmm registers require AVX512BW");
1026 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
1027 "integral conversions with ymm registers require AVX2");
1028 }
1029 // concat => use 1-arg __convert_x86 {{{2
1030 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
1031 || (sizeof(__v0) == 16 && __have_avx
1032 && is_floating_point_v<_Tp>)
1033 || (sizeof(__v0) == 32 && __have_avx512f
1034 && (sizeof(_Tp) >= 4 || __have_avx512bw)))
1035 {
1036 // The ISA can handle wider input registers, so concat and use one-arg
1037 // implementation. This reduces code duplication considerably.
1038 return __convert_x86<_To>(__concat(__v0, __v1));
1039 }
1040 else //{{{2
1041 {
1042 // conversion using bit reinterpretation (or no conversion at all)
1043 // should all go through the concat branch above:
1044 static_assert(
1045 !(is_floating_point_v<
1046 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1047 // handle all zero extension{{{2
1048 if constexpr (2 * _Np < _M && sizeof(_To) > 16)
1049 {
1050 constexpr size_t Min = 16 / sizeof(_Up);
1051 return __zero_extend(
1052 __convert_x86<
1053 __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
1054 __v1));
1055 }
1056 else if constexpr (__i64_to_i32) //{{{2
1057 {
1058 if constexpr (__x_to_x)
1059 return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
1060 __auto_bitcast(__v1), 0x88));
1061 else if constexpr (__y_to_y)
1062 {
1063 // AVX512F is not available (would concat otherwise)
1064 return __auto_bitcast(
1065 __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066 __auto_bitcast(__v1), 0x88)));
1067 // alternative:
1068 // const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8);
1069 // const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8);
1070 // const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx,
1071 // v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh,
1072 // 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh
1073 }
1074 else if constexpr (__z_to_z)
1075 return __intrin_bitcast<_To>(
1076 __concat(_mm512_cvtepi64_epi32(__i0),
1077 _mm512_cvtepi64_epi32(__i1)));
1078 }
1079 else if constexpr (__i64_to_i16) //{{{2
1080 {
1081 if constexpr (__x_to_x)
1082 {
1083 // AVX2 is not available (would concat otherwise)
1084 if constexpr (__have_sse4_1)
1085 {
1086 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1087 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1088 _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
1089 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1090 }
1091 else
1092 {
1093 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1094 _Up(__v1[0]), _Up(__v1[1])};
1095 }
1096 }
1097 else if constexpr (__y_to_x)
1098 {
1099 auto __a
1100 = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. ....
1101 auto __b
1102 = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. ....
1103 auto __c
1104 = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
1105 return __intrin_bitcast<_To>(
1106 _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567
1107 }
1108 else if constexpr (__z_to_y)
1109 return __intrin_bitcast<_To>(
1110 __concat(_mm512_cvtepi64_epi16(__i0),
1111 _mm512_cvtepi64_epi16(__i1)));
1112 }
1113 else if constexpr (__i64_to_i8) //{{{2
1114 {
1115 if constexpr (__x_to_x && __have_sse4_1)
1116 {
1117 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1118 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1119 _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
1120 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1121 -0x80)));
1122 }
1123 else if constexpr (__x_to_x && __have_ssse3)
1124 {
1125 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
1126 _mm_shuffle_epi8(
1127 __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1128 -0x80, -0x80, -0x80, -0x80, -0x80,
1129 -0x80, -0x80, -0x80, -0x80)),
1130 _mm_shuffle_epi8(
1131 __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1132 -0x80, -0x80, -0x80, -0x80, -0x80,
1133 -0x80, -0x80, -0x80, -0x80))));
1134 }
1135 else if constexpr (__x_to_x)
1136 {
1137 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1138 _Up(__v1[0]), _Up(__v1[1])};
1139 }
1140 else if constexpr (__y_to_x)
1141 {
1142 const auto __a = _mm256_shuffle_epi8(
1143 _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
1144 _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
1145 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1146 -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
1147 -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
1148 -0x80, -0x80, -0x80, -0x80));
1149 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1150 } // __z_to_x uses concat fallback
1151 }
1152 else if constexpr (__i32_to_i16) //{{{2
1153 {
1154 if constexpr (__x_to_x)
1155 {
1156 // AVX2 is not available (would concat otherwise)
1157 if constexpr (__have_sse4_1)
1158 {
1159 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1160 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
1161 _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
1162 11, 14, 15)));
1163 }
1164 else if constexpr (__have_ssse3)
1165 {
1166 return __intrin_bitcast<_To>(
1167 _mm_hadd_epi16(__to_intrin(__v0 << 16),
1168 __to_intrin(__v1 << 16)));
1169 /*
1170 return _mm_unpacklo_epi64(
1171 _mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9,
1172 12, 13, 8, 9, 12, 13, 12, 13, 14, 15)),
1173 _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12,
1174 13, 8, 9, 12, 13, 12, 13, 14, 15)));
1175 */
1176 }
1177 else
1178 {
1179 auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15..
1180 auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37..
1181 auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 ....
1182 auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 ....
1183 return __intrin_bitcast<_To>(
1184 _mm_unpacklo_epi16(__c, __d)); // 0123 4567
1185 }
1186 }
1187 else if constexpr (__y_to_y)
1188 {
1189 const auto __shuf
1190 = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1191 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1192 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1193 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194 auto __a = _mm256_shuffle_epi8(__i0, __shuf);
1195 auto __b = _mm256_shuffle_epi8(__i1, __shuf);
1196 return __intrin_bitcast<_To>(
1197 __xzyw(_mm256_unpacklo_epi64(__a, __b)));
1198 } // __z_to_z uses concat fallback
1199 }
1200 else if constexpr (__i32_to_i8) //{{{2
1201 {
1202 if constexpr (__x_to_x && __have_ssse3)
1203 {
1204 const auto shufmask
1205 = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
1206 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1207 -0x80, -0x80);
1208 return __intrin_bitcast<_To>(
1209 _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
1210 _mm_shuffle_epi8(__i1, shufmask)));
1211 }
1212 else if constexpr (__x_to_x)
1213 {
1214 auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. ....
1215 auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. ....
1216 auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... ....
1217 auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... ....
1218 auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... ....
1219 return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
1220 }
1221 else if constexpr (__y_to_x)
1222 {
1223 const auto __a = _mm256_shuffle_epi8(
1224 _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
1225 _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
1226 6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
1227 -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
1228 -0x80, -0x80, -0x80, 2, 6, 10, 14));
1229 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1230 } // __z_to_y uses concat fallback
1231 }
1232 else if constexpr (__i16_to_i8) //{{{2
1233 {
1234 if constexpr (__x_to_x && __have_ssse3)
1235 {
1236 const auto __shuf = reinterpret_cast<__m128i>(
1237 __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
1238 0x80, 0x80, 0x80, 0x80, 0x80,
1239 0x80, 0x80});
1240 return __intrin_bitcast<_To>(
1241 _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
1242 _mm_shuffle_epi8(__i1, __shuf)));
1243 }
1244 else if constexpr (__x_to_x)
1245 {
1246 auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B..
1247 auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F..
1248 auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D ....
1249 auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF ....
1250 auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... ....
1251 auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... ....
1252 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
1253 }
1254 else if constexpr (__y_to_y)
1255 {
1256 return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
1257 (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258 | _mm256_slli_epi16(__i1, 8),
1259 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
1260 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
1261 7, 9, 11, 13, 15))));
1262 } // __z_to_z uses concat fallback
1263 }
1264 else if constexpr (__i64_to_f32) //{{{2
1265 {
1266 if constexpr (__x_to_x)
1267 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
1268 else if constexpr (__y_to_y)
1269 {
1270 static_assert(__y_to_y && __have_avx2);
1271 const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG
1272 const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH
1273 const auto __lo32
1274 = _mm256_unpacklo_epi32(__a, __b); // abef cdgh
1275 const auto __hi32 = __vector_bitcast<
1276 conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1277 _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH
1278 const auto __hi
1279 = 0x100000000LL
1280 * __convert_x86<__vector_type_t<float, 8>>(__hi32);
1281 const auto __mid
1282 = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
1283 const auto __lo
1284 = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285 return __xzyw((__hi + __mid) + __lo);
1286 }
1287 else if constexpr (__z_to_z && __have_avx512dq)
1288 {
1289 return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
1290 _mm512_cvtepi64_ps(__i1))
1291 : __concat(_mm512_cvtepu64_ps(__i0),
1292 _mm512_cvtepu64_ps(__i1));
1293 }
1294 else if constexpr (__z_to_z && is_signed_v<_Tp>)
1295 {
1296 const __m512 __hi32 = _mm512_cvtepi32_ps(
1297 __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
1298 _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
1299 const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
1300 _mm512_cvtepi64_epi32(__i1));
1301 // split low 32-bits, because if __hi32 is a small negative
1302 // number, the 24-bit mantissa may lose important information if
1303 // any of the high 8 bits of __lo32 is set, leading to
1304 // catastrophic cancelation in the FMA
1305 const __m512 __hi16
1306 = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1307 const __m512 __lo16
1308 = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309 return (__hi32 * 0x100000000LL + __hi16) + __lo16;
1310 }
1311 else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
1312 {
1313 return __intrin_bitcast<_To>(
1314 _mm512_cvtepu32_ps(__concat(
1315 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
1316 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
1317 * 0x100000000LL
1318 + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
1319 _mm512_cvtepi64_epi32(__i1))));
1320 }
1321 }
1322 else if constexpr (__f64_to_s32) //{{{2
1323 {
1324 // use concat fallback
1325 }
1326 else if constexpr (__f64_to_u32) //{{{2
1327 {
1328 if constexpr (__x_to_x && __have_sse4_1)
1329 {
1330 return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
1331 _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
1332 _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1333 ^ 0x8000'0000u;
1334 // without SSE4.1 just use the scalar fallback, it's only four
1335 // values
1336 }
1337 else if constexpr (__y_to_y)
1338 {
1339 return __vector_bitcast<_Up>(
1340 __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
1341 - 0x8000'0000u),
1342 _mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
1343 - 0x8000'0000u)))
1344 ^ 0x8000'0000u;
1345 } // __z_to_z uses fallback
1346 }
1347 else if constexpr (__f64_to_ibw) //{{{2
1348 {
1349 // one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
1350 // would go via two independet conversions to _SimdWrapper<_To> and
1351 // subsequent interleaving. This is better, because f64->__i32
1352 // allows to combine __v0 and __v1 into one register: if constexpr
1353 // (__z_to_x || __y_to_x) {
1354 return __convert_x86<_To>(
1355 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
1356 //}
1357 }
1358 else if constexpr (__f32_to_ibw) //{{{2
1359 {
1360 return __convert_x86<_To>(
1361 __convert_x86<__vector_type_t<int, _Np>>(__v0),
1362 __convert_x86<__vector_type_t<int, _Np>>(__v1));
1363 } //}}}
1364
1365 // fallback: {{{2
1366 if constexpr (sizeof(_To) >= 32)
1367 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1368 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
1369 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
1370 else if constexpr (sizeof(_To) == 16)
1371 {
1372 const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
1373 const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
1374 if constexpr (sizeof(_Up) * _Np == 8)
1375 {
1376 if constexpr (is_floating_point_v<_Up>)
1377 return __auto_bitcast(
1378 _mm_unpacklo_pd(__vector_bitcast<double>(__lo),
1379 __vector_bitcast<double>(__hi)));
1380 else
1381 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1382 }
1383 else if constexpr (sizeof(_Up) * _Np == 4)
1384 {
1385 if constexpr (is_floating_point_v<_Up>)
1386 return __auto_bitcast(
1387 _mm_unpacklo_ps(__vector_bitcast<float>(__lo),
1388 __vector_bitcast<float>(__hi)));
1389 else
1390 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1391 }
1392 else if constexpr (sizeof(_Up) * _Np == 2)
1393 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
1394 else
1395 __assert_unreachable<_Tp>();
1396 }
1397 else
1398 return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
1399 //}}}
1400 }
1401 }
1402
1403//}}}1
1404// 4-arg __convert_x86 {{{1
1405template <typename _To, typename _V, typename _Traits>
1406 _GLIBCXX_SIMD_INTRINSIC _To
1407 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
1408 {
1409 static_assert(__is_vector_type_v<_V>);
1410 using _Tp = typename _Traits::value_type;
1411 constexpr size_t _Np = _Traits::_S_full_size;
1412 [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
1413 [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
1414 [[maybe_unused]] const auto __i2 = __to_intrin(__v2);
1415 [[maybe_unused]] const auto __i3 = __to_intrin(__v3);
1416 using _Up = typename _VectorTraits<_To>::value_type;
1417 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1418
1419 static_assert(4 * _Np <= _M,
1420 "__v2/__v3 would be discarded; use the two/one-argument "
1421 "__convert_x86 overload instead");
1422
1423 // [xyz]_to_[xyz] {{{2
1424 [[maybe_unused]] constexpr bool __x_to_x
1425 = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
1426 [[maybe_unused]] constexpr bool __x_to_y
1427 = sizeof(__v0) <= 16 && sizeof(_To) == 32;
1428 [[maybe_unused]] constexpr bool __x_to_z
1429 = sizeof(__v0) <= 16 && sizeof(_To) == 64;
1430 [[maybe_unused]] constexpr bool __y_to_x
1431 = sizeof(__v0) == 32 && sizeof(_To) <= 16;
1432 [[maybe_unused]] constexpr bool __y_to_y
1433 = sizeof(__v0) == 32 && sizeof(_To) == 32;
1434 [[maybe_unused]] constexpr bool __y_to_z
1435 = sizeof(__v0) == 32 && sizeof(_To) == 64;
1436 [[maybe_unused]] constexpr bool __z_to_x
1437 = sizeof(__v0) == 64 && sizeof(_To) <= 16;
1438 [[maybe_unused]] constexpr bool __z_to_y
1439 = sizeof(__v0) == 64 && sizeof(_To) == 32;
1440 [[maybe_unused]] constexpr bool __z_to_z
1441 = sizeof(__v0) == 64 && sizeof(_To) == 64;
1442
1443 // iX_to_iX {{{2
1444 [[maybe_unused]] constexpr bool __i_to_i
1445 = is_integral_v<_Up> && is_integral_v<_Tp>;
1446 [[maybe_unused]] constexpr bool __i8_to_i16
1447 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
1448 [[maybe_unused]] constexpr bool __i8_to_i32
1449 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
1450 [[maybe_unused]] constexpr bool __i8_to_i64
1451 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
1452 [[maybe_unused]] constexpr bool __i16_to_i8
1453 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
1454 [[maybe_unused]] constexpr bool __i16_to_i32
1455 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
1456 [[maybe_unused]] constexpr bool __i16_to_i64
1457 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
1458 [[maybe_unused]] constexpr bool __i32_to_i8
1459 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
1460 [[maybe_unused]] constexpr bool __i32_to_i16
1461 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
1462 [[maybe_unused]] constexpr bool __i32_to_i64
1463 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
1464 [[maybe_unused]] constexpr bool __i64_to_i8
1465 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
1466 [[maybe_unused]] constexpr bool __i64_to_i16
1467 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
1468 [[maybe_unused]] constexpr bool __i64_to_i32
1469 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
1470
1471 // [fsu]X_to_[fsu]X {{{2
1472 // ibw = integral && byte or word, i.e. char and short with any signedness
1473 [[maybe_unused]] constexpr bool __i64_to_f32
1474 = is_integral_v<_Tp> && sizeof(_Tp) == 8
1475 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1476 [[maybe_unused]] constexpr bool __s32_to_f32
1477 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
1478 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1479 [[maybe_unused]] constexpr bool __s16_to_f32
1480 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
1481 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1482 [[maybe_unused]] constexpr bool __s8_to_f32
1483 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
1484 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1485 [[maybe_unused]] constexpr bool __u32_to_f32
1486 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
1487 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1488 [[maybe_unused]] constexpr bool __u16_to_f32
1489 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
1490 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1491 [[maybe_unused]] constexpr bool __u8_to_f32
1492 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
1493 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1494 [[maybe_unused]] constexpr bool __s64_to_f64
1495 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
1496 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1497 [[maybe_unused]] constexpr bool __s32_to_f64
1498 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
1499 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1500 [[maybe_unused]] constexpr bool __s16_to_f64
1501 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
1502 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1503 [[maybe_unused]] constexpr bool __s8_to_f64
1504 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
1505 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1506 [[maybe_unused]] constexpr bool __u64_to_f64
1507 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
1508 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1509 [[maybe_unused]] constexpr bool __u32_to_f64
1510 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
1511 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1512 [[maybe_unused]] constexpr bool __u16_to_f64
1513 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
1514 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1515 [[maybe_unused]] constexpr bool __u8_to_f64
1516 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
1517 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1518 [[maybe_unused]] constexpr bool __f32_to_s64
1519 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
1520 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1521 [[maybe_unused]] constexpr bool __f32_to_s32
1522 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
1523 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1524 [[maybe_unused]] constexpr bool __f32_to_u64
1525 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
1526 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1527 [[maybe_unused]] constexpr bool __f32_to_u32
1528 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1529 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1530 [[maybe_unused]] constexpr bool __f64_to_s64
1531 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
1532 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1533 [[maybe_unused]] constexpr bool __f64_to_s32
1534 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
1535 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1536 [[maybe_unused]] constexpr bool __f64_to_u64
1537 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
1538 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1539 [[maybe_unused]] constexpr bool __f64_to_u32
1540 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
1541 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1542 [[maybe_unused]] constexpr bool __f32_to_ibw
1543 = is_integral_v<_Up> && sizeof(_Up) <= 2
1544 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
1545 [[maybe_unused]] constexpr bool __f64_to_ibw
1546 = is_integral_v<_Up> && sizeof(_Up) <= 2
1547 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1548 [[maybe_unused]] constexpr bool __f32_to_f64
1549 = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
1550 && is_floating_point_v<_Up> && sizeof(_Up) == 8;
1551 [[maybe_unused]] constexpr bool __f64_to_f32
1552 = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
1553 && is_floating_point_v<_Up> && sizeof(_Up) == 4;
1554
1555 if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
1556 {
1557 // <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16>
1558 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1559 __hi128(__v1), __lo128(__v2), __hi128(__v2),
1560 __lo128(__v3), __hi128(__v3));
1561 }
1562 else if constexpr (__i_to_i) // assert ISA {{{2
1563 {
1564 static_assert(__x_to_x || __have_avx2,
1565 "integral conversions with ymm registers require AVX2");
1566 static_assert(__have_avx512bw
1567 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
1568 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
1569 "8/16-bit integers in zmm registers require AVX512BW");
1570 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
1571 "integral conversions with ymm registers require AVX2");
1572 }
1573 // concat => use 2-arg __convert_x86 {{{2
1574 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
1575 || (sizeof(__v0) == 16 && __have_avx
1576 && is_floating_point_v<_Tp>)
1577 || (sizeof(__v0) == 32 && __have_avx512f))
1578 {
1579 // The ISA can handle wider input registers, so concat and use two-arg
1580 // implementation. This reduces code duplication considerably.
1581 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3));
1582 }
1583 else //{{{2
1584 {
1585 // conversion using bit reinterpretation (or no conversion at all)
1586 // should all go through the concat branch above:
1587 static_assert(
1588 !(is_floating_point_v<
1589 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1590 // handle all zero extension{{{2
1591 if constexpr (4 * _Np < _M && sizeof(_To) > 16)
1592 {
1593 constexpr size_t Min = 16 / sizeof(_Up);
1594 return __zero_extend(
1595 __convert_x86<
1596 __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>(
1597 __v0, __v1, __v2, __v3));
1598 }
1599 else if constexpr (__i64_to_i16) //{{{2
1600 {
1601 if constexpr (__x_to_x && __have_sse4_1)
1602 {
1603 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1604 _mm_blend_epi16(
1605 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22),
1606 _mm_blend_epi16(_mm_slli_si128(__i2, 4),
1607 _mm_slli_si128(__i3, 6), 0x88),
1608 0xcc),
1609 _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
1610 14, 15)));
1611 }
1612 else if constexpr (__y_to_y && __have_avx2)
1613 {
1614 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1615 __xzyw(_mm256_blend_epi16(
1616 __auto_bitcast(
1617 _mm256_shuffle_ps(__vector_bitcast<float>(__v0),
1618 __vector_bitcast<float>(__v2),
1619 0x88)), // 0.1. 8.9. 2.3. A.B.
1620 __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps(
1621 __vector_bitcast<float>(__v1),
1622 __vector_bitcast<float>(__v3), 0x88))
1623 << 16), // .4.5 .C.D .6.7 .E.F
1624 0xaa) // 0415 8C9D 2637 AEBF
1625 ), // 0415 2637 8C9D AEBF
1626 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
1627 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
1628 10, 11, 14, 15)));
1629 /*
1630 auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26..
1631 .... auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15..
1632 .... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3); //
1633 8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2,
1634 __v3);
1635 // 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a,
1636 __b);
1637 // 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c,
1638 __d);
1639 // 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e,
1640 __f);
1641 // 0145 89CD 2367 ABEF return __concat(
1642 _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
1643 _mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123
1644 4567 89AB CDEF
1645 */
1646 } // else use fallback
1647 }
1648 else if constexpr (__i64_to_i8) //{{{2
1649 {
1650 if constexpr (__x_to_x)
1651 {
1652 // TODO: use fallback for now
1653 }
1654 else if constexpr (__y_to_x)
1655 {
1656 auto __a
1657 = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24)
1658 | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16)
1659 | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8)
1660 | _mm256_slli_epi32(
1661 __i3, 24); // 048C .... 159D .... 26AE .... 37BF ....
1662 /*return _mm_shuffle_epi8(
1663 _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5),
1664 _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15,
1665 3, 11));*/
1666 auto __b = _mm256_unpackhi_epi64(
1667 __a, __a); // 159D .... 159D .... 37BF .... 37BF ....
1668 auto __c = _mm256_unpacklo_epi8(
1669 __a, __b); // 0145 89CD .... .... 2367 ABEF .... ....
1670 return __intrin_bitcast<_To>(
1671 _mm_unpacklo_epi16(__lo128(__c),
1672 __hi128(__c))); // 0123 4567 89AB CDEF
1673 }
1674 }
1675 else if constexpr (__i32_to_i8) //{{{2
1676 {
1677 if constexpr (__x_to_x)
1678 {
1679 if constexpr (__have_ssse3)
1680 {
1681 const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff;
1682 const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff)
1683 << 8;
1684 const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff)
1685 << 16;
1686 const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24;
1687 return __intrin_bitcast<_To>(
1688 _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3),
1689 _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
1690 2, 6, 10, 14, 3, 7, 11,
1691 15)));
1692 }
1693 else
1694 {
1695 auto __a
1696 = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. ....
1697 auto __b
1698 = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. ....
1699 auto __c
1700 = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. ....
1701 auto __d
1702 = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. ....
1703 auto __e
1704 = _mm_unpacklo_epi8(__a, __c); // 048C .... .... ....
1705 auto __f
1706 = _mm_unpackhi_epi8(__a, __c); // 159D .... .... ....
1707 auto __g
1708 = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... ....
1709 auto __h
1710 = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... ....
1711 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(
1712 _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... ....
1713 _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... ....
1714 )); // 0123 4567 89AB CDEF
1715 }
1716 }
1717 else if constexpr (__y_to_y)
1718 {
1719 const auto __a = _mm256_shuffle_epi8(
1720 __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
1721 __i0, _mm256_slli_epi32(__i1, 16), 0xAA))
1722 & 0xff)
1723 | (__vector_bitcast<_UShort>(_mm256_blend_epi16(
1724 __i2, _mm256_slli_epi32(__i3, 16), 0xAA))
1725 << 8)),
1726 _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
1727 11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
1728 13, 3, 7, 11, 15));
1729 return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32(
1730 __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
1731 }
1732 }
1733 else if constexpr (__i64_to_f32) //{{{2
1734 {
1735 // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm
1736 // integers)
1737 if constexpr (__x_to_y)
1738 {
1739 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1],
1740 __v2[0], __v2[1], __v3[0],
1741 __v3[1]);
1742
1743 const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC
1744 const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD
1745 const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG
1746 const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH
1747 const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd
1748 const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh
1749 const auto __hi32 = __vector_bitcast<
1750 conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1751 __concat(_mm_unpackhi_epi32(__a, __b),
1752 _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH
1753 const auto __hi
1754 = 0x100000000LL
1755 * __convert_x86<__vector_type_t<float, 8>>(__hi32);
1756 const auto __mid
1757 = 0x10000
1758 * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16),
1759 _mm_srli_epi32(__lo32b, 16)));
1760 const auto __lo = _mm256_cvtepi32_ps(
1761 __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
1762 _mm_set1_epi32(0x0000ffffu) & __lo32b));
1763 return (__hi + __mid) + __lo;
1764 }
1765 }
1766 else if constexpr (__f64_to_ibw) //{{{2
1767 {
1768 return __convert_x86<_To>(
1769 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1770 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3));
1771 }
1772 else if constexpr (__f32_to_ibw) //{{{2
1773 {
1774 return __convert_x86<_To>(
1775 __convert_x86<__vector_type_t<int, _Np>>(__v0),
1776 __convert_x86<__vector_type_t<int, _Np>>(__v1),
1777 __convert_x86<__vector_type_t<int, _Np>>(__v2),
1778 __convert_x86<__vector_type_t<int, _Np>>(__v3));
1779 } //}}}
1780
1781 // fallback: {{{2
1782 if constexpr (sizeof(_To) >= 32)
1783 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1784 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0,
1785 __v1),
1786 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2,
1787 __v3));
1788 else if constexpr (sizeof(_To) == 16)
1789 {
1790 const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1));
1791 const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3));
1792 if constexpr (sizeof(_Up) * _Np * 2 == 8)
1793 {
1794 if constexpr (is_floating_point_v<_Up>)
1795 return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi));
1796 else
1797 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1798 }
1799 else if constexpr (sizeof(_Up) * _Np * 2 == 4)
1800 {
1801 if constexpr (is_floating_point_v<_Up>)
1802 return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi));
1803 else
1804 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1805 }
1806 else
1807 __assert_unreachable<_Tp>();
1808 }
1809 else
1810 return __vector_convert<_To>(__v0, __v1, __v2, __v3,
1811 make_index_sequence<_Np>());
1812 //}}}2
1813 }
1814 }
1815
1816//}}}
1817// 8-arg __convert_x86 {{{1
1818template <typename _To, typename _V, typename _Traits>
1819 _GLIBCXX_SIMD_INTRINSIC _To
1820 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
1821 _V __v7)
1822 {
1823 static_assert(__is_vector_type_v<_V>);
1824 using _Tp = typename _Traits::value_type;
1825 constexpr size_t _Np = _Traits::_S_full_size;
1826 [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
1827 [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
1828 [[maybe_unused]] const auto __i2 = __to_intrin(__v2);
1829 [[maybe_unused]] const auto __i3 = __to_intrin(__v3);
1830 [[maybe_unused]] const auto __i4 = __to_intrin(__v4);
1831 [[maybe_unused]] const auto __i5 = __to_intrin(__v5);
1832 [[maybe_unused]] const auto __i6 = __to_intrin(__v6);
1833 [[maybe_unused]] const auto __i7 = __to_intrin(__v7);
1834 using _Up = typename _VectorTraits<_To>::value_type;
1835 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1836
1837 static_assert(8 * _Np <= _M,
1838 "__v4-__v7 would be discarded; use the four/two/one-argument "
1839 "__convert_x86 overload instead");
1840
1841 // [xyz]_to_[xyz] {{{2
1842 [[maybe_unused]] constexpr bool __x_to_x
1843 = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
1844 [[maybe_unused]] constexpr bool __x_to_y
1845 = sizeof(__v0) <= 16 && sizeof(_To) == 32;
1846 [[maybe_unused]] constexpr bool __x_to_z
1847 = sizeof(__v0) <= 16 && sizeof(_To) == 64;
1848 [[maybe_unused]] constexpr bool __y_to_x
1849 = sizeof(__v0) == 32 && sizeof(_To) <= 16;
1850 [[maybe_unused]] constexpr bool __y_to_y
1851 = sizeof(__v0) == 32 && sizeof(_To) == 32;
1852 [[maybe_unused]] constexpr bool __y_to_z
1853 = sizeof(__v0) == 32 && sizeof(_To) == 64;
1854 [[maybe_unused]] constexpr bool __z_to_x
1855 = sizeof(__v0) == 64 && sizeof(_To) <= 16;
1856 [[maybe_unused]] constexpr bool __z_to_y
1857 = sizeof(__v0) == 64 && sizeof(_To) == 32;
1858 [[maybe_unused]] constexpr bool __z_to_z
1859 = sizeof(__v0) == 64 && sizeof(_To) == 64;
1860
1861 // [if]X_to_i8 {{{2
1862 [[maybe_unused]] constexpr bool __i_to_i
1863 = is_integral_v<_Up> && is_integral_v<_Tp>;
1864 [[maybe_unused]] constexpr bool __i64_to_i8
1865 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
1866 [[maybe_unused]] constexpr bool __f64_to_i8
1867 = is_integral_v<_Up> && sizeof(_Up) == 1
1868 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
1869
1870 if constexpr (__i_to_i) // assert ISA {{{2
1871 {
1872 static_assert(__x_to_x || __have_avx2,
1873 "integral conversions with ymm registers require AVX2");
1874 static_assert(__have_avx512bw
1875 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
1876 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
1877 "8/16-bit integers in zmm registers require AVX512BW");
1878 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
1879 "integral conversions with ymm registers require AVX2");
1880 }
1881 // concat => use 4-arg __convert_x86 {{{2
1882 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
1883 || (sizeof(__v0) == 16 && __have_avx
1884 && is_floating_point_v<_Tp>)
1885 || (sizeof(__v0) == 32 && __have_avx512f))
1886 {
1887 // The ISA can handle wider input registers, so concat and use two-arg
1888 // implementation. This reduces code duplication considerably.
1889 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
1890 __concat(__v4, __v5), __concat(__v6, __v7));
1891 }
1892 else //{{{2
1893 {
1894 // conversion using bit reinterpretation (or no conversion at all)
1895 // should all go through the concat branch above:
1896 static_assert(
1897 !(is_floating_point_v<
1898 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
1899 static_assert(!(8 * _Np < _M && sizeof(_To) > 16),
1900 "zero extension should be impossible");
1901 if constexpr (__i64_to_i8) //{{{2
1902 {
1903 if constexpr (__x_to_x && __have_ssse3)
1904 {
1905 // unsure whether this is better than the variant below
1906 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1907 __to_intrin(
1908 (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
1909 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
1910 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
1911 | (((__v6 & 0xff) << 48) | (__v7 << 56)))),
1912 _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
1913 7, 15)));
1914 }
1915 else if constexpr (__x_to_x)
1916 {
1917 const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac
1918 const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd
1919 const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg
1920 const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh
1921 const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik
1922 const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl
1923 const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo
1924 const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np
1925 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(
1926 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd
1927 _mm_unpacklo_epi8(__c, __d)), // efgh
1928 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl
1929 _mm_unpacklo_epi8(__g, __h)) // mnop
1930 ));
1931 }
1932 else if constexpr (__y_to_y)
1933 {
1934 auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV
1935 __to_intrin(
1936 (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
1937 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
1938 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
1939 | (((__v6 & 0xff) << 48) | ((__v7 << 56)))));
1940 /*
1941 auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D
1942 HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a,
1943 __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d =
1944 __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return
1945 _mm256_shuffle_epi8(
1946 __d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
1947 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
1948 14, 15));
1949 */
1950 auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF
1951 // IJMN QRUV
1952 __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
1953 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
1954 4, 12, 5, 13, 6, 14, 7, 15));
1955 auto __c
1956 = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
1957 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1958 __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
1959 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
1960 4, 5, 12, 13, 6, 7, 14, 15)));
1961 }
1962 else if constexpr (__z_to_z)
1963 {
1964 return __concat(
1965 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2,
1966 __v3),
1967 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1968 __v7));
1969 }
1970 }
1971 else if constexpr (__f64_to_i8) //{{{2
1972 {
1973 return __convert_x86<_To>(
1974 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1975 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3),
1976 __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5),
1977 __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7));
1978 }
1979 else // unreachable {{{2
1980 __assert_unreachable<_Tp>();
1981 //}}}
1982
1983 // fallback: {{{2
1984 if constexpr (sizeof(_To) >= 32)
1985 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
1986 return __concat(
1987 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3),
1988 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1989 __v7));
1990 else if constexpr (sizeof(_To) == 16)
1991 {
1992 const auto __lo
1993 = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3));
1994 const auto __hi
1995 = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7));
1996 static_assert(sizeof(_Up) == 1 && _Np == 2);
1997 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1998 }
1999 else
2000 {
2001 __assert_unreachable<_Tp>();
2002 // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5,
2003 // __v6, __v7,
2004 // make_index_sequence<_Np>());
2005 } //}}}2
2006 }
2007 }
2008
2009//}}}
2010// 16-arg __convert_x86 {{{1
2011template <typename _To, typename _V, typename _Traits>
2012 _GLIBCXX_SIMD_INTRINSIC _To
2013 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
2014 _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12,
2015 _V __v13, _V __v14, _V __v15)
2016 {
2017 // concat => use 8-arg __convert_x86
2018 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
2019 __concat(__v4, __v5), __concat(__v6, __v7),
2020 __concat(__v8, __v9), __concat(__v10, __v11),
2021 __concat(__v12, __v13), __concat(__v14, __v15));
2022 }
2023
2024//}}}
2025
2026#endif // __cplusplus >= 201703L
2027#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
2028
2029// vim: foldmethod=marker
Note: See TracBrowser for help on using the repository browser.