source: Daodan/MSYS2/mingw32/lib/gcc/i686-w64-mingw32/11.2.0/include/avx2intrin.h@ 1175

Last change on this file since 1175 was 1166, checked in by rossy, 3 years ago

Daodan: Replace MinGW build env with an up-to-date MSYS2 env

File size: 57.4 KB
RevLine 
[1166]1/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24#ifndef _IMMINTRIN_H_INCLUDED
25# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef _AVX2INTRIN_H_INCLUDED
29#define _AVX2INTRIN_H_INCLUDED
30
/* If AVX2 is not enabled on the command line, save the current target
   options and switch the "avx2" target on for the definitions below.
   __DISABLE_AVX2__ records that this was done (presumably so a matching
   pop_options can be issued later in the file, outside this view).  */
#if !defined (__AVX2__)
#pragma GCC push_options
#pragma GCC target("avx2")
#define __DISABLE_AVX2__
#endif /* !__AVX2__ */
36
/* Sum the absolute 8-bit differences of adjacent groups of four byte
   integers taken from the first two operands.  The third (mask)
   operand determines the starting offsets within the operands.
   Without optimization the mask would not reach the builtin as a
   constant through the inline function, so a macro is used instead.  */
#ifndef __OPTIMIZE__
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
                                        (__v32qi)(__m256i)(Y), (int)(M)))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  __v32qi __lhs = (__v32qi) __X;
  __v32qi __rhs = (__v32qi) __Y;

  return (__m256i) __builtin_ia32_mpsadbw256 (__lhs, __rhs, __M);
}
#endif
53
54extern __inline __m256i
55__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
56_mm256_abs_epi8 (__m256i __A)
57{
58 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
59}
60
61extern __inline __m256i
62__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
63_mm256_abs_epi16 (__m256i __A)
64{
65 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
66}
67
68extern __inline __m256i
69__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
70_mm256_abs_epi32 (__m256i __A)
71{
72 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
73}
74
75extern __inline __m256i
76__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
77_mm256_packs_epi32 (__m256i __A, __m256i __B)
78{
79 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
80}
81
82extern __inline __m256i
83__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
84_mm256_packs_epi16 (__m256i __A, __m256i __B)
85{
86 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
87}
88
89extern __inline __m256i
90__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
91_mm256_packus_epi32 (__m256i __A, __m256i __B)
92{
93 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
94}
95
96extern __inline __m256i
97__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
98_mm256_packus_epi16 (__m256i __A, __m256i __B)
99{
100 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
101}
102
103extern __inline __m256i
104__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
105_mm256_add_epi8 (__m256i __A, __m256i __B)
106{
107 return (__m256i) ((__v32qu)__A + (__v32qu)__B);
108}
109
110extern __inline __m256i
111__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
112_mm256_add_epi16 (__m256i __A, __m256i __B)
113{
114 return (__m256i) ((__v16hu)__A + (__v16hu)__B);
115}
116
117extern __inline __m256i
118__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
119_mm256_add_epi32 (__m256i __A, __m256i __B)
120{
121 return (__m256i) ((__v8su)__A + (__v8su)__B);
122}
123
124extern __inline __m256i
125__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
126_mm256_add_epi64 (__m256i __A, __m256i __B)
127{
128 return (__m256i) ((__v4du)__A + (__v4du)__B);
129}
130
131extern __inline __m256i
132__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
133_mm256_adds_epi8 (__m256i __A, __m256i __B)
134{
135 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
136}
137
138extern __inline __m256i
139__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
140_mm256_adds_epi16 (__m256i __A, __m256i __B)
141{
142 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
143}
144
145extern __inline __m256i
146__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
147_mm256_adds_epu8 (__m256i __A, __m256i __B)
148{
149 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
150}
151
152extern __inline __m256i
153__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
154_mm256_adds_epu16 (__m256i __A, __m256i __B)
155{
156 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
157}
158
/* Byte-wise align-right of __A and __B through the palignr256 builtin.
   The byte count is multiplied by 8 because the builtin takes a bit
   count.  Without optimization the product would live in a register
   and the instruction pattern would not match, so a macro is used.  */
#ifndef __OPTIMIZE__
#define _mm256_alignr_epi8(A, B, N) \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
                                        (__v4di)(__m256i)(B), \
                                        (int)(N) * 8))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  __v4di __hi = (__v4di) __A;
  __v4di __lo = (__v4di) __B;

  return (__m256i) __builtin_ia32_palignr256 (__hi, __lo, __N * 8);
}
#endif
176
177extern __inline __m256i
178__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
179_mm256_and_si256 (__m256i __A, __m256i __B)
180{
181 return (__m256i) ((__v4du)__A & (__v4du)__B);
182}
183
184extern __inline __m256i
185__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
186_mm256_andnot_si256 (__m256i __A, __m256i __B)
187{
188 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
189}
190
191extern __inline __m256i
192__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
193_mm256_avg_epu8 (__m256i __A, __m256i __B)
194{
195 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
196}
197
198extern __inline __m256i
199__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
200_mm256_avg_epu16 (__m256i __A, __m256i __B)
201{
202 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
203}
204
205extern __inline __m256i
206__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
207_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
208{
209 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
210 (__v32qi)__Y,
211 (__v32qi)__M);
212}
213
/* Blend the 16-bit elements of __X and __Y under control of the
   immediate mask __M (pblendw256 builtin).  A macro is used when not
   optimizing so that the mask is passed to the builtin directly.  */
#ifndef __OPTIMIZE__
#define _mm256_blend_epi16(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
                                        (__v16hi)(__m256i)(Y), (int)(M)))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  __v16hi __first = (__v16hi) __X;
  __v16hi __second = (__v16hi) __Y;

  return (__m256i) __builtin_ia32_pblendw256 (__first, __second, __M);
}
#endif
228
229extern __inline __m256i
230__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
231_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
232{
233 return (__m256i) ((__v32qi)__A == (__v32qi)__B);
234}
235
236extern __inline __m256i
237__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
238_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
239{
240 return (__m256i) ((__v16hi)__A == (__v16hi)__B);
241}
242
243extern __inline __m256i
244__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
245_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
246{
247 return (__m256i) ((__v8si)__A == (__v8si)__B);
248}
249
250extern __inline __m256i
251__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
252_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
253{
254 return (__m256i) ((__v4di)__A == (__v4di)__B);
255}
256
257extern __inline __m256i
258__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
259_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
260{
261 return (__m256i) ((__v32qs)__A > (__v32qs)__B);
262}
263
264extern __inline __m256i
265__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
266_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
267{
268 return (__m256i) ((__v16hi)__A > (__v16hi)__B);
269}
270
271extern __inline __m256i
272__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
273_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
274{
275 return (__m256i) ((__v8si)__A > (__v8si)__B);
276}
277
278extern __inline __m256i
279__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
280_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
281{
282 return (__m256i) ((__v4di)__A > (__v4di)__B);
283}
284
285extern __inline __m256i
286__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
287_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
288{
289 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
290 (__v16hi)__Y);
291}
292
293extern __inline __m256i
294__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
295_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
296{
297 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
298}
299
300extern __inline __m256i
301__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
302_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
303{
304 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
305 (__v16hi)__Y);
306}
307
308extern __inline __m256i
309__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
310_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
311{
312 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
313 (__v16hi)__Y);
314}
315
316extern __inline __m256i
317__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
318_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
319{
320 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
321}
322
323extern __inline __m256i
324__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
325_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
326{
327 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
328 (__v16hi)__Y);
329}
330
331extern __inline __m256i
332__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
333_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
334{
335 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
336 (__v32qi)__Y);
337}
338
339extern __inline __m256i
340__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
341_mm256_madd_epi16 (__m256i __A, __m256i __B)
342{
343 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
344 (__v16hi)__B);
345}
346
347extern __inline __m256i
348__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
349_mm256_max_epi8 (__m256i __A, __m256i __B)
350{
351 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
352}
353
354extern __inline __m256i
355__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
356_mm256_max_epi16 (__m256i __A, __m256i __B)
357{
358 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
359}
360
361extern __inline __m256i
362__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
363_mm256_max_epi32 (__m256i __A, __m256i __B)
364{
365 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
366}
367
368extern __inline __m256i
369__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
370_mm256_max_epu8 (__m256i __A, __m256i __B)
371{
372 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
373}
374
375extern __inline __m256i
376__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
377_mm256_max_epu16 (__m256i __A, __m256i __B)
378{
379 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
380}
381
382extern __inline __m256i
383__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
384_mm256_max_epu32 (__m256i __A, __m256i __B)
385{
386 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
387}
388
389extern __inline __m256i
390__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
391_mm256_min_epi8 (__m256i __A, __m256i __B)
392{
393 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
394}
395
396extern __inline __m256i
397__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
398_mm256_min_epi16 (__m256i __A, __m256i __B)
399{
400 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
401}
402
403extern __inline __m256i
404__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
405_mm256_min_epi32 (__m256i __A, __m256i __B)
406{
407 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
408}
409
410extern __inline __m256i
411__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
412_mm256_min_epu8 (__m256i __A, __m256i __B)
413{
414 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
415}
416
417extern __inline __m256i
418__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
419_mm256_min_epu16 (__m256i __A, __m256i __B)
420{
421 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
422}
423
424extern __inline __m256i
425__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
426_mm256_min_epu32 (__m256i __A, __m256i __B)
427{
428 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
429}
430
431extern __inline int
432__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
433_mm256_movemask_epi8 (__m256i __A)
434{
435 return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
436}
437
438extern __inline __m256i
439__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
440_mm256_cvtepi8_epi16 (__m128i __X)
441{
442 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
443}
444
445extern __inline __m256i
446__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
447_mm256_cvtepi8_epi32 (__m128i __X)
448{
449 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
450}
451
452extern __inline __m256i
453__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
454_mm256_cvtepi8_epi64 (__m128i __X)
455{
456 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
457}
458
459extern __inline __m256i
460__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
461_mm256_cvtepi16_epi32 (__m128i __X)
462{
463 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
464}
465
466extern __inline __m256i
467__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
468_mm256_cvtepi16_epi64 (__m128i __X)
469{
470 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
471}
472
473extern __inline __m256i
474__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
475_mm256_cvtepi32_epi64 (__m128i __X)
476{
477 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
478}
479
480extern __inline __m256i
481__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
482_mm256_cvtepu8_epi16 (__m128i __X)
483{
484 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
485}
486
487extern __inline __m256i
488__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
489_mm256_cvtepu8_epi32 (__m128i __X)
490{
491 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
492}
493
494extern __inline __m256i
495__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
496_mm256_cvtepu8_epi64 (__m128i __X)
497{
498 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
499}
500
501extern __inline __m256i
502__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
503_mm256_cvtepu16_epi32 (__m128i __X)
504{
505 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
506}
507
508extern __inline __m256i
509__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
510_mm256_cvtepu16_epi64 (__m128i __X)
511{
512 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
513}
514
515extern __inline __m256i
516__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
517_mm256_cvtepu32_epi64 (__m128i __X)
518{
519 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
520}
521
522extern __inline __m256i
523__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
524_mm256_mul_epi32 (__m256i __X, __m256i __Y)
525{
526 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
527}
528
529extern __inline __m256i
530__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
531_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
532{
533 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
534 (__v16hi)__Y);
535}
536
537extern __inline __m256i
538__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
539_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
540{
541 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
542}
543
544extern __inline __m256i
545__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
546_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
547{
548 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
549}
550
551extern __inline __m256i
552__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
553_mm256_mullo_epi16 (__m256i __A, __m256i __B)
554{
555 return (__m256i) ((__v16hu)__A * (__v16hu)__B);
556}
557
558extern __inline __m256i
559__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
560_mm256_mullo_epi32 (__m256i __A, __m256i __B)
561{
562 return (__m256i) ((__v8su)__A * (__v8su)__B);
563}
564
565extern __inline __m256i
566__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
567_mm256_mul_epu32 (__m256i __A, __m256i __B)
568{
569 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
570}
571
572extern __inline __m256i
573__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
574_mm256_or_si256 (__m256i __A, __m256i __B)
575{
576 return (__m256i) ((__v4du)__A | (__v4du)__B);
577}
578
579extern __inline __m256i
580__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
581_mm256_sad_epu8 (__m256i __A, __m256i __B)
582{
583 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
584}
585
586extern __inline __m256i
587__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
588_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
589{
590 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
591 (__v32qi)__Y);
592}
593
/* Immediate-controlled shuffles: 32-bit elements (pshufd256), and the
   high (pshufhw256) and low (pshuflw256) 16-bit element variants.
   Macros are used when not optimizing so that the mask is handed to
   the builtin directly.  */
#ifndef __OPTIMIZE__
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  __v8si __dwords = (__v8si) __A;

  return (__m256i) __builtin_ia32_pshufd256 (__dwords, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  __v16hi __words = (__v16hi) __A;

  return (__m256i) __builtin_ia32_pshufhw256 (__words, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  __v16hi __words = (__v16hi) __A;

  return (__m256i) __builtin_ia32_pshuflw256 (__words, __mask);
}
#endif
623
624extern __inline __m256i
625__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
626_mm256_sign_epi8 (__m256i __X, __m256i __Y)
627{
628 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
629}
630
631extern __inline __m256i
632__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
633_mm256_sign_epi16 (__m256i __X, __m256i __Y)
634{
635 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
636}
637
638extern __inline __m256i
639__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
640_mm256_sign_epi32 (__m256i __X, __m256i __Y)
641{
642 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
643}
644
/* Byte-granular left shift through the pslldqi256 builtin.  The byte
   count is multiplied by 8 because the builtin takes a bit count.
   _mm256_bslli_epi128 and _mm256_slli_si256 are two names for the same
   operation.  Macros are used when not optimizing so the count stays a
   constant expression at the builtin call.  */
#ifndef __OPTIMIZE__
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i) __builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i) __builtin_ia32_pslldqi256 (__A, __N * 8);
}
#endif
665
666extern __inline __m256i
667__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668_mm256_slli_epi16 (__m256i __A, int __B)
669{
670 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
671}
672
673extern __inline __m256i
674__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
675_mm256_sll_epi16 (__m256i __A, __m128i __B)
676{
677 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
678}
679
680extern __inline __m256i
681__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
682_mm256_slli_epi32 (__m256i __A, int __B)
683{
684 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
685}
686
687extern __inline __m256i
688__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
689_mm256_sll_epi32 (__m256i __A, __m128i __B)
690{
691 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
692}
693
694extern __inline __m256i
695__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
696_mm256_slli_epi64 (__m256i __A, int __B)
697{
698 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
699}
700
701extern __inline __m256i
702__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
703_mm256_sll_epi64 (__m256i __A, __m128i __B)
704{
705 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
706}
707
708extern __inline __m256i
709__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710_mm256_srai_epi16 (__m256i __A, int __B)
711{
712 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
713}
714
715extern __inline __m256i
716__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
717_mm256_sra_epi16 (__m256i __A, __m128i __B)
718{
719 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
720}
721
722extern __inline __m256i
723__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
724_mm256_srai_epi32 (__m256i __A, int __B)
725{
726 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
727}
728
729extern __inline __m256i
730__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
731_mm256_sra_epi32 (__m256i __A, __m128i __B)
732{
733 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
734}
735
/* Byte-granular right shift through the psrldqi256 builtin.  The byte
   count is multiplied by 8 because the builtin takes a bit count.
   _mm256_bsrli_epi128 and _mm256_srli_si256 are two names for the same
   operation.  Macros are used when not optimizing so the count stays a
   constant expression at the builtin call.  */
#ifndef __OPTIMIZE__
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i) __builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i) __builtin_ia32_psrldqi256 (__A, __N * 8);
}
#endif
756
757extern __inline __m256i
758__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
759_mm256_srli_epi16 (__m256i __A, int __B)
760{
761 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
762}
763
764extern __inline __m256i
765__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
766_mm256_srl_epi16 (__m256i __A, __m128i __B)
767{
768 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
769}
770
771extern __inline __m256i
772__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
773_mm256_srli_epi32 (__m256i __A, int __B)
774{
775 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
776}
777
778extern __inline __m256i
779__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
780_mm256_srl_epi32 (__m256i __A, __m128i __B)
781{
782 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
783}
784
785extern __inline __m256i
786__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
787_mm256_srli_epi64 (__m256i __A, int __B)
788{
789 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
790}
791
792extern __inline __m256i
793__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
794_mm256_srl_epi64 (__m256i __A, __m128i __B)
795{
796 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
797}
798
799extern __inline __m256i
800__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
801_mm256_sub_epi8 (__m256i __A, __m256i __B)
802{
803 return (__m256i) ((__v32qu)__A - (__v32qu)__B);
804}
805
806extern __inline __m256i
807__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
808_mm256_sub_epi16 (__m256i __A, __m256i __B)
809{
810 return (__m256i) ((__v16hu)__A - (__v16hu)__B);
811}
812
813extern __inline __m256i
814__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
815_mm256_sub_epi32 (__m256i __A, __m256i __B)
816{
817 return (__m256i) ((__v8su)__A - (__v8su)__B);
818}
819
820extern __inline __m256i
821__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
822_mm256_sub_epi64 (__m256i __A, __m256i __B)
823{
824 return (__m256i) ((__v4du)__A - (__v4du)__B);
825}
826
827extern __inline __m256i
828__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
829_mm256_subs_epi8 (__m256i __A, __m256i __B)
830{
831 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
832}
833
834extern __inline __m256i
835__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
836_mm256_subs_epi16 (__m256i __A, __m256i __B)
837{
838 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
839}
840
841extern __inline __m256i
842__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
843_mm256_subs_epu8 (__m256i __A, __m256i __B)
844{
845 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
846}
847
848extern __inline __m256i
849__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
850_mm256_subs_epu16 (__m256i __A, __m256i __B)
851{
852 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
853}
854
855extern __inline __m256i
856__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
857_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
858{
859 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
860}
861
862extern __inline __m256i
863__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
864_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
865{
866 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
867}
868
869extern __inline __m256i
870__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
871_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
872{
873 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
874}
875
876extern __inline __m256i
877__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
878_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
879{
880 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
881}
882
883extern __inline __m256i
884__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
885_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
886{
887 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
888}
889
890extern __inline __m256i
891__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
892_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
893{
894 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
895}
896
897extern __inline __m256i
898__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
899_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
900{
901 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
902}
903
904extern __inline __m256i
905__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
906_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
907{
908 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
909}
910
911extern __inline __m256i
912__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
913_mm256_xor_si256 (__m256i __A, __m256i __B)
914{
915 return (__m256i) ((__v4du)__A ^ (__v4du)__B);
916}
917
918extern __inline __m256i
919__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
920_mm256_stream_load_si256 (__m256i const *__X)
921{
922 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
923}
924
925extern __inline __m128
926__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
927_mm_broadcastss_ps (__m128 __X)
928{
929 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
930}
931
932extern __inline __m256
933__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
934_mm256_broadcastss_ps (__m128 __X)
935{
936 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
937}
938
939extern __inline __m256d
940__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
941_mm256_broadcastsd_pd (__m128d __X)
942{
943 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
944}
945
946extern __inline __m256i
947__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
948_mm256_broadcastsi128_si256 (__m128i __X)
949{
950 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
951}
952
/* Alternate spelling: expands to _mm256_broadcastsi128_si256.  */
#define _mm_broadcastsi128_si256(V) _mm256_broadcastsi128_si256(V)
/* Expands to _mm_movedup_pd, which is declared outside this header.  */
#define _mm_broadcastsd_pd(V) _mm_movedup_pd(V)
955
/* Blend the 32-bit elements of the 128-bit vectors __X and __Y under
   control of the immediate mask __M (pblendd128 builtin).  A macro is
   used when not optimizing so that the mask is passed to the builtin
   directly.  */
#ifndef __OPTIMIZE__
#define _mm_blend_epi32(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
                                        (__v4si)(__m128i)(Y), (int)(M)))
#else
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  __v4si __first = (__v4si) __X;
  __v4si __second = (__v4si) __Y;

  return (__m128i) __builtin_ia32_pblendd128 (__first, __second, __M);
}
#endif
970
/* Blend the 32-bit elements of the 256-bit vectors __X and __Y under
   control of the immediate mask __M (pblendd256 builtin).  A macro is
   used when not optimizing so that the mask is passed to the builtin
   directly.  */
#ifndef __OPTIMIZE__
#define _mm256_blend_epi32(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
                                        (__v8si)(__m256i)(Y), (int)(M)))
#else
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  __v8si __first = (__v8si) __X;
  __v8si __second = (__v8si) __Y;

  return (__m256i) __builtin_ia32_pblendd256 (__first, __second, __M);
}
#endif
985
986extern __inline __m256i
987__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
988_mm256_broadcastb_epi8 (__m128i __X)
989{
990 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
991}
992
993extern __inline __m256i
994__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
995_mm256_broadcastw_epi16 (__m128i __X)
996{
997 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
998}
999
1000extern __inline __m256i
1001__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1002_mm256_broadcastd_epi32 (__m128i __X)
1003{
1004 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
1005}
1006
1007extern __inline __m256i
1008__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1009_mm256_broadcastq_epi64 (__m128i __X)
1010{
1011 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
1012}
1013
1014extern __inline __m128i
1015__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1016_mm_broadcastb_epi8 (__m128i __X)
1017{
1018 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
1019}
1020
1021extern __inline __m128i
1022__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1023_mm_broadcastw_epi16 (__m128i __X)
1024{
1025 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1026}
1027
1028extern __inline __m128i
1029__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1030_mm_broadcastd_epi32 (__m128i __X)
1031{
1032 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1033}
1034
1035extern __inline __m128i
1036__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1037_mm_broadcastq_epi64 (__m128i __X)
1038{
1039 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1040}
1041
1042extern __inline __m256i
1043__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1044_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1045{
1046 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
1047}
1048
/* Permute the four doubles of __X across lanes as selected by the
   immediate __M (VPERMPD).  __M must reach the builtin as a
   compile-time constant, hence the macro form when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  __v4df __data = (__v4df) __X;

  return (__m256d) __builtin_ia32_permdf256 (__data, __M);
}
#else
#define _mm256_permute4x64_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), \
                                       (int)(M)))
#endif
1060
1061extern __inline __m256
1062__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1063_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
1064{
1065 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
1066}
1067
/* Permute the four 64-bit integers of __X across lanes as selected by
   the immediate __M (VPERMQ).  __M must reach the builtin as a
   compile-time constant, hence the macro form when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  __v4di __data = (__v4di) __X;

  return (__m256i) __builtin_ia32_permdi256 (__data, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M) \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), \
                                       (int)(M)))
#endif
1079
1080
/* Build a 256-bit result from 128-bit halves of __X and __Y as
   selected by the immediate __M (VPERM2I128).  __M must reach the
   builtin as a compile-time constant, hence the macro form when not
   optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  __v4di __first = (__v4di) __X;
  __v4di __second = (__v4di) __Y;

  return (__m256i) __builtin_ia32_permti256 (__first, __second, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), \
                                       (__v4di)(__m256i)(Y), \
                                       (int)(M)))
#endif
1092
/* Extract the 128-bit half of __X selected by the immediate __M
   (VEXTRACTI128).  __M must reach the builtin as a compile-time
   constant, hence the macro form when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  __v4di __wide = (__v4di) __X;

  return (__m128i) __builtin_ia32_extract128i256 (__wide, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), \
                                            (int)(M)))
#endif
1104
/* Return __X with the 128-bit half selected by the immediate __M
   replaced by __Y (VINSERTI128).  __M must reach the builtin as a
   compile-time constant, hence the macro form when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  __v4di __wide = (__v4di) __X;
  __v2di __half = (__v2di) __Y;

  return (__m256i) __builtin_ia32_insert128i256 (__wide, __half, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif
1118
1119extern __inline __m256i
1120__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1121_mm256_maskload_epi32 (int const *__X, __m256i __M )
1122{
1123 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1124 (__v8si)__M);
1125}
1126
1127extern __inline __m256i
1128__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1129_mm256_maskload_epi64 (long long const *__X, __m256i __M )
1130{
1131 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1132 (__v4di)__M);
1133}
1134
1135extern __inline __m128i
1136__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1137_mm_maskload_epi32 (int const *__X, __m128i __M )
1138{
1139 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1140 (__v4si)__M);
1141}
1142
1143extern __inline __m128i
1144__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1145_mm_maskload_epi64 (long long const *__X, __m128i __M )
1146{
1147 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1148 (__v2di)__M);
1149}
1150
1151extern __inline void
1152__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1153_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1154{
1155 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1156}
1157
1158extern __inline void
1159__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1160_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1161{
1162 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1163}
1164
1165extern __inline void
1166__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1167_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1168{
1169 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1170}
1171
1172extern __inline void
1173__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1174_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1175{
1176 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1177}
1178
1179extern __inline __m256i
1180__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1181_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1182{
1183 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1184}
1185
1186extern __inline __m128i
1187__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1188_mm_sllv_epi32 (__m128i __X, __m128i __Y)
1189{
1190 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1191}
1192
1193extern __inline __m256i
1194__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1195_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1196{
1197 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1198}
1199
1200extern __inline __m128i
1201__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1202_mm_sllv_epi64 (__m128i __X, __m128i __Y)
1203{
1204 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1205}
1206
1207extern __inline __m256i
1208__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1209_mm256_srav_epi32 (__m256i __X, __m256i __Y)
1210{
1211 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1212}
1213
1214extern __inline __m128i
1215__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1216_mm_srav_epi32 (__m128i __X, __m128i __Y)
1217{
1218 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1219}
1220
1221extern __inline __m256i
1222__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1223_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1224{
1225 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1226}
1227
1228extern __inline __m128i
1229__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1230_mm_srlv_epi32 (__m128i __X, __m128i __Y)
1231{
1232 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1233}
1234
1235extern __inline __m256i
1236__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1237_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1238{
1239 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1240}
1241
1242extern __inline __m128i
1243__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1244_mm_srlv_epi64 (__m128i __X, __m128i __Y)
1245{
1246 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1247}
1248
1249#ifdef __OPTIMIZE__
1250extern __inline __m128d
1251__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1252_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1253{
1254 __v2df __zero = _mm_setzero_pd ();
1255 __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
1256
1257 return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
1258 __base,
1259 (__v4si)__index,
1260 __mask,
1261 __scale);
1262}
1263
1264extern __inline __m128d
1265__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1266_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
1267 __m128d __mask, const int __scale)
1268{
1269 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
1270 __base,
1271 (__v4si)__index,
1272 (__v2df)__mask,
1273 __scale);
1274}
1275
1276extern __inline __m256d
1277__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1278_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1279{
1280 __v4df __zero = _mm256_setzero_pd ();
1281 __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
1282
1283 return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
1284 __base,
1285 (__v4si)__index,
1286 __mask,
1287 __scale);
1288}
1289
1290extern __inline __m256d
1291__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1292_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
1293 __m128i __index, __m256d __mask, const int __scale)
1294{
1295 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
1296 __base,
1297 (__v4si)__index,
1298 (__v4df)__mask,
1299 __scale);
1300}
1301
1302extern __inline __m128d
1303__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1304_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
1305{
1306 __v2df __src = _mm_setzero_pd ();
1307 __v2df __mask = _mm_cmpeq_pd (__src, __src);
1308
1309 return (__m128d) __builtin_ia32_gatherdiv2df (__src,
1310 __base,
1311 (__v2di)__index,
1312 __mask,
1313 __scale);
1314}
1315
1316extern __inline __m128d
1317__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1318_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
1319 __m128d __mask, const int __scale)
1320{
1321 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
1322 __base,
1323 (__v2di)__index,
1324 (__v2df)__mask,
1325 __scale);
1326}
1327
1328extern __inline __m256d
1329__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1330_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
1331{
1332 __v4df __src = _mm256_setzero_pd ();
1333 __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
1334
1335 return (__m256d) __builtin_ia32_gatherdiv4df (__src,
1336 __base,
1337 (__v4di)__index,
1338 __mask,
1339 __scale);
1340}
1341
1342extern __inline __m256d
1343__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1344_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
1345 __m256i __index, __m256d __mask, const int __scale)
1346{
1347 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
1348 __base,
1349 (__v4di)__index,
1350 (__v4df)__mask,
1351 __scale);
1352}
1353
1354extern __inline __m128
1355__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1356_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
1357{
1358 __v4sf __src = _mm_setzero_ps ();
1359 __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1360
1361 return (__m128) __builtin_ia32_gathersiv4sf (__src,
1362 __base,
1363 (__v4si)__index,
1364 __mask,
1365 __scale);
1366}
1367
1368extern __inline __m128
1369__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1370_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
1371 __m128 __mask, const int __scale)
1372{
1373 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
1374 __base,
1375 (__v4si)__index,
1376 (__v4sf)__mask,
1377 __scale);
1378}
1379
1380extern __inline __m256
1381__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1382_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
1383{
1384 __v8sf __src = _mm256_setzero_ps ();
1385 __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
1386
1387 return (__m256) __builtin_ia32_gathersiv8sf (__src,
1388 __base,
1389 (__v8si)__index,
1390 __mask,
1391 __scale);
1392}
1393
1394extern __inline __m256
1395__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1396_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
1397 __m256i __index, __m256 __mask, const int __scale)
1398{
1399 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
1400 __base,
1401 (__v8si)__index,
1402 (__v8sf)__mask,
1403 __scale);
1404}
1405
1406extern __inline __m128
1407__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1408_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
1409{
1410 __v4sf __src = _mm_setzero_ps ();
1411 __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1412
1413 return (__m128) __builtin_ia32_gatherdiv4sf (__src,
1414 __base,
1415 (__v2di)__index,
1416 __mask,
1417 __scale);
1418}
1419
1420extern __inline __m128
1421__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1422_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
1423 __m128 __mask, const int __scale)
1424{
1425 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
1426 __base,
1427 (__v2di)__index,
1428 (__v4sf)__mask,
1429 __scale);
1430}
1431
1432extern __inline __m128
1433__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1434_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
1435{
1436 __v4sf __src = _mm_setzero_ps ();
1437 __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1438
1439 return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
1440 __base,
1441 (__v4di)__index,
1442 __mask,
1443 __scale);
1444}
1445
1446extern __inline __m128
1447__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1448_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
1449 __m256i __index, __m128 __mask, const int __scale)
1450{
1451 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
1452 __base,
1453 (__v4di)__index,
1454 (__v4sf)__mask,
1455 __scale);
1456}
1457
1458extern __inline __m128i
1459__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1460_mm_i32gather_epi64 (long long int const *__base,
1461 __m128i __index, const int __scale)
1462{
1463 __v2di __src = __extension__ (__v2di){ 0, 0 };
1464 __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1465
1466 return (__m128i) __builtin_ia32_gathersiv2di (__src,
1467 __base,
1468 (__v4si)__index,
1469 __mask,
1470 __scale);
1471}
1472
1473extern __inline __m128i
1474__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1475_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
1476 __m128i __index, __m128i __mask, const int __scale)
1477{
1478 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
1479 __base,
1480 (__v4si)__index,
1481 (__v2di)__mask,
1482 __scale);
1483}
1484
1485extern __inline __m256i
1486__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1487_mm256_i32gather_epi64 (long long int const *__base,
1488 __m128i __index, const int __scale)
1489{
1490 __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1491 __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1492
1493 return (__m256i) __builtin_ia32_gathersiv4di (__src,
1494 __base,
1495 (__v4si)__index,
1496 __mask,
1497 __scale);
1498}
1499
1500extern __inline __m256i
1501__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1502_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
1503 __m128i __index, __m256i __mask,
1504 const int __scale)
1505{
1506 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
1507 __base,
1508 (__v4si)__index,
1509 (__v4di)__mask,
1510 __scale);
1511}
1512
1513extern __inline __m128i
1514__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1515_mm_i64gather_epi64 (long long int const *__base,
1516 __m128i __index, const int __scale)
1517{
1518 __v2di __src = __extension__ (__v2di){ 0, 0 };
1519 __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1520
1521 return (__m128i) __builtin_ia32_gatherdiv2di (__src,
1522 __base,
1523 (__v2di)__index,
1524 __mask,
1525 __scale);
1526}
1527
1528extern __inline __m128i
1529__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1530_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
1531 __m128i __index, __m128i __mask, const int __scale)
1532{
1533 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
1534 __base,
1535 (__v2di)__index,
1536 (__v2di)__mask,
1537 __scale);
1538}
1539
1540extern __inline __m256i
1541__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1542_mm256_i64gather_epi64 (long long int const *__base,
1543 __m256i __index, const int __scale)
1544{
1545 __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1546 __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1547
1548 return (__m256i) __builtin_ia32_gatherdiv4di (__src,
1549 __base,
1550 (__v4di)__index,
1551 __mask,
1552 __scale);
1553}
1554
1555extern __inline __m256i
1556__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1557_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
1558 __m256i __index, __m256i __mask,
1559 const int __scale)
1560{
1561 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
1562 __base,
1563 (__v4di)__index,
1564 (__v4di)__mask,
1565 __scale);
1566}
1567
1568extern __inline __m128i
1569__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1570_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
1571{
1572 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1573 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1574
1575 return (__m128i) __builtin_ia32_gathersiv4si (__src,
1576 __base,
1577 (__v4si)__index,
1578 __mask,
1579 __scale);
1580}
1581
1582extern __inline __m128i
1583__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1584_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
1585 __m128i __mask, const int __scale)
1586{
1587 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
1588 __base,
1589 (__v4si)__index,
1590 (__v4si)__mask,
1591 __scale);
1592}
1593
1594extern __inline __m256i
1595__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1596_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
1597{
1598 __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1599 __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1600
1601 return (__m256i) __builtin_ia32_gathersiv8si (__src,
1602 __base,
1603 (__v8si)__index,
1604 __mask,
1605 __scale);
1606}
1607
1608extern __inline __m256i
1609__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1610_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
1611 __m256i __index, __m256i __mask,
1612 const int __scale)
1613{
1614 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
1615 __base,
1616 (__v8si)__index,
1617 (__v8si)__mask,
1618 __scale);
1619}
1620
1621extern __inline __m128i
1622__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1623_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
1624{
1625 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1626 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1627
1628 return (__m128i) __builtin_ia32_gatherdiv4si (__src,
1629 __base,
1630 (__v2di)__index,
1631 __mask,
1632 __scale);
1633}
1634
1635extern __inline __m128i
1636__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1637_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
1638 __m128i __mask, const int __scale)
1639{
1640 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
1641 __base,
1642 (__v2di)__index,
1643 (__v4si)__mask,
1644 __scale);
1645}
1646
1647extern __inline __m128i
1648__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1649_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
1650{
1651 __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1652 __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1653
1654 return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
1655 __base,
1656 (__v4di)__index,
1657 __mask,
1658 __scale);
1659}
1660
1661extern __inline __m128i
1662__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1663_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
1664 __m256i __index, __m128i __mask,
1665 const int __scale)
1666{
1667 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
1668 __base,
1669 (__v4di)__index,
1670 (__v4si)__mask,
1671 __scale);
1672}
1673#else /* __OPTIMIZE__ */
/* Non-optimizing form of _mm_i32gather_pd: zero pass-through source
   and an all-ones mask (0.0 == 0.0), so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand, e.g. when followed by a subscript.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
                                          (double const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v2df) \
                                          _mm_cmpeq_pd (_mm_setzero_pd (), \
                                                        _mm_setzero_pd ()), \
                                          (int) (SCALE)))
1682
/* Non-optimizing form of _mm_mask_i32gather_pd.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
                                          (double const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v2df)(__m128d) (MASK), \
                                          (int) (SCALE)))
1689
/* Non-optimizing form of _mm256_i32gather_pd: zero pass-through
   source and an all-ones mask (0.0 == 0.0), so every element is
   loaded.  The expansion is wrapped in parentheses so it always acts
   as a single operand.  */
#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
                                          (double const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4df) \
                                          _mm256_cmp_pd (_mm256_setzero_pd (), \
                                                         _mm256_setzero_pd (), \
                                                         _CMP_EQ_OQ), \
                                          (int) (SCALE)))
1699
/* Non-optimizing form of _mm256_mask_i32gather_pd.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
                                          (double const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4df)(__m256d) (MASK), \
                                          (int) (SCALE)))
1706
/* Non-optimizing form of _mm_i64gather_pd: zero pass-through source
   and an all-ones mask (0.0 == 0.0), so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
                                          (double const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v2df) \
                                          _mm_cmpeq_pd (_mm_setzero_pd (), \
                                                        _mm_setzero_pd ()), \
                                          (int) (SCALE)))
1715
/* Non-optimizing form of _mm_mask_i64gather_pd.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
                                          (double const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v2df)(__m128d) (MASK), \
                                          (int) (SCALE)))
1722
/* Non-optimizing form of _mm256_i64gather_pd: zero pass-through
   source and an all-ones mask (0.0 == 0.0), so every element is
   loaded.  The expansion is wrapped in parentheses so it always acts
   as a single operand.  */
#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
                                          (double const *) (BASE), \
                                          (__v4di)(__m256i) (INDEX), \
                                          (__v4df) \
                                          _mm256_cmp_pd (_mm256_setzero_pd (), \
                                                         _mm256_setzero_pd (), \
                                                         _CMP_EQ_OQ), \
                                          (int) (SCALE)))
1732
/* Non-optimizing form of _mm256_mask_i64gather_pd.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
                                          (double const *) (BASE), \
                                          (__v4di)(__m256i) (INDEX), \
                                          (__v4df)(__m256d) (MASK), \
                                          (int) (SCALE)))
1739
/* Non-optimizing form of _mm_i32gather_ps: zero pass-through source
   and an all-ones mask (0.0f == 0.0f), so every element is loaded.
   The expansion is wrapped in parentheses so it always acts as a
   single operand.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
                                         (float const *) (BASE), \
                                         (__v4si)(__m128i) (INDEX), \
                                         (__v4sf) \
                                         _mm_cmpeq_ps (_mm_setzero_ps (), \
                                                       _mm_setzero_ps ()), \
                                         (int) (SCALE)))
1748
/* Non-optimizing form of _mm_mask_i32gather_ps.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
                                         (float const *) (BASE), \
                                         (__v4si)(__m128i) (INDEX), \
                                         (__v4sf)(__m128) (MASK), \
                                         (int) (SCALE)))
1755
/* Non-optimizing form of _mm256_i32gather_ps: zero pass-through
   source and an all-ones mask (0.0f == 0.0f), so every element is
   loaded.  The expansion is wrapped in parentheses so it always acts
   as a single operand.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
                                         (float const *) (BASE), \
                                         (__v8si)(__m256i) (INDEX), \
                                         (__v8sf) \
                                         _mm256_cmp_ps (_mm256_setzero_ps (), \
                                                        _mm256_setzero_ps (), \
                                                        _CMP_EQ_OQ), \
                                         (int) (SCALE)))
1765
/* Non-optimizing form of _mm256_mask_i32gather_ps.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
                                         (float const *) (BASE), \
                                         (__v8si)(__m256i) (INDEX), \
                                         (__v8sf)(__m256) (MASK), \
                                         (int) (SCALE)))
1772
/* Non-optimizing form of _mm_i64gather_ps: zero pass-through source
   and an all-ones mask (0.0f == 0.0f).  The source is built with
   _mm_setzero_ps so that it has the float vector type this gather
   works on.  The expansion is wrapped in parentheses so it always
   acts as a single operand.  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
                                         (float const *) (BASE), \
                                         (__v2di)(__m128i) (INDEX), \
                                         (__v4sf) \
                                         _mm_cmpeq_ps (_mm_setzero_ps (), \
                                                       _mm_setzero_ps ()), \
                                         (int) (SCALE)))
1781
/* Non-optimizing form of _mm_mask_i64gather_ps.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
                                         (float const *) (BASE), \
                                         (__v2di)(__m128i) (INDEX), \
                                         (__v4sf)(__m128) (MASK), \
                                         (int) (SCALE)))
1788
/* Non-optimizing form of _mm256_i64gather_ps: zero pass-through
   source and an all-ones mask (0.0f == 0.0f), so every element is
   loaded.  The expansion is wrapped in parentheses so it always acts
   as a single operand.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
                                            (float const *) (BASE), \
                                            (__v4di)(__m256i) (INDEX), \
                                            (__v4sf) \
                                            _mm_cmpeq_ps (_mm_setzero_ps (), \
                                                          _mm_setzero_ps ()), \
                                            (int) (SCALE)))
1797
/* Non-optimizing form of _mm256_mask_i64gather_ps.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
                                            (float const *) (BASE), \
                                            (__v4di)(__m256i) (INDEX), \
                                            (__v4sf)(__m128) (MASK), \
                                            (int) (SCALE)))
1804
/* Non-optimizing form of _mm_i32gather_epi64: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
                                          (long long const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v2di)_mm_set1_epi64x (-1), \
                                          (int) (SCALE)))
1811
/* Non-optimizing form of _mm_mask_i32gather_epi64.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
                                          (long long const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v2di)(__m128i) (MASK), \
                                          (int) (SCALE)))
1818
/* Non-optimizing form of _mm256_i32gather_epi64: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
                                          (long long const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4di)_mm256_set1_epi64x (-1), \
                                          (int) (SCALE)))
1825
/* Non-optimizing form of _mm256_mask_i32gather_epi64.  Each argument
   is cast to the type the builtin expects; the expansion is wrapped
   in parentheses so it always acts as a single operand.  */
#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
                                          (long long const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4di)(__m256i) (MASK), \
                                          (int) (SCALE)))
1832
/* Non-optimizing form of _mm_i64gather_epi64: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
                                          (long long const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v2di)_mm_set1_epi64x (-1), \
                                          (int) (SCALE)))
1839
/* Non-optimizing form of _mm_mask_i64gather_epi64.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
                                          (long long const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v2di)(__m128i) (MASK), \
                                          (int) (SCALE)))
1846
/* Non-optimizing form of _mm256_i64gather_epi64: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
                                          (long long const *) (BASE), \
                                          (__v4di)(__m256i) (INDEX), \
                                          (__v4di)_mm256_set1_epi64x (-1), \
                                          (int) (SCALE)))
1853
/* Non-optimizing form of _mm256_mask_i64gather_epi64.  Each argument
   is cast to the type the builtin expects; the expansion is wrapped
   in parentheses so it always acts as a single operand.  */
#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
                                          (long long const *) (BASE), \
                                          (__v4di)(__m256i) (INDEX), \
                                          (__v4di)(__m256i) (MASK), \
                                          (int) (SCALE)))
1860
/* Non-optimizing form of _mm_i32gather_epi32: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
                                          (int const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4si)_mm_set1_epi32 (-1), \
                                          (int) (SCALE)))
1867
/* Non-optimizing form of _mm_mask_i32gather_epi32.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
                                          (int const *) (BASE), \
                                          (__v4si)(__m128i) (INDEX), \
                                          (__v4si)(__m128i) (MASK), \
                                          (int) (SCALE)))
1874
/* Non-optimizing form of _mm256_i32gather_epi32: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
                                          (int const *) (BASE), \
                                          (__v8si)(__m256i) (INDEX), \
                                          (__v8si)_mm256_set1_epi32 (-1), \
                                          (int) (SCALE)))
1881
/* Non-optimizing form of _mm256_mask_i32gather_epi32.  Each argument
   is cast to the type the builtin expects; the expansion is wrapped
   in parentheses so it always acts as a single operand.  */
#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
                                          (int const *) (BASE), \
                                          (__v8si)(__m256i) (INDEX), \
                                          (__v8si)(__m256i) (MASK), \
                                          (int) (SCALE)))
1888
/* Non-optimizing form of _mm_i64gather_epi32: zero pass-through
   source and an all-ones mask.  The expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
                                          (int const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v4si)_mm_set1_epi32 (-1), \
                                          (int) (SCALE)))
1895
/* Non-optimizing form of _mm_mask_i64gather_epi32.  Each argument is
   cast to the type the builtin expects; the expansion is wrapped in
   parentheses so it always acts as a single operand.  */
#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
                                          (int const *) (BASE), \
                                          (__v2di)(__m128i) (INDEX), \
                                          (__v4si)(__m128i) (MASK), \
                                          (int) (SCALE)))
1902
/* Non-optimizing form of _mm256_i64gather_epi32: zero pass-through
   source and an all-ones mask, so every element is loaded.  The
   expansion is wrapped in parentheses so it always acts as a single
   operand.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
                                             (int const *) (BASE), \
                                             (__v4di)(__m256i) (INDEX), \
                                             (__v4si)_mm_set1_epi32(-1), \
                                             (int) (SCALE)))
1909
/* Non-optimizing form of _mm256_mask_i64gather_epi32.  Each argument
   is cast to the type the builtin expects; the expansion is wrapped
   in parentheses so it always acts as a single operand.  */
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
                                             (int const *) (BASE), \
                                             (__v4di)(__m256i) (INDEX), \
                                             (__v4si)(__m128i) (MASK), \
                                             (int) (SCALE)))
1916#endif /* __OPTIMIZE__ */
1917
1918#ifdef __DISABLE_AVX2__
1919#undef __DISABLE_AVX2__
1920#pragma GCC pop_options
1921#endif /* __DISABLE_AVX2__ */
1922
1923#endif /* _AVX2INTRIN_H_INCLUDED */
Note: See TracBrowser for help on using the repository browser.