avxintrin.h source code [clang_source_code/lib/Headers/avxintrin.h]

1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2	*
3	* Permission is hereby granted, free of charge, to any person obtaining a copy
4	* of this software and associated documentation files (the "Software"), to deal
5	* in the Software without restriction, including without limitation the rights
6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	* copies of the Software, and to permit persons to whom the Software is
8	* furnished to do so, subject to the following conditions:
9	*
10	* The above copyright notice and this permission notice shall be included in
11	* all copies or substantial portions of the Software.
12	*
13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	* THE SOFTWARE.
20	*
21	*===-----------------------------------------------------------------------===
22	*/
23
24	#ifndef __IMMINTRIN_H
25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26	#endif
27
28	#ifndef __AVXINTRIN_H
29	#define __AVXINTRIN_H
30
/* Internal 256-bit element-typed vectors (signed / floating-point lanes). */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Internal 256-bit vectors with unsigned integer lanes. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte vector is provided as well. It is an implementation detail and is not
 * meant to show up in the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float     __m256  __attribute__((__vector_size__(32), __aligned__(32)));
typedef double    __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Variants of the public types with the alignment requirement lowered to one
 * byte. */
typedef float     __m256_u  __attribute__((__vector_size__(32), __aligned__(1)));
typedef double    __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
55
/* Attribute sets applied to the functions defined in this file. Both request
 * forced inlining, no debug info, and the "avx" target feature; they differ
 * only in the minimum vector width they declare (256 bits for
 * __DEFAULT_FN_ATTRS, 128 bits for __DEFAULT_FN_ATTRS128). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
                 __min_vector_width__(128)))
59
60	/* Arithmetic */
61	/// Adds two 256-bit vectors of [4 x double].
62	///
63	/// \headerfile <x86intrin.h>
64	///
65	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
66	///
67	/// \param __a
68	/// A 256-bit vector of [4 x double] containing one of the source operands.
69	/// \param __b
70	/// A 256-bit vector of [4 x double] containing one of the source operands.
71	/// \returns A 256-bit vector of [4 x double] containing the sums of both
72	/// operands.
73	static __inline __m256d __DEFAULT_FN_ATTRS
74	_mm256_add_pd(__m256d __a, __m256d __b)
75	{
76	return (__m256d)((__v4df)__a+(__v4df)__b);
77	}
78
79	/// Adds two 256-bit vectors of [8 x float].
80	///
81	/// \headerfile <x86intrin.h>
82	///
83	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
84	///
85	/// \param __a
86	/// A 256-bit vector of [8 x float] containing one of the source operands.
87	/// \param __b
88	/// A 256-bit vector of [8 x float] containing one of the source operands.
89	/// \returns A 256-bit vector of [8 x float] containing the sums of both
90	/// operands.
91	static __inline __m256 __DEFAULT_FN_ATTRS
92	_mm256_add_ps(__m256 __a, __m256 __b)
93	{
94	return (__m256)((__v8sf)__a+(__v8sf)__b);
95	}
96
97	/// Subtracts two 256-bit vectors of [4 x double].
98	///
99	/// \headerfile <x86intrin.h>
100	///
101	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
102	///
103	/// \param __a
104	/// A 256-bit vector of [4 x double] containing the minuend.
105	/// \param __b
106	/// A 256-bit vector of [4 x double] containing the subtrahend.
107	/// \returns A 256-bit vector of [4 x double] containing the differences between
108	/// both operands.
109	static __inline __m256d __DEFAULT_FN_ATTRS
110	_mm256_sub_pd(__m256d __a, __m256d __b)
111	{
112	return (__m256d)((__v4df)__a-(__v4df)__b);
113	}
114
115	/// Subtracts two 256-bit vectors of [8 x float].
116	///
117	/// \headerfile <x86intrin.h>
118	///
119	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
120	///
121	/// \param __a
122	/// A 256-bit vector of [8 x float] containing the minuend.
123	/// \param __b
124	/// A 256-bit vector of [8 x float] containing the subtrahend.
125	/// \returns A 256-bit vector of [8 x float] containing the differences between
126	/// both operands.
127	static __inline __m256 __DEFAULT_FN_ATTRS
128	_mm256_sub_ps(__m256 __a, __m256 __b)
129	{
130	return (__m256)((__v8sf)__a-(__v8sf)__b);
131	}
132
133	/// Adds the even-indexed values and subtracts the odd-indexed values of
134	/// two 256-bit vectors of [4 x double].
135	///
136	/// \headerfile <x86intrin.h>
137	///
138	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
139	///
140	/// \param __a
141	/// A 256-bit vector of [4 x double] containing the left source operand.
142	/// \param __b
143	/// A 256-bit vector of [4 x double] containing the right source operand.
144	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
145	/// and differences between both operands.
146	static __inline __m256d __DEFAULT_FN_ATTRS
147	_mm256_addsub_pd(__m256d __a, __m256d __b)
148	{
149	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
150	}
151
152	/// Adds the even-indexed values and subtracts the odd-indexed values of
153	/// two 256-bit vectors of [8 x float].
154	///
155	/// \headerfile <x86intrin.h>
156	///
157	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
158	///
159	/// \param __a
160	/// A 256-bit vector of [8 x float] containing the left source operand.
161	/// \param __b
162	/// A 256-bit vector of [8 x float] containing the right source operand.
163	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
164	/// differences between both operands.
165	static __inline __m256 __DEFAULT_FN_ATTRS
166	_mm256_addsub_ps(__m256 __a, __m256 __b)
167	{
168	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
169	}
170
171	/// Divides two 256-bit vectors of [4 x double].
172	///
173	/// \headerfile <x86intrin.h>
174	///
175	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
176	///
177	/// \param __a
178	/// A 256-bit vector of [4 x double] containing the dividend.
179	/// \param __b
180	/// A 256-bit vector of [4 x double] containing the divisor.
181	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
182	/// operands.
183	static __inline __m256d __DEFAULT_FN_ATTRS
184	_mm256_div_pd(__m256d __a, __m256d __b)
185	{
186	return (__m256d)((__v4df)__a/(__v4df)__b);
187	}
188
189	/// Divides two 256-bit vectors of [8 x float].
190	///
191	/// \headerfile <x86intrin.h>
192	///
193	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
194	///
195	/// \param __a
196	/// A 256-bit vector of [8 x float] containing the dividend.
197	/// \param __b
198	/// A 256-bit vector of [8 x float] containing the divisor.
199	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
200	/// operands.
201	static __inline __m256 __DEFAULT_FN_ATTRS
202	_mm256_div_ps(__m256 __a, __m256 __b)
203	{
204	return (__m256)((__v8sf)__a/(__v8sf)__b);
205	}
206
207	/// Compares two 256-bit vectors of [4 x double] and returns the greater
208	/// of each pair of values.
209	///
210	/// \headerfile <x86intrin.h>
211	///
212	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
213	///
214	/// \param __a
215	/// A 256-bit vector of [4 x double] containing one of the operands.
216	/// \param __b
217	/// A 256-bit vector of [4 x double] containing one of the operands.
218	/// \returns A 256-bit vector of [4 x double] containing the maximum values
219	/// between both operands.
220	static __inline __m256d __DEFAULT_FN_ATTRS
221	_mm256_max_pd(__m256d __a, __m256d __b)
222	{
223	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
224	}
225
226	/// Compares two 256-bit vectors of [8 x float] and returns the greater
227	/// of each pair of values.
228	///
229	/// \headerfile <x86intrin.h>
230	///
231	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
232	///
233	/// \param __a
234	/// A 256-bit vector of [8 x float] containing one of the operands.
235	/// \param __b
236	/// A 256-bit vector of [8 x float] containing one of the operands.
237	/// \returns A 256-bit vector of [8 x float] containing the maximum values
238	/// between both operands.
239	static __inline __m256 __DEFAULT_FN_ATTRS
240	_mm256_max_ps(__m256 __a, __m256 __b)
241	{
242	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
243	}
244
245	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
246	/// of each pair of values.
247	///
248	/// \headerfile <x86intrin.h>
249	///
250	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
251	///
252	/// \param __a
253	/// A 256-bit vector of [4 x double] containing one of the operands.
254	/// \param __b
255	/// A 256-bit vector of [4 x double] containing one of the operands.
256	/// \returns A 256-bit vector of [4 x double] containing the minimum values
257	/// between both operands.
258	static __inline __m256d __DEFAULT_FN_ATTRS
259	_mm256_min_pd(__m256d __a, __m256d __b)
260	{
261	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
262	}
263
264	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
265	/// of each pair of values.
266	///
267	/// \headerfile <x86intrin.h>
268	///
269	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
270	///
271	/// \param __a
272	/// A 256-bit vector of [8 x float] containing one of the operands.
273	/// \param __b
274	/// A 256-bit vector of [8 x float] containing one of the operands.
275	/// \returns A 256-bit vector of [8 x float] containing the minimum values
276	/// between both operands.
277	static __inline __m256 __DEFAULT_FN_ATTRS
278	_mm256_min_ps(__m256 __a, __m256 __b)
279	{
280	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
281	}
282
283	/// Multiplies two 256-bit vectors of [4 x double].
284	///
285	/// \headerfile <x86intrin.h>
286	///
287	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
288	///
289	/// \param __a
290	/// A 256-bit vector of [4 x double] containing one of the operands.
291	/// \param __b
292	/// A 256-bit vector of [4 x double] containing one of the operands.
293	/// \returns A 256-bit vector of [4 x double] containing the products of both
294	/// operands.
295	static __inline __m256d __DEFAULT_FN_ATTRS
296	_mm256_mul_pd(__m256d __a, __m256d __b)
297	{
298	return (__m256d)((__v4df)__a * (__v4df)__b);
299	}
300
301	/// Multiplies two 256-bit vectors of [8 x float].
302	///
303	/// \headerfile <x86intrin.h>
304	///
305	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
306	///
307	/// \param __a
308	/// A 256-bit vector of [8 x float] containing one of the operands.
309	/// \param __b
310	/// A 256-bit vector of [8 x float] containing one of the operands.
311	/// \returns A 256-bit vector of [8 x float] containing the products of both
312	/// operands.
313	static __inline __m256 __DEFAULT_FN_ATTRS
314	_mm256_mul_ps(__m256 __a, __m256 __b)
315	{
316	return (__m256)((__v8sf)__a * (__v8sf)__b);
317	}
318
319	/// Calculates the square roots of the values in a 256-bit vector of
320	/// [4 x double].
321	///
322	/// \headerfile <x86intrin.h>
323	///
324	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
325	///
326	/// \param __a
327	/// A 256-bit vector of [4 x double].
328	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
329	/// values in the operand.
330	static __inline __m256d __DEFAULT_FN_ATTRS
331	_mm256_sqrt_pd(__m256d __a)
332	{
333	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
334	}
335
336	/// Calculates the square roots of the values in a 256-bit vector of
337	/// [8 x float].
338	///
339	/// \headerfile <x86intrin.h>
340	///
341	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
342	///
343	/// \param __a
344	/// A 256-bit vector of [8 x float].
345	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
346	/// values in the operand.
347	static __inline __m256 __DEFAULT_FN_ATTRS
348	_mm256_sqrt_ps(__m256 __a)
349	{
350	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
351	}
352
353	/// Calculates the reciprocal square roots of the values in a 256-bit
354	/// vector of [8 x float].
355	///
356	/// \headerfile <x86intrin.h>
357	///
358	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
359	///
360	/// \param __a
361	/// A 256-bit vector of [8 x float].
362	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
363	/// roots of the values in the operand.
364	static __inline __m256 __DEFAULT_FN_ATTRS
365	_mm256_rsqrt_ps(__m256 __a)
366	{
367	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
368	}
369
370	/// Calculates the reciprocals of the values in a 256-bit vector of
371	/// [8 x float].
372	///
373	/// \headerfile <x86intrin.h>
374	///
375	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
376	///
377	/// \param __a
378	/// A 256-bit vector of [8 x float].
379	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
380	/// values in the operand.
381	static __inline __m256 __DEFAULT_FN_ATTRS
382	_mm256_rcp_ps(__m256 __a)
383	{
384	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
385	}
386
/// Rounds each element of a 256-bit vector of [4 x double] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/// \note The expansion is wrapped in parentheses so that the result cast
///    applies to the whole builtin call even when the macro is followed by a
///    postfix operator such as a subscript.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
418
/// Rounds each element of a 256-bit vector of [8 x float] to an integer
///    value, using the rounding behavior selected by the byte operand. The
///    results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/// \note The expansion is wrapped in parentheses so that the result cast
///    applies to the whole builtin call even when the macro is followed by a
///    postfix operator such as a subscript.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
450
/// Rounds up each element of a 256-bit vector of [4 x double] to an integer
///    value. The results are returned as 64-bit double-precision
///    floating-point values. Implemented as _mm256_round_pd with the
///    _MM_FROUND_CEIL rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/// \note The expansion is parenthesized so that it is a single expression
///    regardless of the operators that follow the macro invocation.
#define _mm256_ceil_pd(V)  (_mm256_round_pd((V), _MM_FROUND_CEIL))
467
/// Rounds down each element of a 256-bit vector of [4 x double] to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values. Implemented as _mm256_round_pd with the
///    _MM_FROUND_FLOOR rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/// \note The expansion is parenthesized so that it is a single expression
///    regardless of the operators that follow the macro invocation.
#define _mm256_floor_pd(V) (_mm256_round_pd((V), _MM_FROUND_FLOOR))
485
/// Rounds up each element of a 256-bit vector of [8 x float] to an integer
///    value. The results are returned as floating-point values. Implemented
///    as _mm256_round_ps with the _MM_FROUND_CEIL rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/// \note The expansion is parenthesized so that it is a single expression
///    regardless of the operators that follow the macro invocation.
#define _mm256_ceil_ps(V)  (_mm256_round_ps((V), _MM_FROUND_CEIL))
502
/// Rounds down each element of a 256-bit vector of [8 x float] to an integer
///    value. The results are returned as floating-point values. Implemented
///    as _mm256_round_ps with the _MM_FROUND_FLOOR rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down
///    values.
/// \note The expansion is parenthesized so that it is a single expression
///    regardless of the operators that follow the macro invocation.
#define _mm256_floor_ps(V) (_mm256_round_ps((V), _MM_FROUND_FLOOR))
519
520	/* Logical */
521	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
522	///
523	/// \headerfile <x86intrin.h>
524	///
525	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
526	///
527	/// \param __a
528	/// A 256-bit vector of [4 x double] containing one of the source operands.
529	/// \param __b
530	/// A 256-bit vector of [4 x double] containing one of the source operands.
531	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
532	/// values between both operands.
533	static __inline __m256d __DEFAULT_FN_ATTRS
534	_mm256_and_pd(__m256d __a, __m256d __b)
535	{
536	return (__m256d)((__v4du)__a & (__v4du)__b);
537	}
538
539	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
540	///
541	/// \headerfile <x86intrin.h>
542	///
543	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
544	///
545	/// \param __a
546	/// A 256-bit vector of [8 x float] containing one of the source operands.
547	/// \param __b
548	/// A 256-bit vector of [8 x float] containing one of the source operands.
549	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
550	/// values between both operands.
551	static __inline __m256 __DEFAULT_FN_ATTRS
552	_mm256_and_ps(__m256 __a, __m256 __b)
553	{
554	return (__m256)((__v8su)__a & (__v8su)__b);
555	}
556
557	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
558	/// the one's complement of the values contained in the first source operand.
559	///
560	/// \headerfile <x86intrin.h>
561	///
562	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
563	///
564	/// \param __a
565	/// A 256-bit vector of [4 x double] containing the left source operand. The
566	/// one's complement of this value is used in the bitwise AND.
567	/// \param __b
568	/// A 256-bit vector of [4 x double] containing the right source operand.
569	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
570	/// values of the second operand and the one's complement of the first
571	/// operand.
572	static __inline __m256d __DEFAULT_FN_ATTRS
573	_mm256_andnot_pd(__m256d __a, __m256d __b)
574	{
575	return (__m256d)(~(__v4du)__a & (__v4du)__b);
576	}
577
578	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
579	/// the one's complement of the values contained in the first source operand.
580	///
581	/// \headerfile <x86intrin.h>
582	///
583	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
584	///
585	/// \param __a
586	/// A 256-bit vector of [8 x float] containing the left source operand. The
587	/// one's complement of this value is used in the bitwise AND.
588	/// \param __b
589	/// A 256-bit vector of [8 x float] containing the right source operand.
590	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
591	/// values of the second operand and the one's complement of the first
592	/// operand.
593	static __inline __m256 __DEFAULT_FN_ATTRS
594	_mm256_andnot_ps(__m256 __a, __m256 __b)
595	{
596	return (__m256)(~(__v8su)__a & (__v8su)__b);
597	}
598
599	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
600	///
601	/// \headerfile <x86intrin.h>
602	///
603	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
604	///
605	/// \param __a
606	/// A 256-bit vector of [4 x double] containing one of the source operands.
607	/// \param __b
608	/// A 256-bit vector of [4 x double] containing one of the source operands.
609	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
610	/// values between both operands.
611	static __inline __m256d __DEFAULT_FN_ATTRS
612	_mm256_or_pd(__m256d __a, __m256d __b)
613	{
614	return (__m256d)((__v4du)__a \| (__v4du)__b);
615	}
616
617	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
618	///
619	/// \headerfile <x86intrin.h>
620	///
621	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
622	///
623	/// \param __a
624	/// A 256-bit vector of [8 x float] containing one of the source operands.
625	/// \param __b
626	/// A 256-bit vector of [8 x float] containing one of the source operands.
627	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
628	/// values between both operands.
629	static __inline __m256 __DEFAULT_FN_ATTRS
630	_mm256_or_ps(__m256 __a, __m256 __b)
631	{
632	return (__m256)((__v8su)__a \| (__v8su)__b);
633	}
634
635	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
636	///
637	/// \headerfile <x86intrin.h>
638	///
639	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
640	///
641	/// \param __a
642	/// A 256-bit vector of [4 x double] containing one of the source operands.
643	/// \param __b
644	/// A 256-bit vector of [4 x double] containing one of the source operands.
645	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
646	/// values between both operands.
647	static __inline __m256d __DEFAULT_FN_ATTRS
648	_mm256_xor_pd(__m256d __a, __m256d __b)
649	{
650	return (__m256d)((__v4du)__a ^ (__v4du)__b);
651	}
652
653	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
654	///
655	/// \headerfile <x86intrin.h>
656	///
657	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
658	///
659	/// \param __a
660	/// A 256-bit vector of [8 x float] containing one of the source operands.
661	/// \param __b
662	/// A 256-bit vector of [8 x float] containing one of the source operands.
663	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
664	/// values between both operands.
665	static __inline __m256 __DEFAULT_FN_ATTRS
666	_mm256_xor_ps(__m256 __a, __m256 __b)
667	{
668	return (__m256)((__v8su)__a ^ (__v8su)__b);
669	}
670
671	/* Horizontal arithmetic */
672	/// Horizontally adds the adjacent pairs of values contained in two
673	/// 256-bit vectors of [4 x double].
674	///
675	/// \headerfile <x86intrin.h>
676	///
677	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
678	///
679	/// \param __a
680	/// A 256-bit vector of [4 x double] containing one of the source operands.
681	/// The horizontal sums of the values are returned in the even-indexed
682	/// elements of a vector of [4 x double].
683	/// \param __b
684	/// A 256-bit vector of [4 x double] containing one of the source operands.
685	/// The horizontal sums of the values are returned in the odd-indexed
686	/// elements of a vector of [4 x double].
687	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
688	/// both operands.
689	static __inline __m256d __DEFAULT_FN_ATTRS
690	_mm256_hadd_pd(__m256d __a, __m256d __b)
691	{
692	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
693	}
694
695	/// Horizontally adds the adjacent pairs of values contained in two
696	/// 256-bit vectors of [8 x float].
697	///
698	/// \headerfile <x86intrin.h>
699	///
700	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
701	///
702	/// \param __a
703	/// A 256-bit vector of [8 x float] containing one of the source operands.
704	/// The horizontal sums of the values are returned in the elements with
705	/// index 0, 1, 4, 5 of a vector of [8 x float].
706	/// \param __b
707	/// A 256-bit vector of [8 x float] containing one of the source operands.
708	/// The horizontal sums of the values are returned in the elements with
709	/// index 2, 3, 6, 7 of a vector of [8 x float].
710	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
711	/// both operands.
712	static __inline __m256 __DEFAULT_FN_ATTRS
713	_mm256_hadd_ps(__m256 __a, __m256 __b)
714	{
715	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
716	}
717
718	/// Horizontally subtracts the adjacent pairs of values contained in two
719	/// 256-bit vectors of [4 x double].
720	///
721	/// \headerfile <x86intrin.h>
722	///
723	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
724	///
725	/// \param __a
726	/// A 256-bit vector of [4 x double] containing one of the source operands.
727	/// The horizontal differences between the values are returned in the
728	/// even-indexed elements of a vector of [4 x double].
729	/// \param __b
730	/// A 256-bit vector of [4 x double] containing one of the source operands.
731	/// The horizontal differences between the values are returned in the
732	/// odd-indexed elements of a vector of [4 x double].
733	/// \returns A 256-bit vector of [4 x double] containing the horizontal
734	/// differences of both operands.
735	static __inline __m256d __DEFAULT_FN_ATTRS
736	_mm256_hsub_pd(__m256d __a, __m256d __b)
737	{
738	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
739	}
740
741	/// Horizontally subtracts the adjacent pairs of values contained in two
742	/// 256-bit vectors of [8 x float].
743	///
744	/// \headerfile <x86intrin.h>
745	///
746	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
747	///
748	/// \param __a
749	/// A 256-bit vector of [8 x float] containing one of the source operands.
750	/// The horizontal differences between the values are returned in the
751	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
752	/// \param __b
753	/// A 256-bit vector of [8 x float] containing one of the source operands.
754	/// The horizontal differences between the values are returned in the
755	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
756	/// \returns A 256-bit vector of [8 x float] containing the horizontal
757	/// differences of both operands.
758	static __inline __m256 __DEFAULT_FN_ATTRS
759	_mm256_hsub_ps(__m256 __a, __m256 __b)
760	{
761	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
762	}
763
764	/* Vector permutations */
765	/// Copies the values in a 128-bit vector of [2 x double] as specified
766	/// by the 128-bit integer vector operand.
767	///
768	/// \headerfile <x86intrin.h>
769	///
770	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
771	///
772	/// \param __a
773	/// A 128-bit vector of [2 x double].
774	/// \param __c
775	/// A 128-bit integer vector operand specifying how the values are to be
776	/// copied. \n
777	/// Bit [1]: \n
778	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
779	/// vector. \n
780	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
781	/// returned vector. \n
782	/// Bit [65]: \n
783	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
784	/// returned vector. \n
785	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
786	/// returned vector.
787	/// \returns A 128-bit vector of [2 x double] containing the copied values.
788	static __inline __m128d __DEFAULT_FN_ATTRS128
789	_mm_permutevar_pd(__m128d __a, __m128i __c)
790	{
791	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
792	}
793
794	/// Copies the values in a 256-bit vector of [4 x double] as specified
795	/// by the 256-bit integer vector operand.
796	///
797	/// \headerfile <x86intrin.h>
798	///
799	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
800	///
801	/// \param __a
802	/// A 256-bit vector of [4 x double].
803	/// \param __c
804	/// A 256-bit integer vector operand specifying how the values are to be
805	/// copied. \n
806	/// Bit [1]: \n
807	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
808	/// vector. \n
809	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
810	/// returned vector. \n
811	/// Bit [65]: \n
812	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
813	/// returned vector. \n
814	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
815	/// returned vector. \n
816	/// Bit [129]: \n
817	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
818	/// returned vector. \n
819	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
820	/// returned vector. \n
821	/// Bit [193]: \n
822	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
823	/// returned vector. \n
824	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
825	/// returned vector.
826	/// \returns A 256-bit vector of [4 x double] containing the copied values.
827	static __inline __m256d __DEFAULT_FN_ATTRS
828	_mm256_permutevar_pd(__m256d __a, __m256i __c)
829	{ /* Operands are cast to __v4df / __v4di for the builtin; the result is cast back to __m256d. */
830	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
831	}
832
833	/// Copies the values stored in a 128-bit vector of [4 x float] as
834	/// specified by the 128-bit integer vector operand.
835	/// \headerfile <x86intrin.h>
836	///
837	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
838	///
839	/// \param __a
840	/// A 128-bit vector of [4 x float].
841	/// \param __c
842	/// A 128-bit integer vector operand specifying how the values are to be
843	/// copied. \n
844	/// Bits [1:0]: \n
845	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
846	/// returned vector. \n
847	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
848	/// returned vector. \n
849	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
850	/// returned vector. \n
851	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
852	/// returned vector. \n
853	/// Bits [33:32]: \n
854	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
855	/// returned vector. \n
856	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
857	/// returned vector. \n
858	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
859	/// returned vector. \n
860	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
861	/// returned vector. \n
862	/// Bits [65:64]: \n
863	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
864	/// returned vector. \n
865	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
866	/// returned vector. \n
867	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
868	/// returned vector. \n
869	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
870	/// returned vector. \n
871	/// Bits [97:96]: \n
872	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
873	/// returned vector. \n
874	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
875	/// returned vector. \n
876	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
877	/// returned vector. \n
878	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
879	/// returned vector.
880	/// \returns A 128-bit vector of [4 x float] containing the copied values.
881	static __inline __m128 __DEFAULT_FN_ATTRS128
882	_mm_permutevar_ps(__m128 __a, __m128i __c)
883	{ /* Operands are cast to __v4sf / __v4si for the builtin; the result is cast back to __m128. */
884	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
885	}
886
887	/// Copies the values stored in a 256-bit vector of [8 x float] as
888	/// specified by the 256-bit integer vector operand.
889	///
890	/// \headerfile <x86intrin.h>
891	///
892	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
893	///
894	/// \param __a
895	/// A 256-bit vector of [8 x float].
896	/// \param __c
897	/// A 256-bit integer vector operand specifying how the values are to be
898	/// copied. \n
899	/// Bits [1:0]: \n
900	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
901	/// returned vector. \n
902	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
903	/// returned vector. \n
904	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
905	/// returned vector. \n
906	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
907	/// returned vector. \n
908	/// Bits [33:32]: \n
909	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
910	/// returned vector. \n
911	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
912	/// returned vector. \n
913	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
914	/// returned vector. \n
915	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
916	/// returned vector. \n
917	/// Bits [65:64]: \n
918	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
919	/// returned vector. \n
920	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
921	/// returned vector. \n
922	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
923	/// returned vector. \n
924	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
925	/// returned vector. \n
926	/// Bits [97:96]: \n
927	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
928	/// returned vector. \n
929	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
930	/// returned vector. \n
931	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
932	/// returned vector. \n
933	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
934	/// returned vector. \n
935	/// Bits [129:128]: \n
936	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
937	/// returned vector. \n
938	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
939	/// returned vector. \n
940	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
941	/// returned vector. \n
942	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
943	/// returned vector. \n
944	/// Bits [161:160]: \n
945	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
946	/// returned vector. \n
947	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
948	/// returned vector. \n
949	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
950	/// returned vector. \n
951	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
952	/// returned vector. \n
953	/// Bits [193:192]: \n
954	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
955	/// returned vector. \n
956	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
957	/// returned vector. \n
958	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
959	/// returned vector. \n
960	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
961	/// returned vector. \n
962	/// Bits [225:224]: \n
963	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
964	/// returned vector. \n
965	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
966	/// returned vector. \n
967	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
968	/// returned vector. \n
969	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
970	/// returned vector.
971	/// \returns A 256-bit vector of [8 x float] containing the copied values.
972	static __inline __m256 __DEFAULT_FN_ATTRS
973	_mm256_permutevar_ps(__m256 __a, __m256i __c)
974	{ /* Operands are cast to __v8sf / __v8si for the builtin; the result is cast back to __m256. */
975	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
976	}
977
978	/// Copies the values in a 128-bit vector of [2 x double] as specified
979	/// by the immediate integer operand.
980	///
981	/// \headerfile <x86intrin.h>
982	///
983	/// \code
984	/// __m128d _mm_permute_pd(__m128d A, const int C);
985	/// \endcode
986	///
987	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
988	///
989	/// \param A
990	/// A 128-bit vector of [2 x double].
991	/// \param C
992	/// An immediate integer operand specifying how the values are to be
993	/// copied. \n
994	/// Bit [0]: \n
995	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
996	/// vector. \n
997	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
998	/// returned vector. \n
999	/// Bit [1]: \n
1000	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1001	/// returned vector. \n
1002	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1003	/// returned vector.
1004	/// \returns A 128-bit vector of [2 x double] containing the copied values.
1005	#define _mm_permute_pd(A, C) /* fully parenthesized expansion */ \
1006	((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1007
1008	/// Copies the values in a 256-bit vector of [4 x double] as specified by
1009	/// the immediate integer operand.
1010	///
1011	/// \headerfile <x86intrin.h>
1012	///
1013	/// \code
1014	/// __m256d _mm256_permute_pd(__m256d A, const int C);
1015	/// \endcode
1016	///
1017	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1018	///
1019	/// \param A
1020	/// A 256-bit vector of [4 x double].
1021	/// \param C
1022	/// An immediate integer operand specifying how the values are to be
1023	/// copied. \n
1024	/// Bit [0]: \n
1025	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1026	/// vector. \n
1027	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1028	/// returned vector. \n
1029	/// Bit [1]: \n
1030	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1031	/// returned vector. \n
1032	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1033	/// returned vector. \n
1034	/// Bit [2]: \n
1035	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1036	/// returned vector. \n
1037	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1038	/// returned vector. \n
1039	/// Bit [3]: \n
1040	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1041	/// returned vector. \n
1042	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1043	/// returned vector.
1044	/// \returns A 256-bit vector of [4 x double] containing the copied values.
1045	#define _mm256_permute_pd(A, C) /* fully parenthesized expansion */ \
1046	((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1047
1048	/// Copies the values in a 128-bit vector of [4 x float] as specified by
1049	/// the immediate integer operand.
1050	///
1051	/// \headerfile <x86intrin.h>
1052	///
1053	/// \code
1054	/// __m128 _mm_permute_ps(__m128 A, const int C);
1055	/// \endcode
1056	///
1057	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1058	///
1059	/// \param A
1060	/// A 128-bit vector of [4 x float].
1061	/// \param C
1062	/// An immediate integer operand specifying how the values are to be
1063	/// copied. \n
1064	/// Bits [1:0]: \n
1065	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1066	/// returned vector. \n
1067	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1068	/// returned vector. \n
1069	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1070	/// returned vector. \n
1071	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1072	/// returned vector. \n
1073	/// Bits [3:2]: \n
1074	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1075	/// returned vector. \n
1076	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1077	/// returned vector. \n
1078	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1079	/// returned vector. \n
1080	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1081	/// returned vector. \n
1082	/// Bits [5:4]: \n
1083	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1084	/// returned vector. \n
1085	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1086	/// returned vector. \n
1087	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1088	/// returned vector. \n
1089	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1090	/// returned vector. \n
1091	/// Bits [7:6]: \n
1092	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1093	/// returned vector. \n
1094	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1095	/// returned vector. \n
1096	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1097	/// returned vector. \n
1098	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1099	/// returned vector.
1100	/// \returns A 128-bit vector of [4 x float] containing the copied values.
1101	#define _mm_permute_ps(A, C) /* fully parenthesized expansion */ \
1102	((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1103
1104	/// Copies the values in a 256-bit vector of [8 x float] as specified by
1105	/// the immediate integer operand.
1106	///
1107	/// \headerfile <x86intrin.h>
1108	///
1109	/// \code
1110	/// __m256 _mm256_permute_ps(__m256 A, const int C);
1111	/// \endcode
1112	///
1113	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1114	///
1115	/// \param A
1116	/// A 256-bit vector of [8 x float].
1117	/// \param C
1118	/// An immediate integer operand specifying how the values are to be
1119	/// copied. \n
1120	/// Bits [1:0]: \n
1121	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1122	/// returned vector. \n
1123	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1124	/// returned vector. \n
1125	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1126	/// returned vector. \n
1127	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1128	/// returned vector. \n
1129	/// Bits [3:2]: \n
1130	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1131	/// returned vector. \n
1132	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1133	/// returned vector. \n
1134	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1135	/// returned vector. \n
1136	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1137	/// returned vector. \n
1138	/// Bits [5:4]: \n
1139	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1140	/// returned vector. \n
1141	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1142	/// returned vector. \n
1143	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1144	/// returned vector. \n
1145	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1146	/// returned vector. \n
1147	/// Bits [7:6]: \n
1148	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1149	/// returned vector. \n
1150	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1151	/// returned vector. \n
1152	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1153	/// returned vector. \n
1154	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1155	/// returned vector. \n
1156	/// Bits [1:0] (the same control bits, applied to the upper 128 bits): \n
1157	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1158	/// returned vector. \n
1159	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1160	/// returned vector. \n
1161	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1162	/// returned vector. \n
1163	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1164	/// returned vector. \n
1165	/// Bits [3:2] (the same control bits, applied to the upper 128 bits): \n
1166	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1167	/// returned vector. \n
1168	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1169	/// returned vector. \n
1170	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1171	/// returned vector. \n
1172	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1173	/// returned vector. \n
1174	/// Bits [5:4] (the same control bits, applied to the upper 128 bits): \n
1175	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1176	/// returned vector. \n
1177	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1178	/// returned vector. \n
1179	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1180	/// returned vector. \n
1181	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1182	/// returned vector. \n
1183	/// Bits [7:6] (the same control bits, applied to the upper 128 bits): \n
1184	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1185	/// returned vector. \n
1186	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1187	/// returned vector. \n
1188	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1189	/// returned vector. \n
1190	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1191	/// returned vector.
1192	/// \returns A 256-bit vector of [8 x float] containing the copied values.
1193	#define _mm256_permute_ps(A, C) /* fully parenthesized expansion */ \
1194	((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1195
1196	/// Permutes 128-bit data values stored in two 256-bit vectors of
1197	/// [4 x double], as specified by the immediate integer operand.
1198	///
1199	/// \headerfile <x86intrin.h>
1200	///
1201	/// \code
1202	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1203	/// \endcode
1204	///
1205	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1206	///
1207	/// \param V1
1208	/// A 256-bit vector of [4 x double].
1209	/// \param V2
1210	/// A 256-bit vector of [4 x double].
1211	/// \param M
1212	/// An immediate integer operand specifying how the values are to be
1213	/// permuted. \n
1214	/// Bits [1:0]: \n
1215	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1216	/// destination. \n
1217	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1218	/// destination. \n
1219	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1220	/// destination. \n
1221	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1222	/// destination. \n
1223	/// Bits [5:4]: \n
1224	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1225	/// destination. \n
1226	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1227	/// destination. \n
1228	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1229	/// destination. \n
1230	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1231	/// destination.
1232	/// \returns A 256-bit vector of [4 x double] containing the copied values.
1233	#define _mm256_permute2f128_pd(V1, V2, M) /* fully parenthesized expansion */ \
1234	((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1235	(__v4df)(__m256d)(V2), (int)(M)))
1236
1237	/// Permutes 128-bit data values stored in two 256-bit vectors of
1238	/// [8 x float], as specified by the immediate integer operand.
1239	///
1240	/// \headerfile <x86intrin.h>
1241	///
1242	/// \code
1243	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1244	/// \endcode
1245	///
1246	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1247	///
1248	/// \param V1
1249	/// A 256-bit vector of [8 x float].
1250	/// \param V2
1251	/// A 256-bit vector of [8 x float].
1252	/// \param M
1253	/// An immediate integer operand specifying how the values are to be
1254	/// permuted. \n
1255	/// Bits [1:0]: \n
1256	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1257	/// destination. \n
1258	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1259	/// destination. \n
1260	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1261	/// destination. \n
1262	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1263	/// destination. \n
1264	/// Bits [5:4]: \n
1265	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1266	/// destination. \n
1267	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1268	/// destination. \n
1269	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1270	/// destination. \n
1271	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1272	/// destination.
1273	/// \returns A 256-bit vector of [8 x float] containing the copied values.
1274	#define _mm256_permute2f128_ps(V1, V2, M) /* fully parenthesized expansion */ \
1275	((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1276	(__v8sf)(__m256)(V2), (int)(M)))
1277
1278	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1279	/// as specified by the immediate integer operand.
1280	///
1281	/// \headerfile <x86intrin.h>
1282	///
1283	/// \code
1284	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1285	/// \endcode
1286	///
1287	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1288	///
1289	/// \param V1
1290	/// A 256-bit integer vector.
1291	/// \param V2
1292	/// A 256-bit integer vector.
1293	/// \param M
1294	/// An immediate integer operand specifying how the values are to be copied. \n
1295	/// Bits [1:0]: \n
1296	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1297	/// destination. \n
1298	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1299	/// destination. \n
1300	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1301	/// destination. \n
1302	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1303	/// destination. \n
1304	/// Bits [5:4]: \n
1305	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1306	/// destination. \n
1307	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1308	/// destination. \n
1309	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1310	/// destination. \n
1311	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1312	/// destination.
1313	/// \returns A 256-bit integer vector containing the copied values.
1314	#define _mm256_permute2f128_si256(V1, V2, M) /* fully parenthesized expansion */ \
1315	((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1316	(__v8si)(__m256i)(V2), (int)(M)))
1317
1318	/* Vector Blend */
1319	/// Merges 64-bit double-precision data values stored in either of the
1320	/// two 256-bit vectors of [4 x double], as specified by the immediate
1321	/// integer operand.
1322	///
1323	/// \headerfile <x86intrin.h>
1324	///
1325	/// \code
1326	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1327	/// \endcode
1328	///
1329	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1330	///
1331	/// \param V1
1332	/// A 256-bit vector of [4 x double].
1333	/// \param V2
1334	/// A 256-bit vector of [4 x double].
1335	/// \param M
1336	/// An immediate integer operand, with mask bits [3:0] specifying how the
1337	/// values are to be copied. The position of the mask bit corresponds to the
1338	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1339	/// element in operand \a V1 is copied to the same position in the
1340	/// destination. When a mask bit is 1, the corresponding 64-bit element in
1341	/// operand \a V2 is copied to the same position in the destination.
1342	/// \returns A 256-bit vector of [4 x double] containing the copied values.
1343	#define _mm256_blend_pd(V1, V2, M) /* fully parenthesized expansion */ \
1344	((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1345	(__v4df)(__m256d)(V2), (int)(M)))
1346
1347	/// Merges 32-bit single-precision data values stored in either of the
1348	/// two 256-bit vectors of [8 x float], as specified by the immediate
1349	/// integer operand.
1350	///
1351	/// \headerfile <x86intrin.h>
1352	///
1353	/// \code
1354	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1355	/// \endcode
1356	///
1357	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1358	///
1359	/// \param V1
1360	/// A 256-bit vector of [8 x float].
1361	/// \param V2
1362	/// A 256-bit vector of [8 x float].
1363	/// \param M
1364	/// An immediate integer operand, with mask bits [7:0] specifying how the
1365	/// values are to be copied. The position of the mask bit corresponds to the
1366	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1367	/// element in operand \a V1 is copied to the same position in the
1368	/// destination. When a mask bit is 1, the corresponding 32-bit element in
1369	/// operand \a V2 is copied to the same position in the destination.
1370	/// \returns A 256-bit vector of [8 x float] containing the copied values.
1371	#define _mm256_blend_ps(V1, V2, M) /* fully parenthesized expansion */ \
1372	((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1373	(__v8sf)(__m256)(V2), (int)(M)))
1374
1375	/// Merges 64-bit double-precision data values stored in either of the
1376	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1377	/// operand.
1378	///
1379	/// \headerfile <x86intrin.h>
1380	///
1381	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1382	///
1383	/// \param __a
1384	/// A 256-bit vector of [4 x double].
1385	/// \param __b
1386	/// A 256-bit vector of [4 x double].
1387	/// \param __c
1388	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1389	/// how the values are to be copied. The position of the mask bit corresponds
1390	/// to the most significant bit of a copied value. When a mask bit is 0, the
1391	/// corresponding 64-bit element in operand \a __a is copied to the same
1392	/// position in the destination. When a mask bit is 1, the corresponding
1393	/// 64-bit element in operand \a __b is copied to the same position in the
1394	/// destination.
1395	/// \returns A 256-bit vector of [4 x double] containing the copied values.
1396	static __inline __m256d __DEFAULT_FN_ATTRS
1397	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1398	{ /* All three operands are cast to __v4df for the builtin; the result is cast back to __m256d. */
1399	return (__m256d)__builtin_ia32_blendvpd256(
1400	(__v4df)__a, (__v4df)__b, (__v4df)__c);
1401	}
1402
1403	/// Merges 32-bit single-precision data values stored in either of the
1404	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1405	/// operand.
1406	///
1407	/// \headerfile <x86intrin.h>
1408	///
1409	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1410	///
1411	/// \param __a
1412	/// A 256-bit vector of [8 x float].
1413	/// \param __b
1414	/// A 256-bit vector of [8 x float].
1415	/// \param __c
1416	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1417	/// and 31 specifying how the values are to be copied. The position of the
1418	/// mask bit corresponds to the most significant bit of a copied value. When
1419	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1420	/// copied to the same position in the destination. When a mask bit is 1, the
1421	/// corresponding 32-bit element in operand \a __b is copied to the same
1422	/// position in the destination.
1423	/// \returns A 256-bit vector of [8 x float] containing the copied values.
1424	static __inline __m256 __DEFAULT_FN_ATTRS
1425	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1426	{ /* All three operands are cast to __v8sf for the builtin; the result is cast back to __m256. */
1427	return (__m256)__builtin_ia32_blendvps256(
1428	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1429	}
1430
1431	/* Vector Dot Product */
1432	/// Computes two dot products in parallel, using the lower and upper
1433	/// halves of two [8 x float] vectors as input to the two computations, and
1434	/// returning the two dot products in the lower and upper halves of the
1435	/// [8 x float] result.
1436	///
1437	/// The immediate integer operand controls which input elements will
1438	/// contribute to the dot product, and where the final results are returned.
1439	/// In general, for each dot product, the four corresponding elements of the
1440	/// input vectors are multiplied; the first two and second two products are
1441	/// summed, then the two sums are added to form the final result.
1442	///
1443	/// \headerfile <x86intrin.h>
1444	///
1445	/// \code
1446	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1447	/// \endcode
1448	///
1449	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1450	///
1451	/// \param V1
1452	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1453	/// \param V2
1454	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1455	/// \param M
1456	/// An immediate integer argument. Bits [7:4] determine which elements of
1457	/// the input vectors are used, with bit [4] corresponding to the lowest
1458	/// element and bit [7] corresponding to the highest element of each [4 x
1459	/// float] subvector. If a bit is set, the corresponding elements from the
1460	/// two input vectors are used as an input for dot product; otherwise that
1461	/// input is treated as zero. Bits [3:0] determine which elements of the
1462	/// result will receive a copy of the final dot product, with bit [0]
1463	/// corresponding to the lowest element and bit [3] corresponding to the
1464	/// highest element of each [4 x float] subvector. If a bit is set, the dot
1465	/// product is returned in the corresponding element; otherwise that element
1466	/// is set to zero. The bitmask is applied in the same way to each of the
1467	/// two parallel dot product computations.
1468	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
1469	#define _mm256_dp_ps(V1, V2, M) /* fully parenthesized expansion */ \
1470	((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1471	(__v8sf)(__m256)(V2), (M)))
1472
1473	/* Vector shuffle */
1474	/// Selects 8 float values from the 256-bit operands of [8 x float], as
1475	/// specified by the immediate value operand.
1476	///
1477	/// The four selected elements in each operand are copied to the destination
1478	/// according to the bits specified in the immediate operand. The selected
1479	/// elements from the first 256-bit operand are copied to bits [63:0] and
1480	/// bits [191:128] of the destination, and the selected elements from the
1481	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1482	/// the destination. For example, if bits [7:0] of the immediate operand
1483	/// contain a value of 0xFF, the 256-bit destination vector would contain the
1484	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1485	///
1486	/// \headerfile <x86intrin.h>
1487	///
1488	/// \code
1489	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1490	/// \endcode
1491	///
1492	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1493	///
1494	/// \param a
1495	/// A 256-bit vector of [8 x float]. The four selected elements in this
1496	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1497	/// according to the bits specified in the immediate operand.
1498	/// \param b
1499	/// A 256-bit vector of [8 x float]. The four selected elements in this
1500	/// operand are copied to bits [127:64] and bits [255:192] in the
1501	/// destination, according to the bits specified in the immediate operand.
1502	/// \param mask
1503	/// An immediate value containing an 8-bit value specifying which elements to
1504	/// copy from \a a and \a b. \n
1505	/// Bits [3:0] specify the values copied from operand \a a. \n
1506	/// Bits [7:4] specify the values copied from operand \a b. \n
1507	/// The destinations within the 256-bit destination are assigned values as
1508	/// follows, according to the bit value assignments described below: \n
1509	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1510	/// destination. \n
1511	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1512	/// destination. \n
1513	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1514	/// destination. \n
1515	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1516	/// the destination. \n
1517	/// Bit value assignments: \n
1518	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1519	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1520	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1521	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1522	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
1523	#define _mm256_shuffle_ps(a, b, mask) /* fully parenthesized expansion */ \
1524	((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1525	(__v8sf)(__m256)(b), (int)(mask)))
1526
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values, listed from the highest element to the
///    lowest: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An 8-bit immediate value whose bits [3:0] specify which elements to copy
///    from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1572
/* Compare */
/*
 * Comparison predicates for the immediate operand of the compare intrinsics;
 * the value occupies bits [4:0] of that operand.  The comment on each macro
 * gives the relation, whether it is ordered or unordered, and whether it is
 * signaling.  Values 0x10-0x1f repeat the relations of 0x00-0x0f with the
 * signaling attribute reversed.
 */
#define _CMP_EQ_OQ    0x00 /* equal; ordered, non-signaling */
#define _CMP_LT_OS    0x01 /* less-than; ordered, signaling */
#define _CMP_LE_OS    0x02 /* less-than-or-equal; ordered, signaling */
#define _CMP_UNORD_Q  0x03 /* unordered; non-signaling */
#define _CMP_NEQ_UQ   0x04 /* not-equal; unordered, non-signaling */
#define _CMP_NLT_US   0x05 /* not-less-than; unordered, signaling */
#define _CMP_NLE_US   0x06 /* not-less-than-or-equal; unordered, signaling */
#define _CMP_ORD_Q    0x07 /* ordered; non-signaling */
#define _CMP_EQ_UQ    0x08 /* equal; unordered, non-signaling */
#define _CMP_NGE_US   0x09 /* not-greater-than-or-equal; unordered, signaling */
#define _CMP_NGT_US   0x0a /* not-greater-than; unordered, signaling */
#define _CMP_FALSE_OQ 0x0b /* false; ordered, non-signaling */
#define _CMP_NEQ_OQ   0x0c /* not-equal; ordered, non-signaling */
#define _CMP_GE_OS    0x0d /* greater-than-or-equal; ordered, signaling */
#define _CMP_GT_OS    0x0e /* greater-than; ordered, signaling */
#define _CMP_TRUE_UQ  0x0f /* true; unordered, non-signaling */
#define _CMP_EQ_OS    0x10 /* equal; ordered, signaling */
#define _CMP_LT_OQ    0x11 /* less-than; ordered, non-signaling */
#define _CMP_LE_OQ    0x12 /* less-than-or-equal; ordered, non-signaling */
#define _CMP_UNORD_S  0x13 /* unordered; signaling */
#define _CMP_NEQ_US   0x14 /* not-equal; unordered, signaling */
#define _CMP_NLT_UQ   0x15 /* not-less-than; unordered, non-signaling */
#define _CMP_NLE_UQ   0x16 /* not-less-than-or-equal; unordered, non-signaling */
#define _CMP_ORD_S    0x17 /* ordered; signaling */
#define _CMP_EQ_US    0x18 /* equal; unordered, signaling */
#define _CMP_NGE_UQ   0x19 /* not-greater-than-or-equal; unordered, non-signaling */
#define _CMP_NGT_UQ   0x1a /* not-greater-than; unordered, non-signaling */
#define _CMP_FALSE_OS 0x1b /* false; ordered, signaling */
#define _CMP_NEQ_OS   0x1c /* not-equal; ordered, signaling */
#define _CMP_GE_OQ    0x1d /* greater-than-or-equal; ordered, non-signaling */
#define _CMP_GT_OQ    0x1e /* greater-than; ordered, non-signaling */
#define _CMP_TRUE_US  0x1f /* true; unordered, signaling */
1606
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1666
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1726
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1786
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1846
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1905
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. The _CMP_* macros provide names for these values: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1964
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1982
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2001
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2020
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2040
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2062
2063
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2085
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2107
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2131
2132	/* Conversion */
2133	/// Converts a vector of [4 x i32] into a vector of [4 x double].
2134	///
2135	/// \headerfile <x86intrin.h>
2136	///
2137	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2138	///
2139	/// \param __a
2140	/// A 128-bit integer vector of [4 x i32].
2141	/// \returns A 256-bit vector of [4 x double] containing the converted values.
2142	static __inline __m256d __DEFAULT_FN_ATTRS
2143	_mm256_cvtepi32_pd(__m128i __a)
2144	{
2145	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2146	}
2147
2148	/// Converts a vector of [8 x i32] into a vector of [8 x float].
2149	///
2150	/// \headerfile <x86intrin.h>
2151	///
2152	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2153	///
2154	/// \param __a
2155	/// A 256-bit integer vector.
2156	/// \returns A 256-bit vector of [8 x float] containing the converted values.
2157	static __inline __m256 __DEFAULT_FN_ATTRS
2158	_mm256_cvtepi32_ps(__m256i __a)
2159	{
2160	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2161	}
2162
2163	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2164	/// [4 x float].
2165	///
2166	/// \headerfile <x86intrin.h>
2167	///
2168	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2169	///
2170	/// \param __a
2171	/// A 256-bit vector of [4 x double].
2172	/// \returns A 128-bit vector of [4 x float] containing the converted values.
2173	static __inline __m128 __DEFAULT_FN_ATTRS
2174	_mm256_cvtpd_ps(__m256d __a)
2175	{
2176	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2177	}
2178
2179	/// Converts a vector of [8 x float] into a vector of [8 x i32].
2180	///
2181	/// \headerfile <x86intrin.h>
2182	///
2183	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2184	///
2185	/// \param __a
2186	/// A 256-bit vector of [8 x float].
2187	/// \returns A 256-bit integer vector containing the converted values.
2188	static __inline __m256i __DEFAULT_FN_ATTRS
2189	_mm256_cvtps_epi32(__m256 __a)
2190	{
2191	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2192	}
2193
2194	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2195	/// x double].
2196	///
2197	/// \headerfile <x86intrin.h>
2198	///
2199	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2200	///
2201	/// \param __a
2202	/// A 128-bit vector of [4 x float].
2203	/// \returns A 256-bit vector of [4 x double] containing the converted values.
2204	static __inline __m256d __DEFAULT_FN_ATTRS
2205	_mm256_cvtps_pd(__m128 __a)
2206	{
2207	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2208	}
2209
2210	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2211	/// x i32], truncating the result by rounding towards zero when it is
2212	/// inexact.
2213	///
2214	/// \headerfile <x86intrin.h>
2215	///
2216	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2217	///
2218	/// \param __a
2219	/// A 256-bit vector of [4 x double].
2220	/// \returns A 128-bit integer vector containing the converted values.
2221	static __inline __m128i __DEFAULT_FN_ATTRS
2222	_mm256_cvttpd_epi32(__m256d __a)
2223	{
2224	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2225	}
2226
2227	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2228	/// x i32]. When a conversion is inexact, the value returned is rounded
2229	/// according to the rounding control bits in the MXCSR register.
2230	///
2231	/// \headerfile <x86intrin.h>
2232	///
2233	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2234	///
2235	/// \param __a
2236	/// A 256-bit vector of [4 x double].
2237	/// \returns A 128-bit integer vector containing the converted values.
2238	static __inline __m128i __DEFAULT_FN_ATTRS
2239	_mm256_cvtpd_epi32(__m256d __a)
2240	{
2241	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2242	}
2243
2244	/// Converts a vector of [8 x float] into a vector of [8 x i32],
2245	/// truncating the result by rounding towards zero when it is inexact.
2246	///
2247	/// \headerfile <x86intrin.h>
2248	///
2249	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2250	///
2251	/// \param __a
2252	/// A 256-bit vector of [8 x float].
2253	/// \returns A 256-bit integer vector containing the converted values.
2254	static __inline __m256i __DEFAULT_FN_ATTRS
2255	_mm256_cvttps_epi32(__m256 __a)
2256	{
2257	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2258	}
2259
2260	/// Returns the first element of the input vector of [4 x double].
2261	///
2262	/// \headerfile <avxintrin.h>
2263	///
2264	/// This intrinsic is a utility function and does not correspond to a specific
2265	/// instruction.
2266	///
2267	/// \param __a
2268	/// A 256-bit vector of [4 x double].
2269	/// \returns A 64 bit double containing the first element of the input vector.
2270	static __inline double __DEFAULT_FN_ATTRS
2271	_mm256_cvtsd_f64(__m256d __a)
2272	{
2273	return __a[0];
2274	}
2275
2276	/// Returns the first element of the input vector of [8 x i32].
2277	///
2278	/// \headerfile <avxintrin.h>
2279	///
2280	/// This intrinsic is a utility function and does not correspond to a specific
2281	/// instruction.
2282	///
2283	/// \param __a
2284	/// A 256-bit vector of [8 x i32].
2285	/// \returns A 32 bit integer containing the first element of the input vector.
2286	static __inline int __DEFAULT_FN_ATTRS
2287	_mm256_cvtsi256_si32(__m256i __a)
2288	{
2289	__v8si __b = (__v8si)__a;
2290	return __b[0];
2291	}
2292
2293	/// Returns the first element of the input vector of [8 x float].
2294	///
2295	/// \headerfile <avxintrin.h>
2296	///
2297	/// This intrinsic is a utility function and does not correspond to a specific
2298	/// instruction.
2299	///
2300	/// \param __a
2301	/// A 256-bit vector of [8 x float].
2302	/// \returns A 32 bit float containing the first element of the input vector.
2303	static __inline float __DEFAULT_FN_ATTRS
2304	_mm256_cvtss_f32(__m256 __a)
2305	{
2306	return __a[0];
2307	}
2308
2309	/* Vector replicate */
2310	/// Moves and duplicates odd-indexed values from a 256-bit vector of
2311	/// [8 x float] to float values in a 256-bit vector of [8 x float].
2312	///
2313	/// \headerfile <x86intrin.h>
2314	///
2315	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2316	///
2317	/// \param __a
2318	/// A 256-bit vector of [8 x float]. \n
2319	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2320	/// the return value. \n
2321	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2322	/// the return value. \n
2323	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2324	/// return value. \n
2325	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2326	/// return value.
2327	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2328	/// values.
2329	static __inline __m256 __DEFAULT_FN_ATTRS
2330	_mm256_movehdup_ps(__m256 __a)
2331	{
2332	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2333	}
2334
2335	/// Moves and duplicates even-indexed values from a 256-bit vector of
2336	/// [8 x float] to float values in a 256-bit vector of [8 x float].
2337	///
2338	/// \headerfile <x86intrin.h>
2339	///
2340	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2341	///
2342	/// \param __a
2343	/// A 256-bit vector of [8 x float]. \n
2344	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2345	/// the return value. \n
2346	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2347	/// the return value. \n
2348	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2349	/// return value. \n
2350	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2351	/// return value.
2352	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2353	/// values.
2354	static __inline __m256 __DEFAULT_FN_ATTRS
2355	_mm256_moveldup_ps(__m256 __a)
2356	{
2357	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2358	}
2359
2360	/// Moves and duplicates double-precision floating point values from a
2361	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2362	/// vector of [4 x double].
2363	///
2364	/// \headerfile <x86intrin.h>
2365	///
2366	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2367	///
2368	/// \param __a
2369	/// A 256-bit vector of [4 x double]. \n
2370	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2371	/// return value. \n
2372	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2373	/// the return value.
2374	/// \returns A 256-bit vector of [4 x double] containing the moved and
2375	/// duplicated values.
2376	static __inline __m256d __DEFAULT_FN_ATTRS
2377	_mm256_movedup_pd(__m256d __a)
2378	{
2379	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2380	}
2381
2382	/* Unpack and Interleave */
2383	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2384	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2385	///
2386	/// \headerfile <x86intrin.h>
2387	///
2388	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2389	///
2390	/// \param __a
2391	/// A 256-bit floating-point vector of [4 x double]. \n
2392	/// Bits [127:64] are written to bits [63:0] of the return value. \n
2393	/// Bits [255:192] are written to bits [191:128] of the return value. \n
2394	/// \param __b
2395	/// A 256-bit floating-point vector of [4 x double]. \n
2396	/// Bits [127:64] are written to bits [127:64] of the return value. \n
2397	/// Bits [255:192] are written to bits [255:192] of the return value. \n
2398	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2399	static __inline __m256d __DEFAULT_FN_ATTRS
2400	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2401	{
2402	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2403	}
2404
2405	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2406	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2407	///
2408	/// \headerfile <x86intrin.h>
2409	///
2410	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2411	///
2412	/// \param __a
2413	/// A 256-bit floating-point vector of [4 x double]. \n
2414	/// Bits [63:0] are written to bits [63:0] of the return value. \n
2415	/// Bits [191:128] are written to bits [191:128] of the return value.
2416	/// \param __b
2417	/// A 256-bit floating-point vector of [4 x double]. \n
2418	/// Bits [63:0] are written to bits [127:64] of the return value. \n
2419	/// Bits [191:128] are written to bits [255:192] of the return value. \n
2420	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2421	static __inline __m256d __DEFAULT_FN_ATTRS
2422	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2423	{
2424	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2425	}
2426
2427	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2428	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2429	/// vector of [8 x float].
2430	///
2431	/// \headerfile <x86intrin.h>
2432	///
2433	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2434	///
2435	/// \param __a
2436	/// A 256-bit vector of [8 x float]. \n
2437	/// Bits [95:64] are written to bits [31:0] of the return value. \n
2438	/// Bits [127:96] are written to bits [95:64] of the return value. \n
2439	/// Bits [223:192] are written to bits [159:128] of the return value. \n
2440	/// Bits [255:224] are written to bits [223:192] of the return value.
2441	/// \param __b
2442	/// A 256-bit vector of [8 x float]. \n
2443	/// Bits [95:64] are written to bits [63:32] of the return value. \n
2444	/// Bits [127:96] are written to bits [127:96] of the return value. \n
2445	/// Bits [223:192] are written to bits [191:160] of the return value. \n
2446	/// Bits [255:224] are written to bits [255:224] of the return value.
2447	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2448	static __inline __m256 __DEFAULT_FN_ATTRS
2449	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2450	{
2451	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2452	}
2453
2454	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2455	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2456	/// vector of [8 x float].
2457	///
2458	/// \headerfile <x86intrin.h>
2459	///
2460	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2461	///
2462	/// \param __a
2463	/// A 256-bit vector of [8 x float]. \n
2464	/// Bits [31:0] are written to bits [31:0] of the return value. \n
2465	/// Bits [63:32] are written to bits [95:64] of the return value. \n
2466	/// Bits [159:128] are written to bits [159:128] of the return value. \n
2467	/// Bits [191:160] are written to bits [223:192] of the return value.
2468	/// \param __b
2469	/// A 256-bit vector of [8 x float]. \n
2470	/// Bits [31:0] are written to bits [63:32] of the return value. \n
2471	/// Bits [63:32] are written to bits [127:96] of the return value. \n
2472	/// Bits [159:128] are written to bits [191:160] of the return value. \n
2473	/// Bits [191:160] are written to bits [255:224] of the return value.
2474	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2475	static __inline __m256 __DEFAULT_FN_ATTRS
2476	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2477	{
2478	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2479	}
2480
2481	/* Bit Test */
2482	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2483	/// element-by-element comparison of the double-precision element in the
2484	/// first source vector and the corresponding element in the second source
2485	/// vector.
2486	///
2487	/// The EFLAGS register is updated as follows: \n
2488	/// If there is at least one pair of double-precision elements where the
2489	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2490	/// ZF flag is set to 1. \n
2491	/// If there is at least one pair of double-precision elements where the
2492	/// sign-bit of the first element is 0 and the sign-bit of the second element
2493	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2494	/// This intrinsic returns the value of the ZF flag.
2495	///
2496	/// \headerfile <x86intrin.h>
2497	///
2498	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2499	///
2500	/// \param __a
2501	/// A 128-bit vector of [2 x double].
2502	/// \param __b
2503	/// A 128-bit vector of [2 x double].
2504	/// \returns the ZF flag in the EFLAGS register.
2505	static __inline int __DEFAULT_FN_ATTRS128
2506	_mm_testz_pd(__m128d __a, __m128d __b)
2507	{
2508	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2509	}
2510
2511	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2512	/// element-by-element comparison of the double-precision element in the
2513	/// first source vector and the corresponding element in the second source
2514	/// vector.
2515	///
2516	/// The EFLAGS register is updated as follows: \n
2517	/// If there is at least one pair of double-precision elements where the
2518	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2519	/// ZF flag is set to 1. \n
2520	/// If there is at least one pair of double-precision elements where the
2521	/// sign-bit of the first element is 0 and the sign-bit of the second element
2522	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2523	/// This intrinsic returns the value of the CF flag.
2524	///
2525	/// \headerfile <x86intrin.h>
2526	///
2527	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2528	///
2529	/// \param __a
2530	/// A 128-bit vector of [2 x double].
2531	/// \param __b
2532	/// A 128-bit vector of [2 x double].
2533	/// \returns the CF flag in the EFLAGS register.
2534	static __inline int __DEFAULT_FN_ATTRS128
2535	_mm_testc_pd(__m128d __a, __m128d __b)
2536	{
2537	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2538	}
2539
2540	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2541	/// element-by-element comparison of the double-precision element in the
2542	/// first source vector and the corresponding element in the second source
2543	/// vector.
2544	///
2545	/// The EFLAGS register is updated as follows: \n
2546	/// If there is at least one pair of double-precision elements where the
2547	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2548	/// ZF flag is set to 1. \n
2549	/// If there is at least one pair of double-precision elements where the
2550	/// sign-bit of the first element is 0 and the sign-bit of the second element
2551	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2552	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2553	/// otherwise it returns 0.
2554	///
2555	/// \headerfile <x86intrin.h>
2556	///
2557	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2558	///
2559	/// \param __a
2560	/// A 128-bit vector of [2 x double].
2561	/// \param __b
2562	/// A 128-bit vector of [2 x double].
2563	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2564	static __inline int __DEFAULT_FN_ATTRS128
2565	_mm_testnzc_pd(__m128d __a, __m128d __b)
2566	{
2567	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2568	}
2569
2570	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2571	/// element-by-element comparison of the single-precision element in the
2572	/// first source vector and the corresponding element in the second source
2573	/// vector.
2574	///
2575	/// The EFLAGS register is updated as follows: \n
2576	/// If there is at least one pair of single-precision elements where the
2577	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2578	/// ZF flag is set to 1. \n
2579	/// If there is at least one pair of single-precision elements where the
2580	/// sign-bit of the first element is 0 and the sign-bit of the second element
2581	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2582	/// This intrinsic returns the value of the ZF flag.
2583	///
2584	/// \headerfile <x86intrin.h>
2585	///
2586	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2587	///
2588	/// \param __a
2589	/// A 128-bit vector of [4 x float].
2590	/// \param __b
2591	/// A 128-bit vector of [4 x float].
2592	/// \returns the ZF flag.
2593	static __inline int __DEFAULT_FN_ATTRS128
2594	_mm_testz_ps(__m128 __a, __m128 __b)
2595	{
2596	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2597	}
2598
2599	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2600	/// element-by-element comparison of the single-precision element in the
2601	/// first source vector and the corresponding element in the second source
2602	/// vector.
2603	///
2604	/// The EFLAGS register is updated as follows: \n
2605	/// If there is at least one pair of single-precision elements where the
2606	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2607	/// ZF flag is set to 1. \n
2608	/// If there is at least one pair of single-precision elements where the
2609	/// sign-bit of the first element is 0 and the sign-bit of the second element
2610	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2611	/// This intrinsic returns the value of the CF flag.
2612	///
2613	/// \headerfile <x86intrin.h>
2614	///
2615	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2616	///
2617	/// \param __a
2618	/// A 128-bit vector of [4 x float].
2619	/// \param __b
2620	/// A 128-bit vector of [4 x float].
2621	/// \returns the CF flag.
2622	static __inline int __DEFAULT_FN_ATTRS128
2623	_mm_testc_ps(__m128 __a, __m128 __b)
2624	{
2625	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2626	}
2627
2628	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2629	/// element-by-element comparison of the single-precision element in the
2630	/// first source vector and the corresponding element in the second source
2631	/// vector.
2632	///
2633	/// The EFLAGS register is updated as follows: \n
2634	/// If there is at least one pair of single-precision elements where the
2635	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2636	/// ZF flag is set to 1. \n
2637	/// If there is at least one pair of single-precision elements where the
2638	/// sign-bit of the first element is 0 and the sign-bit of the second element
2639	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2640	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2641	/// otherwise it returns 0.
2642	///
2643	/// \headerfile <x86intrin.h>
2644	///
2645	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2646	///
2647	/// \param __a
2648	/// A 128-bit vector of [4 x float].
2649	/// \param __b
2650	/// A 128-bit vector of [4 x float].
2651	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2652	static __inline int __DEFAULT_FN_ATTRS128
2653	_mm_testnzc_ps(__m128 __a, __m128 __b)
2654	{
2655	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2656	}
2657
2658	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2659	/// element-by-element comparison of the double-precision elements in the
2660	/// first source vector and the corresponding elements in the second source
2661	/// vector.
2662	///
2663	/// The EFLAGS register is updated as follows: \n
2664	/// If there is at least one pair of double-precision elements where the
2665	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2666	/// ZF flag is set to 1. \n
2667	/// If there is at least one pair of double-precision elements where the
2668	/// sign-bit of the first element is 0 and the sign-bit of the second element
2669	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2670	/// This intrinsic returns the value of the ZF flag.
2671	///
2672	/// \headerfile <x86intrin.h>
2673	///
2674	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2675	///
2676	/// \param __a
2677	/// A 256-bit vector of [4 x double].
2678	/// \param __b
2679	/// A 256-bit vector of [4 x double].
2680	/// \returns the ZF flag.
2681	static __inline int __DEFAULT_FN_ATTRS
2682	_mm256_testz_pd(__m256d __a, __m256d __b)
2683	{
2684	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2685	}
2686
2687	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2688	/// element-by-element comparison of the double-precision elements in the
2689	/// first source vector and the corresponding elements in the second source
2690	/// vector.
2691	///
2692	/// The EFLAGS register is updated as follows: \n
2693	/// If there is at least one pair of double-precision elements where the
2694	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2695	/// ZF flag is set to 1. \n
2696	/// If there is at least one pair of double-precision elements where the
2697	/// sign-bit of the first element is 0 and the sign-bit of the second element
2698	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2699	/// This intrinsic returns the value of the CF flag.
2700	///
2701	/// \headerfile <x86intrin.h>
2702	///
2703	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2704	///
2705	/// \param __a
2706	/// A 256-bit vector of [4 x double].
2707	/// \param __b
2708	/// A 256-bit vector of [4 x double].
2709	/// \returns the CF flag.
2710	static __inline int __DEFAULT_FN_ATTRS
2711	_mm256_testc_pd(__m256d __a, __m256d __b)
2712	{
2713	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2714	}
2715
2716	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2717	/// element-by-element comparison of the double-precision elements in the
2718	/// first source vector and the corresponding elements in the second source
2719	/// vector.
2720	///
2721	/// The EFLAGS register is updated as follows: \n
2722	/// If there is at least one pair of double-precision elements where the
2723	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2724	/// ZF flag is set to 1. \n
2725	/// If there is at least one pair of double-precision elements where the
2726	/// sign-bit of the first element is 0 and the sign-bit of the second element
2727	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2728	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2729	/// otherwise it returns 0.
2730	///
2731	/// \headerfile <x86intrin.h>
2732	///
2733	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2734	///
2735	/// \param __a
2736	/// A 256-bit vector of [4 x double].
2737	/// \param __b
2738	/// A 256-bit vector of [4 x double].
2739	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2740	static __inline int __DEFAULT_FN_ATTRS
2741	_mm256_testnzc_pd(__m256d __a, __m256d __b)
2742	{
2743	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2744	}
2745
2746	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2747	/// element-by-element comparison of the single-precision element in the
2748	/// first source vector and the corresponding element in the second source
2749	/// vector.
2750	///
2751	/// The EFLAGS register is updated as follows: \n
2752	/// If there is at least one pair of single-precision elements where the
2753	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2754	/// ZF flag is set to 1. \n
2755	/// If there is at least one pair of single-precision elements where the
2756	/// sign-bit of the first element is 0 and the sign-bit of the second element
2757	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2758	/// This intrinsic returns the value of the ZF flag.
2759	///
2760	/// \headerfile <x86intrin.h>
2761	///
2762	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2763	///
2764	/// \param __a
2765	/// A 256-bit vector of [8 x float].
2766	/// \param __b
2767	/// A 256-bit vector of [8 x float].
2768	/// \returns the ZF flag.
2769	static __inline int __DEFAULT_FN_ATTRS
2770	_mm256_testz_ps(__m256 __a, __m256 __b)
2771	{
2772	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2773	}
2774
2775	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2776	/// element-by-element comparison of the single-precision element in the
2777	/// first source vector and the corresponding element in the second source
2778	/// vector.
2779	///
2780	/// The EFLAGS register is updated as follows: \n
2781	/// If there is at least one pair of single-precision elements where the
2782	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2783	/// ZF flag is set to 1. \n
2784	/// If there is at least one pair of single-precision elements where the
2785	/// sign-bit of the first element is 0 and the sign-bit of the second element
2786	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2787	/// This intrinsic returns the value of the CF flag.
2788	///
2789	/// \headerfile <x86intrin.h>
2790	///
2791	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2792	///
2793	/// \param __a
2794	/// A 256-bit vector of [8 x float].
2795	/// \param __b
2796	/// A 256-bit vector of [8 x float].
2797	/// \returns the CF flag.
2798	static __inline int __DEFAULT_FN_ATTRS
2799	_mm256_testc_ps(__m256 __a, __m256 __b)
2800	{
2801	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2802	}
2803
2804	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2805	/// element-by-element comparison of the single-precision elements in the
2806	/// first source vector and the corresponding elements in the second source
2807	/// vector.
2808	///
2809	/// The EFLAGS register is updated as follows: \n
2810	/// If there is at least one pair of single-precision elements where the
2811	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2812	/// ZF flag is set to 1. \n
2813	/// If there is at least one pair of single-precision elements where the
2814	/// sign-bit of the first element is 0 and the sign-bit of the second element
2815	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2816	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2817	/// otherwise it returns 0.
2818	///
2819	/// \headerfile <x86intrin.h>
2820	///
2821	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2822	///
2823	/// \param __a
2824	/// A 256-bit vector of [8 x float].
2825	/// \param __b
2826	/// A 256-bit vector of [8 x float].
2827	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2828	static __inline int __DEFAULT_FN_ATTRS
2829	_mm256_testnzc_ps(__m256 __a, __m256 __b)
2830	{
2831	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2832	}
2833
2834	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2835	/// of the two source vectors.
2836	///
2837	/// The EFLAGS register is updated as follows: \n
2838	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2839	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2840	/// If there is at least one pair of bits where the bit from the first source
2841	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2842	/// is set to 0. Otherwise the CF flag is set to 1. \n
2843	/// This intrinsic returns the value of the ZF flag.
2844	///
2845	/// \headerfile <x86intrin.h>
2846	///
2847	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2848	///
2849	/// \param __a
2850	/// A 256-bit integer vector.
2851	/// \param __b
2852	/// A 256-bit integer vector.
2853	/// \returns the ZF flag.
2854	static __inline int __DEFAULT_FN_ATTRS
2855	_mm256_testz_si256(__m256i __a, __m256i __b)
2856	{
2857	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2858	}
2859
2860	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2861	/// of the two source vectors.
2862	///
2863	/// The EFLAGS register is updated as follows: \n
2864	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2865	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2866	/// If there is at least one pair of bits where the bit from the first source
2867	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2868	/// is set to 0. Otherwise the CF flag is set to 1. \n
2869	/// This intrinsic returns the value of the CF flag.
2870	///
2871	/// \headerfile <x86intrin.h>
2872	///
2873	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2874	///
2875	/// \param __a
2876	/// A 256-bit integer vector.
2877	/// \param __b
2878	/// A 256-bit integer vector.
2879	/// \returns the CF flag.
2880	static __inline int __DEFAULT_FN_ATTRS
2881	_mm256_testc_si256(__m256i __a, __m256i __b)
2882	{
2883	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2884	}
2885
2886	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2887	/// of the two source vectors.
2888	///
2889	/// The EFLAGS register is updated as follows: \n
2890	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2891	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2892	/// If there is at least one pair of bits where the bit from the first source
2893	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2894	/// is set to 0. Otherwise the CF flag is set to 1. \n
2895	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2896	/// otherwise it returns 0.
2897	///
2898	/// \headerfile <x86intrin.h>
2899	///
2900	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2901	///
2902	/// \param __a
2903	/// A 256-bit integer vector.
2904	/// \param __b
2905	/// A 256-bit integer vector.
2906	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2907	static __inline int __DEFAULT_FN_ATTRS
2908	_mm256_testnzc_si256(__m256i __a, __m256i __b)
2909	{
2910	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2911	}
2912
2913	/* Vector extract sign mask */
2914	/// Extracts the sign bits of double-precision floating point elements
2915	/// in a 256-bit vector of [4 x double] and writes them to the lower order
2916	/// bits of the return value.
2917	///
2918	/// \headerfile <x86intrin.h>
2919	///
2920	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2921	///
2922	/// \param __a
2923	/// A 256-bit vector of [4 x double] containing the double-precision
2924	/// floating point values with sign bits to be extracted.
2925	/// \returns The sign bits from the operand, written to bits [3:0].
2926	static __inline int __DEFAULT_FN_ATTRS
2927	_mm256_movemask_pd(__m256d __a)
2928	{
2929	return __builtin_ia32_movmskpd256((__v4df)__a);
2930	}
2931
2932	/// Extracts the sign bits of single-precision floating point elements
2933	/// in a 256-bit vector of [8 x float] and writes them to the lower order
2934	/// bits of the return value.
2935	///
2936	/// \headerfile <x86intrin.h>
2937	///
2938	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2939	///
2940	/// \param __a
2941	/// A 256-bit vector of [8 x float] containing the single-precision floating
2942	/// point values with sign bits to be extracted.
2943	/// \returns The sign bits from the operand, written to bits [7:0].
2944	static __inline int __DEFAULT_FN_ATTRS
2945	_mm256_movemask_ps(__m256 __a)
2946	{
2947	return __builtin_ia32_movmskps256((__v8sf)__a);
2948	}
2949
/* Vector zero */
/// Clears every XMM or YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* No operands and no result: the builtin is invoked for its effect only. */
  __builtin_ia32_vzeroall();
}
2961
/// Clears the upper 128 bits (bits 255:128) of every YMM register to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* No operands and no result: the builtin is invoked for its effect only. */
  __builtin_ia32_vzeroupper();
}
2972
2973	/* Vector load with broadcast */
2974	/// Loads a scalar single-precision floating point value from the
2975	/// specified address pointed to by \a __a and broadcasts it to the elements
2976	/// of a [4 x float] vector.
2977	///
2978	/// \headerfile <x86intrin.h>
2979	///
2980	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2981	///
2982	/// \param __a
2983	/// The single-precision floating point value to be broadcast.
2984	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2985	/// equal to the broadcast value.
2986	static __inline __m128 __DEFAULT_FN_ATTRS128
2987	_mm_broadcast_ss(float const *__a)
2988	{
2989	float __f = *__a;
2990	return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
2991	}
2992
2993	/// Loads a scalar double-precision floating point value from the
2994	/// specified address pointed to by \a __a and broadcasts it to the elements
2995	/// of a [4 x double] vector.
2996	///
2997	/// \headerfile <x86intrin.h>
2998	///
2999	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3000	///
3001	/// \param __a
3002	/// The double-precision floating point value to be broadcast.
3003	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3004	/// equal to the broadcast value.
3005	static __inline __m256d __DEFAULT_FN_ATTRS
3006	_mm256_broadcast_sd(double const *__a)
3007	{
3008	double __d = *__a;
3009	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3010	}
3011
3012	/// Loads a scalar single-precision floating point value from the
3013	/// specified address pointed to by \a __a and broadcasts it to the elements
3014	/// of a [8 x float] vector.
3015	///
3016	/// \headerfile <x86intrin.h>
3017	///
3018	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3019	///
3020	/// \param __a
3021	/// The single-precision floating point value to be broadcast.
3022	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3023	/// equal to the broadcast value.
3024	static __inline __m256 __DEFAULT_FN_ATTRS
3025	_mm256_broadcast_ss(float const *__a)
3026	{
3027	float __f = *__a;
3028	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3029	}
3030
3031	/// Loads the data from a 128-bit vector of [2 x double] from the
3032	/// specified address pointed to by \a __a and broadcasts it to 128-bit
3033	/// elements in a 256-bit vector of [4 x double].
3034	///
3035	/// \headerfile <x86intrin.h>
3036	///
3037	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3038	///
3039	/// \param __a
3040	/// The 128-bit vector of [2 x double] to be broadcast.
3041	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3042	/// equal to the broadcast value.
3043	static __inline __m256d __DEFAULT_FN_ATTRS
3044	_mm256_broadcast_pd(__m128d const *__a)
3045	{
3046	__m128d __b = _mm_loadu_pd((const double *)__a);
3047	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3048	0, 1, 0, 1);
3049	}
3050
3051	/// Loads the data from a 128-bit vector of [4 x float] from the
3052	/// specified address pointed to by \a __a and broadcasts it to 128-bit
3053	/// elements in a 256-bit vector of [8 x float].
3054	///
3055	/// \headerfile <x86intrin.h>
3056	///
3057	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3058	///
3059	/// \param __a
3060	/// The 128-bit vector of [4 x float] to be broadcast.
3061	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3062	/// equal to the broadcast value.
3063	static __inline __m256 __DEFAULT_FN_ATTRS
3064	_mm256_broadcast_ps(__m128 const *__a)
3065	{
3066	__m128 __b = _mm_loadu_ps((const float *)__a);
3067	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3068	0, 1, 2, 3, 0, 1, 2, 3);
3069	}
3070
3071	/* SIMD load ops */
3072	/// Loads 4 double-precision floating point values from a 32-byte aligned
3073	/// memory location pointed to by \a __p into a vector of [4 x double].
3074	///
3075	/// \headerfile <x86intrin.h>
3076	///
3077	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3078	///
3079	/// \param __p
3080	/// A 32-byte aligned pointer to a memory location containing
3081	/// double-precision floating point values.
3082	/// \returns A 256-bit vector of [4 x double] containing the moved values.
3083	static __inline __m256d __DEFAULT_FN_ATTRS
3084	_mm256_load_pd(double const *__p)
3085	{
3086	return (__m256d )__p;
3087	}
3088
3089	/// Loads 8 single-precision floating point values from a 32-byte aligned
3090	/// memory location pointed to by \a __p into a vector of [8 x float].
3091	///
3092	/// \headerfile <x86intrin.h>
3093	///
3094	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3095	///
3096	/// \param __p
3097	/// A 32-byte aligned pointer to a memory location containing float values.
3098	/// \returns A 256-bit vector of [8 x float] containing the moved values.
3099	static __inline __m256 __DEFAULT_FN_ATTRS
3100	_mm256_load_ps(float const *__p)
3101	{
3102	return (__m256 )__p;
3103	}
3104
3105	/// Loads 4 double-precision floating point values from an unaligned
3106	/// memory location pointed to by \a __p into a vector of [4 x double].
3107	///
3108	/// \headerfile <x86intrin.h>
3109	///
3110	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3111	///
3112	/// \param __p
3113	/// A pointer to a memory location containing double-precision floating
3114	/// point values.
3115	/// \returns A 256-bit vector of [4 x double] containing the moved values.
3116	static __inline __m256d __DEFAULT_FN_ATTRS
3117	_mm256_loadu_pd(double const *__p)
3118	{
3119	struct __loadu_pd {
3120	__m256d_u __v;
3121	} __attribute__((__packed__, __may_alias__));
3122	return ((struct __loadu_pd*)__p)->__v;
3123	}
3124
3125	/// Loads 8 single-precision floating point values from an unaligned
3126	/// memory location pointed to by \a __p into a vector of [8 x float].
3127	///
3128	/// \headerfile <x86intrin.h>
3129	///
3130	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3131	///
3132	/// \param __p
3133	/// A pointer to a memory location containing single-precision floating
3134	/// point values.
3135	/// \returns A 256-bit vector of [8 x float] containing the moved values.
3136	static __inline __m256 __DEFAULT_FN_ATTRS
3137	_mm256_loadu_ps(float const *__p)
3138	{
3139	struct __loadu_ps {
3140	__m256_u __v;
3141	} __attribute__((__packed__, __may_alias__));
3142	return ((struct __loadu_ps*)__p)->__v;
3143	}
3144
3145	/// Loads 256 bits of integer data from a 32-byte aligned memory
3146	/// location pointed to by \a __p into elements of a 256-bit integer vector.
3147	///
3148	/// \headerfile <x86intrin.h>
3149	///
3150	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3151	///
3152	/// \param __p
3153	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3154	/// values.
3155	/// \returns A 256-bit integer vector containing the moved values.
3156	static __inline __m256i __DEFAULT_FN_ATTRS
3157	_mm256_load_si256(__m256i const *__p)
3158	{
3159	return *__p;
3160	}
3161
3162	/// Loads 256 bits of integer data from an unaligned memory location
3163	/// pointed to by \a __p into a 256-bit integer vector.
3164	///
3165	/// \headerfile <x86intrin.h>
3166	///
3167	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3168	///
3169	/// \param __p
3170	/// A pointer to a 256-bit integer vector containing integer values.
3171	/// \returns A 256-bit integer vector containing the moved values.
3172	static __inline __m256i __DEFAULT_FN_ATTRS
3173	_mm256_loadu_si256(__m256i_u const *__p)
3174	{
3175	struct __loadu_si256 {
3176	__m256i_u __v;
3177	} __attribute__((__packed__, __may_alias__));
3178	return ((struct __loadu_si256*)__p)->__v;
3179	}
3180
3181	/// Loads 256 bits of integer data from an unaligned memory location
3182	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3183	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3184	/// line boundary.
3185	///
3186	/// \headerfile <x86intrin.h>
3187	///
3188	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3189	///
3190	/// \param __p
3191	/// A pointer to a 256-bit integer vector containing integer values.
3192	/// \returns A 256-bit integer vector containing the moved values.
3193	static __inline __m256i __DEFAULT_FN_ATTRS
3194	_mm256_lddqu_si256(__m256i const *__p)
3195	{
3196	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3197	}
3198
3199	/* SIMD store ops */
3200	/// Stores double-precision floating point values from a 256-bit vector
3201	/// of [4 x double] to a 32-byte aligned memory location pointed to by
3202	/// \a __p.
3203	///
3204	/// \headerfile <x86intrin.h>
3205	///
3206	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3207	///
3208	/// \param __p
3209	/// A 32-byte aligned pointer to a memory location that will receive the
3210	/// double-precision floaing point values.
3211	/// \param __a
3212	/// A 256-bit vector of [4 x double] containing the values to be moved.
3213	static __inline void __DEFAULT_FN_ATTRS
3214	_mm256_store_pd(double *__p, __m256d __a)
3215	{
3216	(__m256d )__p = __a;
3217	}
3218
3219	/// Stores single-precision floating point values from a 256-bit vector
3220	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3221	///
3222	/// \headerfile <x86intrin.h>
3223	///
3224	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3225	///
3226	/// \param __p
3227	/// A 32-byte aligned pointer to a memory location that will receive the
3228	/// float values.
3229	/// \param __a
3230	/// A 256-bit vector of [8 x float] containing the values to be moved.
3231	static __inline void __DEFAULT_FN_ATTRS
3232	_mm256_store_ps(float *__p, __m256 __a)
3233	{
3234	(__m256 )__p = __a;
3235	}
3236
3237	/// Stores double-precision floating point values from a 256-bit vector
3238	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3239	///
3240	/// \headerfile <x86intrin.h>
3241	///
3242	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3243	///
3244	/// \param __p
3245	/// A pointer to a memory location that will receive the double-precision
3246	/// floating point values.
3247	/// \param __a
3248	/// A 256-bit vector of [4 x double] containing the values to be moved.
3249	static __inline void __DEFAULT_FN_ATTRS
3250	_mm256_storeu_pd(double *__p, __m256d __a)
3251	{
3252	struct __storeu_pd {
3253	__m256d_u __v;
3254	} __attribute__((__packed__, __may_alias__));
3255	((struct __storeu_pd*)__p)->__v = __a;
3256	}
3257
3258	/// Stores single-precision floating point values from a 256-bit vector
3259	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3260	///
3261	/// \headerfile <x86intrin.h>
3262	///
3263	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3264	///
3265	/// \param __p
3266	/// A pointer to a memory location that will receive the float values.
3267	/// \param __a
3268	/// A 256-bit vector of [8 x float] containing the values to be moved.
3269	static __inline void __DEFAULT_FN_ATTRS
3270	_mm256_storeu_ps(float *__p, __m256 __a)
3271	{
3272	struct __storeu_ps {
3273	__m256_u __v;
3274	} __attribute__((__packed__, __may_alias__));
3275	((struct __storeu_ps*)__p)->__v = __a;
3276	}
3277
3278	/// Stores integer values from a 256-bit integer vector to a 32-byte
3279	/// aligned memory location pointed to by \a __p.
3280	///
3281	/// \headerfile <x86intrin.h>
3282	///
3283	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3284	///
3285	/// \param __p
3286	/// A 32-byte aligned pointer to a memory location that will receive the
3287	/// integer values.
3288	/// \param __a
3289	/// A 256-bit integer vector containing the values to be moved.
3290	static __inline void __DEFAULT_FN_ATTRS
3291	_mm256_store_si256(__m256i *__p, __m256i __a)
3292	{
3293	*__p = __a;
3294	}
3295
3296	/// Stores integer values from a 256-bit integer vector to an unaligned
3297	/// memory location pointed to by \a __p.
3298	///
3299	/// \headerfile <x86intrin.h>
3300	///
3301	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3302	///
3303	/// \param __p
3304	/// A pointer to a memory location that will receive the integer values.
3305	/// \param __a
3306	/// A 256-bit integer vector containing the values to be moved.
3307	static __inline void __DEFAULT_FN_ATTRS
3308	_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3309	{
3310	struct __storeu_si256 {
3311	__m256i_u __v;
3312	} __attribute__((__packed__, __may_alias__));
3313	((struct __storeu_si256*)__p)->__v = __a;
3314	}
3315
3316	/* Conditional load ops */
3317	/// Conditionally loads double-precision floating point elements from a
3318	/// memory location pointed to by \a __p into a 128-bit vector of
3319	/// [2 x double], depending on the mask bits associated with each data
3320	/// element.
3321	///
3322	/// \headerfile <x86intrin.h>
3323	///
3324	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3325	///
3326	/// \param __p
3327	/// A pointer to a memory location that contains the double-precision
3328	/// floating point values.
3329	/// \param __m
3330	/// A 128-bit integer vector containing the mask. The most significant bit of
3331	/// each data element represents the mask bits. If a mask bit is zero, the
3332	/// corresponding value in the memory location is not loaded and the
3333	/// corresponding field in the return value is set to zero.
3334	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3335	static __inline __m128d __DEFAULT_FN_ATTRS128
3336	_mm_maskload_pd(double const *__p, __m128i __m)
3337	{
3338	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3339	}
3340
3341	/// Conditionally loads double-precision floating point elements from a
3342	/// memory location pointed to by \a __p into a 256-bit vector of
3343	/// [4 x double], depending on the mask bits associated with each data
3344	/// element.
3345	///
3346	/// \headerfile <x86intrin.h>
3347	///
3348	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3349	///
3350	/// \param __p
3351	/// A pointer to a memory location that contains the double-precision
3352	/// floating point values.
3353	/// \param __m
3354	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3355	/// significant bit of each quadword element represents the mask bits. If a
3356	/// mask bit is zero, the corresponding value in the memory location is not
3357	/// loaded and the corresponding field in the return value is set to zero.
3358	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3359	static __inline __m256d __DEFAULT_FN_ATTRS
3360	_mm256_maskload_pd(double const *__p, __m256i __m)
3361	{
3362	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3363	(__v4di)__m);
3364	}
3365
3366	/// Conditionally loads single-precision floating point elements from a
3367	/// memory location pointed to by \a __p into a 128-bit vector of
3368	/// [4 x float], depending on the mask bits associated with each data
3369	/// element.
3370	///
3371	/// \headerfile <x86intrin.h>
3372	///
3373	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3374	///
3375	/// \param __p
3376	/// A pointer to a memory location that contains the single-precision
3377	/// floating point values.
3378	/// \param __m
3379	/// A 128-bit integer vector containing the mask. The most significant bit of
3380	/// each data element represents the mask bits. If a mask bit is zero, the
3381	/// corresponding value in the memory location is not loaded and the
3382	/// corresponding field in the return value is set to zero.
3383	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3384	static __inline __m128 __DEFAULT_FN_ATTRS128
3385	_mm_maskload_ps(float const *__p, __m128i __m)
3386	{
3387	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3388	}
3389
3390	/// Conditionally loads single-precision floating point elements from a
3391	/// memory location pointed to by \a __p into a 256-bit vector of
3392	/// [8 x float], depending on the mask bits associated with each data
3393	/// element.
3394	///
3395	/// \headerfile <x86intrin.h>
3396	///
3397	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3398	///
3399	/// \param __p
3400	/// A pointer to a memory location that contains the single-precision
3401	/// floating point values.
3402	/// \param __m
3403	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3404	/// significant bit of each dword element represents the mask bits. If a mask
3405	/// bit is zero, the corresponding value in the memory location is not loaded
3406	/// and the corresponding field in the return value is set to zero.
3407	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3408	static __inline __m256 __DEFAULT_FN_ATTRS
3409	_mm256_maskload_ps(float const *__p, __m256i __m)
3410	{
3411	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3412	}
3413
3414	/* Conditional store ops */
3415	/// Moves single-precision floating point values from a 256-bit vector
3416	/// of [8 x float] to a memory location pointed to by \a __p, according to
3417	/// the specified mask.
3418	///
3419	/// \headerfile <x86intrin.h>
3420	///
3421	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3422	///
3423	/// \param __p
3424	/// A pointer to a memory location that will receive the float values.
3425	/// \param __m
3426	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3427	/// significant bit of each dword element in the mask vector represents the
3428	/// mask bits. If a mask bit is zero, the corresponding value from vector
3429	/// \a __a is not stored and the corresponding field in the memory location
3430	/// pointed to by \a __p is not changed.
3431	/// \param __a
3432	/// A 256-bit vector of [8 x float] containing the values to be stored.
3433	static __inline void __DEFAULT_FN_ATTRS
3434	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3435	{
3436	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3437	}
3438
3439	/// Moves double-precision values from a 128-bit vector of [2 x double]
3440	/// to a memory location pointed to by \a __p, according to the specified
3441	/// mask.
3442	///
3443	/// \headerfile <x86intrin.h>
3444	///
3445	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3446	///
3447	/// \param __p
3448	/// A pointer to a memory location that will receive the float values.
3449	/// \param __m
3450	/// A 128-bit integer vector containing the mask. The most significant bit of
3451	/// each field in the mask vector represents the mask bits. If a mask bit is
3452	/// zero, the corresponding value from vector \a __a is not stored and the
3453	/// corresponding field in the memory location pointed to by \a __p is not
3454	/// changed.
3455	/// \param __a
3456	/// A 128-bit vector of [2 x double] containing the values to be stored.
3457	static __inline void __DEFAULT_FN_ATTRS128
3458	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3459	{
3460	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3461	}
3462
3463	/// Moves double-precision values from a 256-bit vector of [4 x double]
3464	/// to a memory location pointed to by \a __p, according to the specified
3465	/// mask.
3466	///
3467	/// \headerfile <x86intrin.h>
3468	///
3469	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3470	///
3471	/// \param __p
3472	/// A pointer to a memory location that will receive the float values.
3473	/// \param __m
3474	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3475	/// significant bit of each quadword element in the mask vector represents
3476	/// the mask bits. If a mask bit is zero, the corresponding value from vector
3477	/// __a is not stored and the corresponding field in the memory location
3478	/// pointed to by \a __p is not changed.
3479	/// \param __a
3480	/// A 256-bit vector of [4 x double] containing the values to be stored.
3481	static __inline void __DEFAULT_FN_ATTRS
3482	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3483	{
3484	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3485	}
3486
3487	/// Moves single-precision floating point values from a 128-bit vector
3488	/// of [4 x float] to a memory location pointed to by \a __p, according to
3489	/// the specified mask.
3490	///
3491	/// \headerfile <x86intrin.h>
3492	///
3493	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3494	///
3495	/// \param __p
3496	/// A pointer to a memory location that will receive the float values.
3497	/// \param __m
3498	/// A 128-bit integer vector containing the mask. The most significant bit of
3499	/// each field in the mask vector represents the mask bits. If a mask bit is
3500	/// zero, the corresponding value from vector __a is not stored and the
3501	/// corresponding field in the memory location pointed to by \a __p is not
3502	/// changed.
3503	/// \param __a
3504	/// A 128-bit vector of [4 x float] containing the values to be stored.
3505	static __inline void __DEFAULT_FN_ATTRS128
3506	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3507	{
3508	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3509	}
3510
3511	/* Cacheability support ops */
3512	/// Moves integer data from a 256-bit integer vector to a 32-byte
3513	/// aligned memory location. To minimize caching, the data is flagged as
3514	/// non-temporal (unlikely to be used again soon).
3515	///
3516	/// \headerfile <x86intrin.h>
3517	///
3518	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3519	///
3520	/// \param __a
3521	/// A pointer to a 32-byte aligned memory location that will receive the
3522	/// integer values.
3523	/// \param __b
3524	/// A 256-bit integer vector containing the values to be moved.
3525	static __inline void __DEFAULT_FN_ATTRS
3526	_mm256_stream_si256(__m256i *__a, __m256i __b)
3527	{
3528	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3529	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3530	}
3531
3532	/// Moves double-precision values from a 256-bit vector of [4 x double]
3533	/// to a 32-byte aligned memory location. To minimize caching, the data is
3534	/// flagged as non-temporal (unlikely to be used again soon).
3535	///
3536	/// \headerfile <x86intrin.h>
3537	///
3538	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3539	///
3540	/// \param __a
3541	/// A pointer to a 32-byte aligned memory location that will receive the
3542	/// double-precision floating-point values.
3543	/// \param __b
3544	/// A 256-bit vector of [4 x double] containing the values to be moved.
3545	static __inline void __DEFAULT_FN_ATTRS
3546	_mm256_stream_pd(double *__a, __m256d __b)
3547	{
3548	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3549	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3550	}
3551
3552	/// Moves single-precision floating point values from a 256-bit vector
3553	/// of [8 x float] to a 32-byte aligned memory location. To minimize
3554	/// caching, the data is flagged as non-temporal (unlikely to be used again
3555	/// soon).
3556	///
3557	/// \headerfile <x86intrin.h>
3558	///
3559	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3560	///
3561	/// \param __p
3562	/// A pointer to a 32-byte aligned memory location that will receive the
3563	/// single-precision floating point values.
3564	/// \param __a
3565	/// A 256-bit vector of [8 x float] containing the values to be moved.
3566	static __inline void __DEFAULT_FN_ATTRS
3567	_mm256_stream_ps(float *__p, __m256 __a)
3568	{
3569	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3570	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3571	}
3572
3573	/* Create vectors */
3574	/// Create a 256-bit vector of [4 x double] with undefined values.
3575	///
3576	/// \headerfile <x86intrin.h>
3577	///
3578	/// This intrinsic has no corresponding instruction.
3579	///
3580	/// \returns A 256-bit vector of [4 x double] containing undefined values.
3581	static __inline__ __m256d __DEFAULT_FN_ATTRS
3582	_mm256_undefined_pd(void)
3583	{
3584	return (__m256d)__builtin_ia32_undef256();
3585	}
3586
3587	/// Create a 256-bit vector of [8 x float] with undefined values.
3588	///
3589	/// \headerfile <x86intrin.h>
3590	///
3591	/// This intrinsic has no corresponding instruction.
3592	///
3593	/// \returns A 256-bit vector of [8 x float] containing undefined values.
3594	static __inline__ __m256 __DEFAULT_FN_ATTRS
3595	_mm256_undefined_ps(void)
3596	{
3597	return (__m256)__builtin_ia32_undef256();
3598	}
3599
3600	/// Create a 256-bit integer vector with undefined values.
3601	///
3602	/// \headerfile <x86intrin.h>
3603	///
3604	/// This intrinsic has no corresponding instruction.
3605	///
3606	/// \returns A 256-bit integer vector containing undefined values.
3607	static __inline__ __m256i __DEFAULT_FN_ATTRS
3608	_mm256_undefined_si256(void)
3609	{
3610	return (__m256i)__builtin_ia32_undef256();
3611	}
3612
3613	/// Constructs a 256-bit floating-point vector of [4 x double]
3614	/// initialized with the specified double-precision floating-point values.
3615	///
3616	/// \headerfile <x86intrin.h>
3617	///
3618	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3619	/// instruction.
3620	///
3621	/// \param __a
3622	/// A double-precision floating-point value used to initialize bits [255:192]
3623	/// of the result.
3624	/// \param __b
3625	/// A double-precision floating-point value used to initialize bits [191:128]
3626	/// of the result.
3627	/// \param __c
3628	/// A double-precision floating-point value used to initialize bits [127:64]
3629	/// of the result.
3630	/// \param __d
3631	/// A double-precision floating-point value used to initialize bits [63:0]
3632	/// of the result.
3633	/// \returns An initialized 256-bit floating-point vector of [4 x double].
3634	static __inline __m256d __DEFAULT_FN_ATTRS
3635	_mm256_set_pd(double __a, double __b, double __c, double __d)
3636	{
3637	return __extension__ (__m256d){ __d, __c, __b, __a };
3638	}
3639
3640	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3641	/// with the specified single-precision floating-point values.
3642	///
3643	/// \headerfile <x86intrin.h>
3644	///
3645	/// This intrinsic is a utility function and does not correspond to a specific
3646	/// instruction.
3647	///
3648	/// \param __a
3649	/// A single-precision floating-point value used to initialize bits [255:224]
3650	/// of the result.
3651	/// \param __b
3652	/// A single-precision floating-point value used to initialize bits [223:192]
3653	/// of the result.
3654	/// \param __c
3655	/// A single-precision floating-point value used to initialize bits [191:160]
3656	/// of the result.
3657	/// \param __d
3658	/// A single-precision floating-point value used to initialize bits [159:128]
3659	/// of the result.
3660	/// \param __e
3661	/// A single-precision floating-point value used to initialize bits [127:96]
3662	/// of the result.
3663	/// \param __f
3664	/// A single-precision floating-point value used to initialize bits [95:64]
3665	/// of the result.
3666	/// \param __g
3667	/// A single-precision floating-point value used to initialize bits [63:32]
3668	/// of the result.
3669	/// \param __h
3670	/// A single-precision floating-point value used to initialize bits [31:0]
3671	/// of the result.
3672	/// \returns An initialized 256-bit floating-point vector of [8 x float].
3673	static __inline __m256 __DEFAULT_FN_ATTRS
3674	_mm256_set_ps(float __a, float __b, float __c, float __d,
3675	float __e, float __f, float __g, float __h)
3676	{
3677	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3678	}
3679
3680	/// Constructs a 256-bit integer vector initialized with the specified
3681	/// 32-bit integral values.
3682	///
3683	/// \headerfile <x86intrin.h>
3684	///
3685	/// This intrinsic is a utility function and does not correspond to a specific
3686	/// instruction.
3687	///
3688	/// \param __i0
3689	/// A 32-bit integral value used to initialize bits [255:224] of the result.
3690	/// \param __i1
3691	/// A 32-bit integral value used to initialize bits [223:192] of the result.
3692	/// \param __i2
3693	/// A 32-bit integral value used to initialize bits [191:160] of the result.
3694	/// \param __i3
3695	/// A 32-bit integral value used to initialize bits [159:128] of the result.
3696	/// \param __i4
3697	/// A 32-bit integral value used to initialize bits [127:96] of the result.
3698	/// \param __i5
3699	/// A 32-bit integral value used to initialize bits [95:64] of the result.
3700	/// \param __i6
3701	/// A 32-bit integral value used to initialize bits [63:32] of the result.
3702	/// \param __i7
3703	/// A 32-bit integral value used to initialize bits [31:0] of the result.
3704	/// \returns An initialized 256-bit integer vector.
3705	static __inline __m256i __DEFAULT_FN_ATTRS
3706	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3707	int __i4, int __i5, int __i6, int __i7)
3708	{
3709	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3710	}
3711
3712	/// Constructs a 256-bit integer vector initialized with the specified
3713	/// 16-bit integral values.
3714	///
3715	/// \headerfile <x86intrin.h>
3716	///
3717	/// This intrinsic is a utility function and does not correspond to a specific
3718	/// instruction.
3719	///
3720	/// \param __w15
3721	/// A 16-bit integral value used to initialize bits [255:240] of the result.
3722	/// \param __w14
3723	/// A 16-bit integral value used to initialize bits [239:224] of the result.
3724	/// \param __w13
3725	/// A 16-bit integral value used to initialize bits [223:208] of the result.
3726	/// \param __w12
3727	/// A 16-bit integral value used to initialize bits [207:192] of the result.
3728	/// \param __w11
3729	/// A 16-bit integral value used to initialize bits [191:176] of the result.
3730	/// \param __w10
3731	/// A 16-bit integral value used to initialize bits [175:160] of the result.
3732	/// \param __w09
3733	/// A 16-bit integral value used to initialize bits [159:144] of the result.
3734	/// \param __w08
3735	/// A 16-bit integral value used to initialize bits [143:128] of the result.
3736	/// \param __w07
3737	/// A 16-bit integral value used to initialize bits [127:112] of the result.
3738	/// \param __w06
3739	/// A 16-bit integral value used to initialize bits [111:96] of the result.
3740	/// \param __w05
3741	/// A 16-bit integral value used to initialize bits [95:80] of the result.
3742	/// \param __w04
3743	/// A 16-bit integral value used to initialize bits [79:64] of the result.
3744	/// \param __w03
3745	/// A 16-bit integral value used to initialize bits [63:48] of the result.
3746	/// \param __w02
3747	/// A 16-bit integral value used to initialize bits [47:32] of the result.
3748	/// \param __w01
3749	/// A 16-bit integral value used to initialize bits [31:16] of the result.
3750	/// \param __w00
3751	/// A 16-bit integral value used to initialize bits [15:0] of the result.
3752	/// \returns An initialized 256-bit integer vector.
3753	static __inline __m256i __DEFAULT_FN_ATTRS
3754	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3755	short __w11, short __w10, short __w09, short __w08,
3756	short __w07, short __w06, short __w05, short __w04,
3757	short __w03, short __w02, short __w01, short __w00)
3758	{
3759	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3760	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3761	}
3762
3763	/// Constructs a 256-bit integer vector initialized with the specified
3764	/// 8-bit integral values.
3765	///
3766	/// \headerfile <x86intrin.h>
3767	///
3768	/// This intrinsic is a utility function and does not correspond to a specific
3769	/// instruction.
3770	///
3771	/// \param __b31
3772	/// An 8-bit integral value used to initialize bits [255:248] of the result.
3773	/// \param __b30
3774	/// An 8-bit integral value used to initialize bits [247:240] of the result.
3775	/// \param __b29
3776	/// An 8-bit integral value used to initialize bits [239:232] of the result.
3777	/// \param __b28
3778	/// An 8-bit integral value used to initialize bits [231:224] of the result.
3779	/// \param __b27
3780	/// An 8-bit integral value used to initialize bits [223:216] of the result.
3781	/// \param __b26
3782	/// An 8-bit integral value used to initialize bits [215:208] of the result.
3783	/// \param __b25
3784	/// An 8-bit integral value used to initialize bits [207:200] of the result.
3785	/// \param __b24
3786	/// An 8-bit integral value used to initialize bits [199:192] of the result.
3787	/// \param __b23
3788	/// An 8-bit integral value used to initialize bits [191:184] of the result.
3789	/// \param __b22
3790	/// An 8-bit integral value used to initialize bits [183:176] of the result.
3791	/// \param __b21
3792	/// An 8-bit integral value used to initialize bits [175:168] of the result.
3793	/// \param __b20
3794	/// An 8-bit integral value used to initialize bits [167:160] of the result.
3795	/// \param __b19
3796	/// An 8-bit integral value used to initialize bits [159:152] of the result.
3797	/// \param __b18
3798	/// An 8-bit integral value used to initialize bits [151:144] of the result.
3799	/// \param __b17
3800	/// An 8-bit integral value used to initialize bits [143:136] of the result.
3801	/// \param __b16
3802	/// An 8-bit integral value used to initialize bits [135:128] of the result.
3803	/// \param __b15
3804	/// An 8-bit integral value used to initialize bits [127:120] of the result.
3805	/// \param __b14
3806	/// An 8-bit integral value used to initialize bits [119:112] of the result.
3807	/// \param __b13
3808	/// An 8-bit integral value used to initialize bits [111:104] of the result.
3809	/// \param __b12
3810	/// An 8-bit integral value used to initialize bits [103:96] of the result.
3811	/// \param __b11
3812	/// An 8-bit integral value used to initialize bits [95:88] of the result.
3813	/// \param __b10
3814	/// An 8-bit integral value used to initialize bits [87:80] of the result.
3815	/// \param __b09
3816	/// An 8-bit integral value used to initialize bits [79:72] of the result.
3817	/// \param __b08
3818	/// An 8-bit integral value used to initialize bits [71:64] of the result.
3819	/// \param __b07
3820	/// An 8-bit integral value used to initialize bits [63:56] of the result.
3821	/// \param __b06
3822	/// An 8-bit integral value used to initialize bits [55:48] of the result.
3823	/// \param __b05
3824	/// An 8-bit integral value used to initialize bits [47:40] of the result.
3825	/// \param __b04
3826	/// An 8-bit integral value used to initialize bits [39:32] of the result.
3827	/// \param __b03
3828	/// An 8-bit integral value used to initialize bits [31:24] of the result.
3829	/// \param __b02
3830	/// An 8-bit integral value used to initialize bits [23:16] of the result.
3831	/// \param __b01
3832	/// An 8-bit integral value used to initialize bits [15:8] of the result.
3833	/// \param __b00
3834	/// An 8-bit integral value used to initialize bits [7:0] of the result.
3835	/// \returns An initialized 256-bit integer vector.
3836	static __inline __m256i __DEFAULT_FN_ATTRS
3837	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3838	char __b27, char __b26, char __b25, char __b24,
3839	char __b23, char __b22, char __b21, char __b20,
3840	char __b19, char __b18, char __b17, char __b16,
3841	char __b15, char __b14, char __b13, char __b12,
3842	char __b11, char __b10, char __b09, char __b08,
3843	char __b07, char __b06, char __b05, char __b04,
3844	char __b03, char __b02, char __b01, char __b00)
3845	{
3846	return __extension__ (__m256i)(__v32qi){
3847	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3848	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3849	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3850	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3851	};
3852	}
3853
3854	/// Constructs a 256-bit integer vector initialized with the specified
3855	/// 64-bit integral values.
3856	///
3857	/// \headerfile <x86intrin.h>
3858	///
3859	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3860	/// instruction.
3861	///
3862	/// \param __a
3863	/// A 64-bit integral value used to initialize bits [255:192] of the result.
3864	/// \param __b
3865	/// A 64-bit integral value used to initialize bits [191:128] of the result.
3866	/// \param __c
3867	/// A 64-bit integral value used to initialize bits [127:64] of the result.
3868	/// \param __d
3869	/// A 64-bit integral value used to initialize bits [63:0] of the result.
3870	/// \returns An initialized 256-bit integer vector.
3871	static __inline __m256i __DEFAULT_FN_ATTRS
3872	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3873	{
3874	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3875	}
3876
3877	/* Create vectors with elements in reverse order */
3878	/// Constructs a 256-bit floating-point vector of [4 x double],
3879	/// initialized in reverse order with the specified double-precision
3880	/// floating-point values.
3881	///
3882	/// \headerfile <x86intrin.h>
3883	///
3884	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3885	/// instruction.
3886	///
3887	/// \param __a
3888	/// A double-precision floating-point value used to initialize bits [63:0]
3889	/// of the result.
3890	/// \param __b
3891	/// A double-precision floating-point value used to initialize bits [127:64]
3892	/// of the result.
3893	/// \param __c
3894	/// A double-precision floating-point value used to initialize bits [191:128]
3895	/// of the result.
3896	/// \param __d
3897	/// A double-precision floating-point value used to initialize bits [255:192]
3898	/// of the result.
3899	/// \returns An initialized 256-bit floating-point vector of [4 x double].
3900	static __inline __m256d __DEFAULT_FN_ATTRS
3901	_mm256_setr_pd(double __a, double __b, double __c, double __d)
3902	{
3903	return _mm256_set_pd(__d, __c, __b, __a);
3904	}
3905
3906	/// Constructs a 256-bit floating-point vector of [8 x float],
3907	/// initialized in reverse order with the specified single-precision
3908	/// float-point values.
3909	///
3910	/// \headerfile <x86intrin.h>
3911	///
3912	/// This intrinsic is a utility function and does not correspond to a specific
3913	/// instruction.
3914	///
3915	/// \param __a
3916	/// A single-precision floating-point value used to initialize bits [31:0]
3917	/// of the result.
3918	/// \param __b
3919	/// A single-precision floating-point value used to initialize bits [63:32]
3920	/// of the result.
3921	/// \param __c
3922	/// A single-precision floating-point value used to initialize bits [95:64]
3923	/// of the result.
3924	/// \param __d
3925	/// A single-precision floating-point value used to initialize bits [127:96]
3926	/// of the result.
3927	/// \param __e
3928	/// A single-precision floating-point value used to initialize bits [159:128]
3929	/// of the result.
3930	/// \param __f
3931	/// A single-precision floating-point value used to initialize bits [191:160]
3932	/// of the result.
3933	/// \param __g
3934	/// A single-precision floating-point value used to initialize bits [223:192]
3935	/// of the result.
3936	/// \param __h
3937	/// A single-precision floating-point value used to initialize bits [255:224]
3938	/// of the result.
3939	/// \returns An initialized 256-bit floating-point vector of [8 x float].
3940	static __inline __m256 __DEFAULT_FN_ATTRS
3941	_mm256_setr_ps(float __a, float __b, float __c, float __d,
3942	float __e, float __f, float __g, float __h)
3943	{
3944	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3945	}
3946
3947	/// Constructs a 256-bit integer vector, initialized in reverse order
3948	/// with the specified 32-bit integral values.
3949	///
3950	/// \headerfile <x86intrin.h>
3951	///
3952	/// This intrinsic is a utility function and does not correspond to a specific
3953	/// instruction.
3954	///
3955	/// \param __i0
3956	/// A 32-bit integral value used to initialize bits [31:0] of the result.
3957	/// \param __i1
3958	/// A 32-bit integral value used to initialize bits [63:32] of the result.
3959	/// \param __i2
3960	/// A 32-bit integral value used to initialize bits [95:64] of the result.
3961	/// \param __i3
3962	/// A 32-bit integral value used to initialize bits [127:96] of the result.
3963	/// \param __i4
3964	/// A 32-bit integral value used to initialize bits [159:128] of the result.
3965	/// \param __i5
3966	/// A 32-bit integral value used to initialize bits [191:160] of the result.
3967	/// \param __i6
3968	/// A 32-bit integral value used to initialize bits [223:192] of the result.
3969	/// \param __i7
3970	/// A 32-bit integral value used to initialize bits [255:224] of the result.
3971	/// \returns An initialized 256-bit integer vector.
3972	static __inline __m256i __DEFAULT_FN_ATTRS
3973	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3974	int __i4, int __i5, int __i6, int __i7)
3975	{
3976	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3977	}
3978
3979	/// Constructs a 256-bit integer vector, initialized in reverse order
3980	/// with the specified 16-bit integral values.
3981	///
3982	/// \headerfile <x86intrin.h>
3983	///
3984	/// This intrinsic is a utility function and does not correspond to a specific
3985	/// instruction.
3986	///
3987	/// \param __w15
3988	/// A 16-bit integral value used to initialize bits [15:0] of the result.
3989	/// \param __w14
3990	/// A 16-bit integral value used to initialize bits [31:16] of the result.
3991	/// \param __w13
3992	/// A 16-bit integral value used to initialize bits [47:32] of the result.
3993	/// \param __w12
3994	/// A 16-bit integral value used to initialize bits [63:48] of the result.
3995	/// \param __w11
3996	/// A 16-bit integral value used to initialize bits [79:64] of the result.
3997	/// \param __w10
3998	/// A 16-bit integral value used to initialize bits [95:80] of the result.
3999	/// \param __w09
4000	/// A 16-bit integral value used to initialize bits [111:96] of the result.
4001	/// \param __w08
4002	/// A 16-bit integral value used to initialize bits [127:112] of the result.
4003	/// \param __w07
4004	/// A 16-bit integral value used to initialize bits [143:128] of the result.
4005	/// \param __w06
4006	/// A 16-bit integral value used to initialize bits [159:144] of the result.
4007	/// \param __w05
4008	/// A 16-bit integral value used to initialize bits [175:160] of the result.
4009	/// \param __w04
4010	/// A 16-bit integral value used to initialize bits [191:176] of the result.
4011	/// \param __w03
4012	/// A 16-bit integral value used to initialize bits [207:192] of the result.
4013	/// \param __w02
4014	/// A 16-bit integral value used to initialize bits [223:208] of the result.
4015	/// \param __w01
4016	/// A 16-bit integral value used to initialize bits [239:224] of the result.
4017	/// \param __w00
4018	/// A 16-bit integral value used to initialize bits [255:240] of the result.
4019	/// \returns An initialized 256-bit integer vector.
4020	static __inline __m256i __DEFAULT_FN_ATTRS
4021	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4022	short __w11, short __w10, short __w09, short __w08,
4023	short __w07, short __w06, short __w05, short __w04,
4024	short __w03, short __w02, short __w01, short __w00)
4025	{
4026	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4027	__w04, __w05, __w06, __w07,
4028	__w08, __w09, __w10, __w11,
4029	__w12, __w13, __w14, __w15);
4030	}
4031
4032	/// Constructs a 256-bit integer vector, initialized in reverse order
4033	/// with the specified 8-bit integral values.
4034	///
4035	/// \headerfile <x86intrin.h>
4036	///
4037	/// This intrinsic is a utility function and does not correspond to a specific
4038	/// instruction.
4039	///
4040	/// \param __b31
4041	/// An 8-bit integral value used to initialize bits [7:0] of the result.
4042	/// \param __b30
4043	/// An 8-bit integral value used to initialize bits [15:8] of the result.
4044	/// \param __b29
4045	/// An 8-bit integral value used to initialize bits [23:16] of the result.
4046	/// \param __b28
4047	/// An 8-bit integral value used to initialize bits [31:24] of the result.
4048	/// \param __b27
4049	/// An 8-bit integral value used to initialize bits [39:32] of the result.
4050	/// \param __b26
4051	/// An 8-bit integral value used to initialize bits [47:40] of the result.
4052	/// \param __b25
4053	/// An 8-bit integral value used to initialize bits [55:48] of the result.
4054	/// \param __b24
4055	/// An 8-bit integral value used to initialize bits [63:56] of the result.
4056	/// \param __b23
4057	/// An 8-bit integral value used to initialize bits [71:64] of the result.
4058	/// \param __b22
4059	/// An 8-bit integral value used to initialize bits [79:72] of the result.
4060	/// \param __b21
4061	/// An 8-bit integral value used to initialize bits [87:80] of the result.
4062	/// \param __b20
4063	/// An 8-bit integral value used to initialize bits [95:88] of the result.
4064	/// \param __b19
4065	/// An 8-bit integral value used to initialize bits [103:96] of the result.
4066	/// \param __b18
4067	/// An 8-bit integral value used to initialize bits [111:104] of the result.
4068	/// \param __b17
4069	/// An 8-bit integral value used to initialize bits [119:112] of the result.
4070	/// \param __b16
4071	/// An 8-bit integral value used to initialize bits [127:120] of the result.
4072	/// \param __b15
4073	/// An 8-bit integral value used to initialize bits [135:128] of the result.
4074	/// \param __b14
4075	/// An 8-bit integral value used to initialize bits [143:136] of the result.
4076	/// \param __b13
4077	/// An 8-bit integral value used to initialize bits [151:144] of the result.
4078	/// \param __b12
4079	/// An 8-bit integral value used to initialize bits [159:152] of the result.
4080	/// \param __b11
4081	/// An 8-bit integral value used to initialize bits [167:160] of the result.
4082	/// \param __b10
4083	/// An 8-bit integral value used to initialize bits [175:168] of the result.
4084	/// \param __b09
4085	/// An 8-bit integral value used to initialize bits [183:176] of the result.
4086	/// \param __b08
4087	/// An 8-bit integral value used to initialize bits [191:184] of the result.
4088	/// \param __b07
4089	/// An 8-bit integral value used to initialize bits [199:192] of the result.
4090	/// \param __b06
4091	/// An 8-bit integral value used to initialize bits [207:200] of the result.
4092	/// \param __b05
4093	/// An 8-bit integral value used to initialize bits [215:208] of the result.
4094	/// \param __b04
4095	/// An 8-bit integral value used to initialize bits [223:216] of the result.
4096	/// \param __b03
4097	/// An 8-bit integral value used to initialize bits [231:224] of the result.
4098	/// \param __b02
4099	/// An 8-bit integral value used to initialize bits [239:232] of the result.
4100	/// \param __b01
4101	/// An 8-bit integral value used to initialize bits [247:240] of the result.
4102	/// \param __b00
4103	/// An 8-bit integral value used to initialize bits [255:248] of the result.
4104	/// \returns An initialized 256-bit integer vector.
4105	static __inline __m256i __DEFAULT_FN_ATTRS
4106	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4107	char __b27, char __b26, char __b25, char __b24,
4108	char __b23, char __b22, char __b21, char __b20,
4109	char __b19, char __b18, char __b17, char __b16,
4110	char __b15, char __b14, char __b13, char __b12,
4111	char __b11, char __b10, char __b09, char __b08,
4112	char __b07, char __b06, char __b05, char __b04,
4113	char __b03, char __b02, char __b01, char __b00)
4114	{
4115	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4116	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4117	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4118	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4119	}
4120
4121	/// Constructs a 256-bit integer vector, initialized in reverse order
4122	/// with the specified 64-bit integral values.
4123	///
4124	/// \headerfile <x86intrin.h>
4125	///
4126	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4127	/// instruction.
4128	///
4129	/// \param __a
4130	/// A 64-bit integral value used to initialize bits [63:0] of the result.
4131	/// \param __b
4132	/// A 64-bit integral value used to initialize bits [127:64] of the result.
4133	/// \param __c
4134	/// A 64-bit integral value used to initialize bits [191:128] of the result.
4135	/// \param __d
4136	/// A 64-bit integral value used to initialize bits [255:192] of the result.
4137	/// \returns An initialized 256-bit integer vector.
4138	static __inline __m256i __DEFAULT_FN_ATTRS
4139	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4140	{
4141	return _mm256_set_epi64x(__d, __c, __b, __a);
4142	}
4143
4144	/* Create vectors with repeated elements */
4145	/// Constructs a 256-bit floating-point vector of [4 x double], with each
4146	/// of the four double-precision floating-point vector elements set to the
4147	/// specified double-precision floating-point value.
4148	///
4149	/// \headerfile <x86intrin.h>
4150	///
4151	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4152	///
4153	/// \param __w
4154	/// A double-precision floating-point value used to initialize each vector
4155	/// element of the result.
4156	/// \returns An initialized 256-bit floating-point vector of [4 x double].
4157	static __inline __m256d __DEFAULT_FN_ATTRS
4158	_mm256_set1_pd(double __w)
4159	{
4160	return _mm256_set_pd(__w, __w, __w, __w);
4161	}
4162
4163	/// Constructs a 256-bit floating-point vector of [8 x float], with each
4164	/// of the eight single-precision floating-point vector elements set to the
4165	/// specified single-precision floating-point value.
4166	///
4167	/// \headerfile <x86intrin.h>
4168	///
4169	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4170	/// instruction.
4171	///
4172	/// \param __w
4173	/// A single-precision floating-point value used to initialize each vector
4174	/// element of the result.
4175	/// \returns An initialized 256-bit floating-point vector of [8 x float].
4176	static __inline __m256 __DEFAULT_FN_ATTRS
4177	_mm256_set1_ps(float __w)
4178	{
4179	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4180	}
4181
4182	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4183	/// 32-bit integral vector elements set to the specified 32-bit integral
4184	/// value.
4185	///
4186	/// \headerfile <x86intrin.h>
4187	///
4188	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4189	/// instruction.
4190	///
4191	/// \param __i
4192	/// A 32-bit integral value used to initialize each vector element of the
4193	/// result.
4194	/// \returns An initialized 256-bit integer vector of [8 x i32].
4195	static __inline __m256i __DEFAULT_FN_ATTRS
4196	_mm256_set1_epi32(int __i)
4197	{
4198	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4199	}
4200
4201	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4202	/// 16-bit integral vector elements set to the specified 16-bit integral
4203	/// value.
4204	///
4205	/// \headerfile <x86intrin.h>
4206	///
4207	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4208	///
4209	/// \param __w
4210	/// A 16-bit integral value used to initialize each vector element of the
4211	/// result.
4212	/// \returns An initialized 256-bit integer vector of [16 x i16].
4213	static __inline __m256i __DEFAULT_FN_ATTRS
4214	_mm256_set1_epi16(short __w)
4215	{
4216	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4217	__w, __w, __w, __w, __w, __w, __w, __w);
4218	}
4219
4220	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4221	/// 8-bit integral vector elements set to the specified 8-bit integral value.
4222	///
4223	/// \headerfile <x86intrin.h>
4224	///
4225	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4226	///
4227	/// \param __b
4228	/// An 8-bit integral value used to initialize each vector element of the
4229	/// result.
4230	/// \returns An initialized 256-bit integer vector of [32 x i8].
4231	static __inline __m256i __DEFAULT_FN_ATTRS
4232	_mm256_set1_epi8(char __b)
4233	{
4234	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4235	__b, __b, __b, __b, __b, __b, __b, __b,
4236	__b, __b, __b, __b, __b, __b, __b, __b,
4237	__b, __b, __b, __b, __b, __b, __b, __b);
4238	}
4239
4240	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4241	/// 64-bit integral vector elements set to the specified 64-bit integral
4242	/// value.
4243	///
4244	/// \headerfile <x86intrin.h>
4245	///
4246	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4247	///
4248	/// \param __q
4249	/// A 64-bit integral value used to initialize each vector element of the
4250	/// result.
4251	/// \returns An initialized 256-bit integer vector of [4 x i64].
4252	static __inline __m256i __DEFAULT_FN_ATTRS
4253	_mm256_set1_epi64x(long long __q)
4254	{
4255	return _mm256_set_epi64x(__q, __q, __q, __q);
4256	}
4257
/* Create zeroed vectors */
4259	/// Constructs a 256-bit floating-point vector of [4 x double] with all
4260	/// vector elements initialized to zero.
4261	///
4262	/// \headerfile <x86intrin.h>
4263	///
4264	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4265	///
4266	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4267	static __inline __m256d __DEFAULT_FN_ATTRS
4268	_mm256_setzero_pd(void)
4269	{
4270	return __extension__ (__m256d){ 0, 0, 0, 0 };
4271	}
4272
4273	/// Constructs a 256-bit floating-point vector of [8 x float] with all
4274	/// vector elements initialized to zero.
4275	///
4276	/// \headerfile <x86intrin.h>
4277	///
4278	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4279	///
4280	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4281	static __inline __m256 __DEFAULT_FN_ATTRS
4282	_mm256_setzero_ps(void)
4283	{
4284	return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4285	}
4286
4287	/// Constructs a 256-bit integer vector initialized to zero.
4288	///
4289	/// \headerfile <x86intrin.h>
4290	///
4291	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4292	///
4293	/// \returns A 256-bit integer vector initialized to zero.
4294	static __inline __m256i __DEFAULT_FN_ATTRS
4295	_mm256_setzero_si256(void)
4296	{
4297	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4298	}
4299
4300	/* Cast between vector types */
4301	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4302	/// floating-point vector of [8 x float].
4303	///
4304	/// \headerfile <x86intrin.h>
4305	///
4306	/// This intrinsic has no corresponding instruction.
4307	///
4308	/// \param __a
4309	/// A 256-bit floating-point vector of [4 x double].
4310	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4311	/// bitwise pattern as the parameter.
4312	static __inline __m256 __DEFAULT_FN_ATTRS
4313	_mm256_castpd_ps(__m256d __a)
4314	{
4315	return (__m256)__a;
4316	}
4317
4318	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4319	/// integer vector.
4320	///
4321	/// \headerfile <x86intrin.h>
4322	///
4323	/// This intrinsic has no corresponding instruction.
4324	///
4325	/// \param __a
4326	/// A 256-bit floating-point vector of [4 x double].
4327	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4328	/// parameter.
4329	static __inline __m256i __DEFAULT_FN_ATTRS
4330	_mm256_castpd_si256(__m256d __a)
4331	{
4332	return (__m256i)__a;
4333	}
4334
4335	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4336	/// floating-point vector of [4 x double].
4337	///
4338	/// \headerfile <x86intrin.h>
4339	///
4340	/// This intrinsic has no corresponding instruction.
4341	///
4342	/// \param __a
4343	/// A 256-bit floating-point vector of [8 x float].
4344	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4345	/// bitwise pattern as the parameter.
4346	static __inline __m256d __DEFAULT_FN_ATTRS
4347	_mm256_castps_pd(__m256 __a)
4348	{
4349	return (__m256d)__a;
4350	}
4351
4352	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4353	/// integer vector.
4354	///
4355	/// \headerfile <x86intrin.h>
4356	///
4357	/// This intrinsic has no corresponding instruction.
4358	///
4359	/// \param __a
4360	/// A 256-bit floating-point vector of [8 x float].
4361	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4362	/// parameter.
4363	static __inline __m256i __DEFAULT_FN_ATTRS
4364	_mm256_castps_si256(__m256 __a)
4365	{
4366	return (__m256i)__a;
4367	}
4368
4369	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4370	/// of [8 x float].
4371	///
4372	/// \headerfile <x86intrin.h>
4373	///
4374	/// This intrinsic has no corresponding instruction.
4375	///
4376	/// \param __a
4377	/// A 256-bit integer vector.
4378	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4379	/// bitwise pattern as the parameter.
4380	static __inline __m256 __DEFAULT_FN_ATTRS
4381	_mm256_castsi256_ps(__m256i __a)
4382	{
4383	return (__m256)__a;
4384	}
4385
4386	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4387	/// of [4 x double].
4388	///
4389	/// \headerfile <x86intrin.h>
4390	///
4391	/// This intrinsic has no corresponding instruction.
4392	///
4393	/// \param __a
4394	/// A 256-bit integer vector.
4395	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4396	/// bitwise pattern as the parameter.
4397	static __inline __m256d __DEFAULT_FN_ATTRS
4398	_mm256_castsi256_pd(__m256i __a)
4399	{
4400	return (__m256d)__a;
4401	}
4402
4403	/// Returns the lower 128 bits of a 256-bit floating-point vector of
4404	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4405	///
4406	/// \headerfile <x86intrin.h>
4407	///
4408	/// This intrinsic has no corresponding instruction.
4409	///
4410	/// \param __a
4411	/// A 256-bit floating-point vector of [4 x double].
4412	/// \returns A 128-bit floating-point vector of [2 x double] containing the
4413	/// lower 128 bits of the parameter.
4414	static __inline __m128d __DEFAULT_FN_ATTRS
4415	_mm256_castpd256_pd128(__m256d __a)
4416	{
4417	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4418	}
4419
4420	/// Returns the lower 128 bits of a 256-bit floating-point vector of
4421	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4422	///
4423	/// \headerfile <x86intrin.h>
4424	///
4425	/// This intrinsic has no corresponding instruction.
4426	///
4427	/// \param __a
4428	/// A 256-bit floating-point vector of [8 x float].
4429	/// \returns A 128-bit floating-point vector of [4 x float] containing the
4430	/// lower 128 bits of the parameter.
4431	static __inline __m128 __DEFAULT_FN_ATTRS
4432	_mm256_castps256_ps128(__m256 __a)
4433	{
4434	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4435	}
4436
4437	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4438	///
4439	/// \headerfile <x86intrin.h>
4440	///
4441	/// This intrinsic has no corresponding instruction.
4442	///
4443	/// \param __a
4444	/// A 256-bit integer vector.
4445	/// \returns A 128-bit integer vector containing the lower 128 bits of the
4446	/// parameter.
4447	static __inline __m128i __DEFAULT_FN_ATTRS
4448	_mm256_castsi256_si128(__m256i __a)
4449	{
4450	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4451	}
4452
4453	/// Constructs a 256-bit floating-point vector of [4 x double] from a
4454	/// 128-bit floating-point vector of [2 x double].
4455	///
4456	/// The lower 128 bits contain the value of the source vector. The contents
4457	/// of the upper 128 bits are undefined.
4458	///
4459	/// \headerfile <x86intrin.h>
4460	///
4461	/// This intrinsic has no corresponding instruction.
4462	///
4463	/// \param __a
4464	/// A 128-bit vector of [2 x double].
4465	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4466	/// contain the value of the parameter. The contents of the upper 128 bits
4467	/// are undefined.
4468	static __inline __m256d __DEFAULT_FN_ATTRS
4469	_mm256_castpd128_pd256(__m128d __a)
4470	{
4471	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4472	}
4473
4474	/// Constructs a 256-bit floating-point vector of [8 x float] from a
4475	/// 128-bit floating-point vector of [4 x float].
4476	///
4477	/// The lower 128 bits contain the value of the source vector. The contents
4478	/// of the upper 128 bits are undefined.
4479	///
4480	/// \headerfile <x86intrin.h>
4481	///
4482	/// This intrinsic has no corresponding instruction.
4483	///
4484	/// \param __a
4485	/// A 128-bit vector of [4 x float].
4486	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4487	/// contain the value of the parameter. The contents of the upper 128 bits
4488	/// are undefined.
4489	static __inline __m256 __DEFAULT_FN_ATTRS
4490	_mm256_castps128_ps256(__m128 __a)
4491	{
4492	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4493	}
4494
4495	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4496	///
4497	/// The lower 128 bits contain the value of the source vector. The contents
4498	/// of the upper 128 bits are undefined.
4499	///
4500	/// \headerfile <x86intrin.h>
4501	///
4502	/// This intrinsic has no corresponding instruction.
4503	///
4504	/// \param __a
4505	/// A 128-bit integer vector.
4506	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4507	/// the parameter. The contents of the upper 128 bits are undefined.
4508	static __inline __m256i __DEFAULT_FN_ATTRS
4509	_mm256_castsi128_si256(__m128i __a)
4510	{
4511	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4512	}
4513
4514	/// Constructs a 256-bit floating-point vector of [4 x double] from a
4515	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4516	/// contain the value of the source vector. The upper 128 bits are set
4517	/// to zero.
4518	///
4519	/// \headerfile <x86intrin.h>
4520	///
4521	/// This intrinsic has no corresponding instruction.
4522	///
4523	/// \param __a
4524	/// A 128-bit vector of [2 x double].
4525	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4526	/// contain the value of the parameter. The upper 128 bits are set to zero.
4527	static __inline __m256d __DEFAULT_FN_ATTRS
4528	_mm256_zextpd128_pd256(__m128d __a)
4529	{
4530	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4531	}
4532
4533	/// Constructs a 256-bit floating-point vector of [8 x float] from a
4534	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4535	/// the value of the source vector. The upper 128 bits are set to zero.
4536	///
4537	/// \headerfile <x86intrin.h>
4538	///
4539	/// This intrinsic has no corresponding instruction.
4540	///
4541	/// \param __a
4542	/// A 128-bit vector of [4 x float].
4543	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4544	/// contain the value of the parameter. The upper 128 bits are set to zero.
4545	static __inline __m256 __DEFAULT_FN_ATTRS
4546	_mm256_zextps128_ps256(__m128 __a)
4547	{
4548	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4549	}
4550
4551	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4552	/// The lower 128 bits contain the value of the source vector. The upper
4553	/// 128 bits are set to zero.
4554	///
4555	/// \headerfile <x86intrin.h>
4556	///
4557	/// This intrinsic has no corresponding instruction.
4558	///
4559	/// \param __a
4560	/// A 128-bit integer vector.
4561	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4562	/// the parameter. The upper 128 bits are set to zero.
4563	static __inline __m256i __DEFAULT_FN_ATTRS
4564	_mm256_zextsi128_si256(__m128i __a)
4565	{
4566	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4567	}
4568
4569	/*
4570	Vector insert.
4571	We use macros rather than inlines because we only want to accept
4572	invocations where the immediate M is a constant expression.
4573	*/
4574	/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4575	/// a 256-bit vector of [8 x float] given in the first parameter, and then
4576	/// replacing either the upper or the lower 128 bits with the contents of a
4577	/// 128-bit vector of [4 x float] in the second parameter.
4578	///
4579	/// The immediate integer parameter determines between the upper or the lower
4580	/// 128 bits.
4581	///
4582	/// \headerfile <x86intrin.h>
4583	///
4584	/// \code
4585	/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4586	/// \endcode
4587	///
4588	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4589	///
4590	/// \param V1
4591	/// A 256-bit vector of [8 x float]. This vector is copied to the result
4592	/// first, and then either the upper or the lower 128 bits of the result will
4593	/// be replaced by the contents of \a V2.
4594	/// \param V2
4595	/// A 128-bit vector of [4 x float]. The contents of this parameter are
4596	/// written to either the upper or the lower 128 bits of the result depending
4597	/// on the value of parameter \a M.
4598	/// \param M
4599	/// An immediate integer. The least significant bit determines how the values
4600	/// from the two parameters are interleaved: \n
4601	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4602	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4603	/// result. \n
4604	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4605	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4606	/// result.
4607	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m256) cast is applied. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4611
4612	/// Constructs a new 256-bit vector of [4 x double] by first duplicating
4613	/// a 256-bit vector of [4 x double] given in the first parameter, and then
4614	/// replacing either the upper or the lower 128 bits with the contents of a
4615	/// 128-bit vector of [2 x double] in the second parameter.
4616	///
4617	/// The immediate integer parameter determines between the upper or the lower
4618	/// 128 bits.
4619	///
4620	/// \headerfile <x86intrin.h>
4621	///
4622	/// \code
4623	/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4624	/// \endcode
4625	///
4626	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4627	///
4628	/// \param V1
4629	/// A 256-bit vector of [4 x double]. This vector is copied to the result
4630	/// first, and then either the upper or the lower 128 bits of the result will
4631	/// be replaced by the contents of \a V2.
4632	/// \param V2
4633	/// A 128-bit vector of [2 x double]. The contents of this parameter are
4634	/// written to either the upper or the lower 128 bits of the result depending
4635	/// on the value of parameter \a M.
4636	/// \param M
4637	/// An immediate integer. The least significant bit determines how the values
4638	/// from the two parameters are interleaved: \n
4639	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4640	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4641	/// result. \n
4642	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4643	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4644	/// result.
4645	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m256d) cast is applied. */
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4649
4650	/// Constructs a new 256-bit integer vector by first duplicating a
4651	/// 256-bit integer vector given in the first parameter, and then replacing
4652	/// either the upper or the lower 128 bits with the contents of a 128-bit
4653	/// integer vector in the second parameter.
4654	///
4655	/// The immediate integer parameter determines between the upper or the lower
4656	/// 128 bits.
4657	///
4658	/// \headerfile <x86intrin.h>
4659	///
4660	/// \code
4661	/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
4662	/// \endcode
4663	///
4664	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4665	///
4666	/// \param V1
4667	/// A 256-bit integer vector. This vector is copied to the result first, and
4668	/// then either the upper or the lower 128 bits of the result will be
4669	/// replaced by the contents of \a V2.
4670	/// \param V2
4671	/// A 128-bit integer vector. The contents of this parameter are written to
4672	/// either the upper or the lower 128 bits of the result depending on the
4673	/// value of parameter \a M.
4674	/// \param M
4675	/// An immediate integer. The least significant bit determines how the values
4676	/// from the two parameters are interleaved: \n
4677	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4678	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4679	/// result. \n
4680	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4681	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4682	/// result.
4683	/// \returns A 256-bit integer vector containing the interleaved values.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m256i) cast is applied. */
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4687
4688	/*
4689	Vector extract.
4690	We use macros rather than inlines because we only want to accept
4691	invocations where the immediate M is a constant expression.
4692	*/
4693	/// Extracts either the upper or the lower 128 bits from a 256-bit vector
4694	/// of [8 x float], as determined by the immediate integer parameter, and
4695	/// returns the extracted bits as a 128-bit vector of [4 x float].
4696	///
4697	/// \headerfile <x86intrin.h>
4698	///
4699	/// \code
4700	/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
4701	/// \endcode
4702	///
4703	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4704	///
4705	/// \param V
4706	/// A 256-bit vector of [8 x float].
4707	/// \param M
4708	/// An immediate integer. The least significant bit determines which bits are
4709	/// extracted from the first parameter: \n
4710	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4711	/// result. \n
4712	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4713	/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m128) cast is applied. */
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4716
4717	/// Extracts either the upper or the lower 128 bits from a 256-bit vector
4718	/// of [4 x double], as determined by the immediate integer parameter, and
4719	/// returns the extracted bits as a 128-bit vector of [2 x double].
4720	///
4721	/// \headerfile <x86intrin.h>
4722	///
4723	/// \code
4724	/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
4725	/// \endcode
4726	///
4727	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4728	///
4729	/// \param V
4730	/// A 256-bit vector of [4 x double].
4731	/// \param M
4732	/// An immediate integer. The least significant bit determines which bits are
4733	/// extracted from the first parameter: \n
4734	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4735	/// result. \n
4736	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4737	/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m128d) cast is applied. */
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4740
4741	/// Extracts either the upper or the lower 128 bits from a 256-bit
4742	/// integer vector, as determined by the immediate integer parameter, and
4743	/// returns the extracted bits as a 128-bit integer vector.
4744	///
4745	/// \headerfile <x86intrin.h>
4746	///
4747	/// \code
4748	/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
4749	/// \endcode
4750	///
4751	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4752	///
4753	/// \param V
4754	/// A 256-bit integer vector.
4755	/// \param M
4756	/// An immediate integer. The least significant bit determines which bits are
4757	/// extracted from the first parameter: \n
4758	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4759	/// result. \n
4760	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4761	/// \returns A 128-bit integer vector containing the extracted bits.
/* The expansion is wrapped in an outer pair of parentheses so that an
   invocation behaves as a single primary expression. Without them, a postfix
   operator applied to the invocation (for example a subscript) would bind to
   the builtin call before the (__m128i) cast is applied. */
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4764
4765	/* SIMD load ops (unaligned) */
4766	/// Loads two 128-bit floating-point vectors of [4 x float] from
4767	/// unaligned memory locations and constructs a 256-bit floating-point vector
4768	/// of [8 x float] by concatenating the two 128-bit vectors.
4769	///
4770	/// \headerfile <x86intrin.h>
4771	///
4772	/// This intrinsic corresponds to load instructions followed by the
4773	/// <c> VINSERTF128 </c> instruction.
4774	///
4775	/// \param __addr_hi
4776	/// A pointer to a 128-bit memory location containing 4 consecutive
4777	/// single-precision floating-point values. These values are to be copied to
4778	/// bits[255:128] of the result. The address of the memory location does not
4779	/// have to be aligned.
4780	/// \param __addr_lo
4781	/// A pointer to a 128-bit memory location containing 4 consecutive
4782	/// single-precision floating-point values. These values are to be copied to
4783	/// bits[127:0] of the result. The address of the memory location does not
4784	/// have to be aligned.
4785	/// \returns A 256-bit floating-point vector of [8 x float] containing the
4786	/// concatenated result.
4787	static __inline __m256 __DEFAULT_FN_ATTRS
4788	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
4789	{
4790	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4791	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4792	}
4793
4794	/// Loads two 128-bit floating-point vectors of [2 x double] from
4795	/// unaligned memory locations and constructs a 256-bit floating-point vector
4796	/// of [4 x double] by concatenating the two 128-bit vectors.
4797	///
4798	/// \headerfile <x86intrin.h>
4799	///
4800	/// This intrinsic corresponds to load instructions followed by the
4801	/// <c> VINSERTF128 </c> instruction.
4802	///
4803	/// \param __addr_hi
4804	/// A pointer to a 128-bit memory location containing two consecutive
4805	/// double-precision floating-point values. These values are to be copied to
4806	/// bits[255:128] of the result. The address of the memory location does not
4807	/// have to be aligned.
4808	/// \param __addr_lo
4809	/// A pointer to a 128-bit memory location containing two consecutive
4810	/// double-precision floating-point values. These values are to be copied to
4811	/// bits[127:0] of the result. The address of the memory location does not
4812	/// have to be aligned.
4813	/// \returns A 256-bit floating-point vector of [4 x double] containing the
4814	/// concatenated result.
4815	static __inline __m256d __DEFAULT_FN_ATTRS
4816	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
4817	{
4818	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4819	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4820	}
4821
4822	/// Loads two 128-bit integer vectors from unaligned memory locations and
4823	/// constructs a 256-bit integer vector by concatenating the two 128-bit
4824	/// vectors.
4825	///
4826	/// \headerfile <x86intrin.h>
4827	///
4828	/// This intrinsic corresponds to load instructions followed by the
4829	/// <c> VINSERTF128 </c> instruction.
4830	///
4831	/// \param __addr_hi
4832	/// A pointer to a 128-bit memory location containing a 128-bit integer
4833	/// vector. This vector is to be copied to bits[255:128] of the result. The
4834	/// address of the memory location does not have to be aligned.
4835	/// \param __addr_lo
4836	/// A pointer to a 128-bit memory location containing a 128-bit integer
4837	/// vector. This vector is to be copied to bits[127:0] of the result. The
4838	/// address of the memory location does not have to be aligned.
4839	/// \returns A 256-bit integer vector containing the concatenated result.
4840	static __inline __m256i __DEFAULT_FN_ATTRS
4841	_mm256_loadu2_m128i(__m128i_u const __addr_hi, __m128i_u const __addr_lo)
4842	{
4843	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4844	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4845	}
4846
4847	/* SIMD store ops (unaligned) */
4848	/// Stores the upper and lower 128 bits of a 256-bit floating-point
4849	/// vector of [8 x float] into two different unaligned memory locations.
4850	///
4851	/// \headerfile <x86intrin.h>
4852	///
4853	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4854	/// store instructions.
4855	///
4856	/// \param __addr_hi
4857	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4858	/// copied to this memory location. The address of this memory location does
4859	/// not have to be aligned.
4860	/// \param __addr_lo
4861	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4862	/// copied to this memory location. The address of this memory location does
4863	/// not have to be aligned.
4864	/// \param __a
4865	/// A 256-bit floating-point vector of [8 x float].
4866	static __inline void __DEFAULT_FN_ATTRS
4867	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
4868	{
4869	__m128 __v128;
4870
4871	__v128 = _mm256_castps256_ps128(__a);
4872	_mm_storeu_ps(__addr_lo, __v128);
4873	__v128 = _mm256_extractf128_ps(__a, 1);
4874	_mm_storeu_ps(__addr_hi, __v128);
4875	}
4876
4877	/// Stores the upper and lower 128 bits of a 256-bit floating-point
4878	/// vector of [4 x double] into two different unaligned memory locations.
4879	///
4880	/// \headerfile <x86intrin.h>
4881	///
4882	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4883	/// store instructions.
4884	///
4885	/// \param __addr_hi
4886	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4887	/// copied to this memory location. The address of this memory location does
4888	/// not have to be aligned.
4889	/// \param __addr_lo
4890	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4891	/// copied to this memory location. The address of this memory location does
4892	/// not have to be aligned.
4893	/// \param __a
4894	/// A 256-bit floating-point vector of [4 x double].
4895	static __inline void __DEFAULT_FN_ATTRS
4896	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
4897	{
4898	__m128d __v128;
4899
4900	__v128 = _mm256_castpd256_pd128(__a);
4901	_mm_storeu_pd(__addr_lo, __v128);
4902	__v128 = _mm256_extractf128_pd(__a, 1);
4903	_mm_storeu_pd(__addr_hi, __v128);
4904	}
4905
4906	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
4907	/// two different unaligned memory locations.
4908	///
4909	/// \headerfile <x86intrin.h>
4910	///
4911	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4912	/// store instructions.
4913	///
4914	/// \param __addr_hi
4915	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4916	/// copied to this memory location. The address of this memory location does
4917	/// not have to be aligned.
4918	/// \param __addr_lo
4919	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4920	/// copied to this memory location. The address of this memory location does
4921	/// not have to be aligned.
4922	/// \param __a
4923	/// A 256-bit integer vector.
4924	static __inline void __DEFAULT_FN_ATTRS
4925	_mm256_storeu2_m128i(__m128i_u __addr_hi, __m128i_u __addr_lo, __m256i __a)
4926	{
4927	__m128i __v128;
4928
4929	__v128 = _mm256_castsi256_si128(__a);
4930	_mm_storeu_si128(__addr_lo, __v128);
4931	__v128 = _mm256_extractf128_si256(__a, 1);
4932	_mm_storeu_si128(__addr_hi, __v128);
4933	}
4934
4935	/// Constructs a 256-bit floating-point vector of [8 x float] by
4936	/// concatenating two 128-bit floating-point vectors of [4 x float].
4937	///
4938	/// \headerfile <x86intrin.h>
4939	///
4940	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4941	///
4942	/// \param __hi
4943	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4944	/// 128 bits of the result.
4945	/// \param __lo
4946	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4947	/// 128 bits of the result.
4948	/// \returns A 256-bit floating-point vector of [8 x float] containing the
4949	/// concatenated result.
4950	static __inline __m256 __DEFAULT_FN_ATTRS
4951	_mm256_set_m128 (__m128 __hi, __m128 __lo)
4952	{
4953	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4954	}
4955
4956	/// Constructs a 256-bit floating-point vector of [4 x double] by
4957	/// concatenating two 128-bit floating-point vectors of [2 x double].
4958	///
4959	/// \headerfile <x86intrin.h>
4960	///
4961	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4962	///
4963	/// \param __hi
4964	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4965	/// 128 bits of the result.
4966	/// \param __lo
4967	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4968	/// 128 bits of the result.
4969	/// \returns A 256-bit floating-point vector of [4 x double] containing the
4970	/// concatenated result.
4971	static __inline __m256d __DEFAULT_FN_ATTRS
4972	_mm256_set_m128d (__m128d __hi, __m128d __lo)
4973	{
4974	return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4975	}
4976
4977	/// Constructs a 256-bit integer vector by concatenating two 128-bit
4978	/// integer vectors.
4979	///
4980	/// \headerfile <x86intrin.h>
4981	///
4982	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4983	///
4984	/// \param __hi
4985	/// A 128-bit integer vector to be copied to the upper 128 bits of the
4986	/// result.
4987	/// \param __lo
4988	/// A 128-bit integer vector to be copied to the lower 128 bits of the
4989	/// result.
4990	/// \returns A 256-bit integer vector containing the concatenated result.
4991	static __inline __m256i __DEFAULT_FN_ATTRS
4992	_mm256_set_m128i (__m128i __hi, __m128i __lo)
4993	{
4994	return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4995	}
4996
4997	/// Constructs a 256-bit floating-point vector of [8 x float] by
4998	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4999	/// similar to _mm256_set_m128, but the order of the input parameters is
5000	/// swapped.
5001	///
5002	/// \headerfile <x86intrin.h>
5003	///
5004	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5005	///
5006	/// \param __lo
5007	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5008	/// 128 bits of the result.
5009	/// \param __hi
5010	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5011	/// 128 bits of the result.
5012	/// \returns A 256-bit floating-point vector of [8 x float] containing the
5013	/// concatenated result.
5014	static __inline __m256 __DEFAULT_FN_ATTRS
5015	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5016	{
5017	return _mm256_set_m128(__hi, __lo);
5018	}
5019
5020	/// Constructs a 256-bit floating-point vector of [4 x double] by
5021	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5022	/// similar to _mm256_set_m128d, but the order of the input parameters is
5023	/// swapped.
5024	///
5025	/// \headerfile <x86intrin.h>
5026	///
5027	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5028	///
5029	/// \param __lo
5030	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5031	/// 128 bits of the result.
5032	/// \param __hi
5033	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5034	/// 128 bits of the result.
5035	/// \returns A 256-bit floating-point vector of [4 x double] containing the
5036	/// concatenated result.
5037	static __inline __m256d __DEFAULT_FN_ATTRS
5038	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5039	{
5040	return (__m256d)_mm256_set_m128d(__hi, __lo);
5041	}
5042
5043	/// Constructs a 256-bit integer vector by concatenating two 128-bit
5044	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
5045	/// the input parameters is swapped.
5046	///
5047	/// \headerfile <x86intrin.h>
5048	///
5049	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5050	///
5051	/// \param __lo
5052	/// A 128-bit integer vector to be copied to the lower 128 bits of the
5053	/// result.
5054	/// \param __hi
5055	/// A 128-bit integer vector to be copied to the upper 128 bits of the
5056	/// result.
5057	/// \returns A 256-bit integer vector containing the concatenated result.
5058	static __inline __m256i __DEFAULT_FN_ATTRS
5059	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5060	{
5061	return (__m256i)_mm256_set_m128i(__hi, __lo);
5062	}
5063
5064	#undef __DEFAULT_FN_ATTRS
5065	#undef __DEFAULT_FN_ATTRS128
5066
5067	#endif /* __AVXINTRIN_H */
5068

Clang Project