emmintrin.h source code [clang_source_code/lib/Headers/emmintrin.h]

1	/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2	*
3	* Permission is hereby granted, free of charge, to any person obtaining a copy
4	* of this software and associated documentation files (the "Software"), to deal
5	* in the Software without restriction, including without limitation the rights
6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	* copies of the Software, and to permit persons to whom the Software is
8	* furnished to do so, subject to the following conditions:
9	*
10	* The above copyright notice and this permission notice shall be included in
11	* all copies or substantial portions of the Software.
12	*
13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	* THE SOFTWARE.
20	*
21	*===-----------------------------------------------------------------------===
22	*/
23
24	#ifndef __EMMINTRIN_H
25	#define __EMMINTRIN_H
26
27	#include <xmmintrin.h>
28
/* Public 128-bit vector types. Both are declared with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
    __attribute__((__vector_size__(16), __aligned__(16)));

/* Variants of the public types declared with 1-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Internal 16-byte vector views, one per element type. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned counterparts of the integer views. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* Plain char has no fixed signedness, so an explicitly signed byte view is
 * needed as well. It is an implementation detail and should not appear in
 * the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Default attribute sets applied to the functions defined in this file:
 * one for SSE2-only functions and one for functions that also need MMX. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),          \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),      \
                 __min_vector_width__(64)))
53
54	/// Adds lower double-precision values in both operands and returns the
55	/// sum in the lower 64 bits of the result. The upper 64 bits of the result
56	/// are copied from the upper double-precision value of the first operand.
57	///
58	/// \headerfile <x86intrin.h>
59	///
60	/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
61	///
62	/// \param __a
63	/// A 128-bit vector of [2 x double] containing one of the source operands.
64	/// \param __b
65	/// A 128-bit vector of [2 x double] containing one of the source operands.
66	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
67	/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
68	/// from the upper 64 bits of the first source operand.
69	static __inline__ __m128d __DEFAULT_FN_ATTRS
70	_mm_add_sd(__m128d __a, __m128d __b)
71	{
72	__a[0] += __b[0];
73	return __a;
74	}
75
76	/// Adds two 128-bit vectors of [2 x double].
77	///
78	/// \headerfile <x86intrin.h>
79	///
80	/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
81	///
82	/// \param __a
83	/// A 128-bit vector of [2 x double] containing one of the source operands.
84	/// \param __b
85	/// A 128-bit vector of [2 x double] containing one of the source operands.
86	/// \returns A 128-bit vector of [2 x double] containing the sums of both
87	/// operands.
88	static __inline__ __m128d __DEFAULT_FN_ATTRS
89	_mm_add_pd(__m128d __a, __m128d __b)
90	{
91	return (__m128d)((__v2df)__a + (__v2df)__b);
92	}
93
94	/// Subtracts the lower double-precision value of the second operand
95	/// from the lower double-precision value of the first operand and returns
96	/// the difference in the lower 64 bits of the result. The upper 64 bits of
97	/// the result are copied from the upper double-precision value of the first
98	/// operand.
99	///
100	/// \headerfile <x86intrin.h>
101	///
102	/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
103	///
104	/// \param __a
105	/// A 128-bit vector of [2 x double] containing the minuend.
106	/// \param __b
107	/// A 128-bit vector of [2 x double] containing the subtrahend.
108	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
109	/// difference of the lower 64 bits of both operands. The upper 64 bits are
110	/// copied from the upper 64 bits of the first source operand.
111	static __inline__ __m128d __DEFAULT_FN_ATTRS
112	_mm_sub_sd(__m128d __a, __m128d __b)
113	{
114	__a[0] -= __b[0];
115	return __a;
116	}
117
118	/// Subtracts two 128-bit vectors of [2 x double].
119	///
120	/// \headerfile <x86intrin.h>
121	///
122	/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
123	///
124	/// \param __a
125	/// A 128-bit vector of [2 x double] containing the minuend.
126	/// \param __b
127	/// A 128-bit vector of [2 x double] containing the subtrahend.
128	/// \returns A 128-bit vector of [2 x double] containing the differences between
129	/// both operands.
130	static __inline__ __m128d __DEFAULT_FN_ATTRS
131	_mm_sub_pd(__m128d __a, __m128d __b)
132	{
133	return (__m128d)((__v2df)__a - (__v2df)__b);
134	}
135
136	/// Multiplies lower double-precision values in both operands and returns
137	/// the product in the lower 64 bits of the result. The upper 64 bits of the
138	/// result are copied from the upper double-precision value of the first
139	/// operand.
140	///
141	/// \headerfile <x86intrin.h>
142	///
143	/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
144	///
145	/// \param __a
146	/// A 128-bit vector of [2 x double] containing one of the source operands.
147	/// \param __b
148	/// A 128-bit vector of [2 x double] containing one of the source operands.
149	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
150	/// product of the lower 64 bits of both operands. The upper 64 bits are
151	/// copied from the upper 64 bits of the first source operand.
152	static __inline__ __m128d __DEFAULT_FN_ATTRS
153	_mm_mul_sd(__m128d __a, __m128d __b)
154	{
155	__a[0] *= __b[0];
156	return __a;
157	}
158
159	/// Multiplies two 128-bit vectors of [2 x double].
160	///
161	/// \headerfile <x86intrin.h>
162	///
163	/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164	///
165	/// \param __a
166	/// A 128-bit vector of [2 x double] containing one of the operands.
167	/// \param __b
168	/// A 128-bit vector of [2 x double] containing one of the operands.
169	/// \returns A 128-bit vector of [2 x double] containing the products of both
170	/// operands.
171	static __inline__ __m128d __DEFAULT_FN_ATTRS
172	_mm_mul_pd(__m128d __a, __m128d __b)
173	{
174	return (__m128d)((__v2df)__a * (__v2df)__b);
175	}
176
177	/// Divides the lower double-precision value of the first operand by the
178	/// lower double-precision value of the second operand and returns the
179	/// quotient in the lower 64 bits of the result. The upper 64 bits of the
180	/// result are copied from the upper double-precision value of the first
181	/// operand.
182	///
183	/// \headerfile <x86intrin.h>
184	///
185	/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
186	///
187	/// \param __a
188	/// A 128-bit vector of [2 x double] containing the dividend.
189	/// \param __b
190	/// A 128-bit vector of [2 x double] containing divisor.
191	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
192	/// quotient of the lower 64 bits of both operands. The upper 64 bits are
193	/// copied from the upper 64 bits of the first source operand.
194	static __inline__ __m128d __DEFAULT_FN_ATTRS
195	_mm_div_sd(__m128d __a, __m128d __b)
196	{
197	__a[0] /= __b[0];
198	return __a;
199	}
200
201	/// Performs an element-by-element division of two 128-bit vectors of
202	/// [2 x double].
203	///
204	/// \headerfile <x86intrin.h>
205	///
206	/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
207	///
208	/// \param __a
209	/// A 128-bit vector of [2 x double] containing the dividend.
210	/// \param __b
211	/// A 128-bit vector of [2 x double] containing the divisor.
212	/// \returns A 128-bit vector of [2 x double] containing the quotients of both
213	/// operands.
214	static __inline__ __m128d __DEFAULT_FN_ATTRS
215	_mm_div_pd(__m128d __a, __m128d __b)
216	{
217	return (__m128d)((__v2df)__a / (__v2df)__b);
218	}
219
220	/// Calculates the square root of the lower double-precision value of
221	/// the second operand and returns it in the lower 64 bits of the result.
222	/// The upper 64 bits of the result are copied from the upper
223	/// double-precision value of the first operand.
224	///
225	/// \headerfile <x86intrin.h>
226	///
227	/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
228	///
229	/// \param __a
230	/// A 128-bit vector of [2 x double] containing one of the operands. The
231	/// upper 64 bits of this operand are copied to the upper 64 bits of the
232	/// result.
233	/// \param __b
234	/// A 128-bit vector of [2 x double] containing one of the operands. The
235	/// square root is calculated using the lower 64 bits of this operand.
236	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
237	/// square root of the lower 64 bits of operand \a __b, and whose upper 64
238	/// bits are copied from the upper 64 bits of operand \a __a.
239	static __inline__ __m128d __DEFAULT_FN_ATTRS
240	_mm_sqrt_sd(__m128d __a, __m128d __b)
241	{
242	__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
243	return __extension__ (__m128d) { __c[0], __a[1] };
244	}
245
246	/// Calculates the square root of the each of two values stored in a
247	/// 128-bit vector of [2 x double].
248	///
249	/// \headerfile <x86intrin.h>
250	///
251	/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
252	///
253	/// \param __a
254	/// A 128-bit vector of [2 x double].
255	/// \returns A 128-bit vector of [2 x double] containing the square roots of the
256	/// values in the operand.
257	static __inline__ __m128d __DEFAULT_FN_ATTRS
258	_mm_sqrt_pd(__m128d __a)
259	{
260	return __builtin_ia32_sqrtpd((__v2df)__a);
261	}
262
263	/// Compares lower 64-bit double-precision values of both operands, and
264	/// returns the lesser of the pair of values in the lower 64-bits of the
265	/// result. The upper 64 bits of the result are copied from the upper
266	/// double-precision value of the first operand.
267	///
268	/// \headerfile <x86intrin.h>
269	///
270	/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
271	///
272	/// \param __a
273	/// A 128-bit vector of [2 x double] containing one of the operands. The
274	/// lower 64 bits of this operand are used in the comparison.
275	/// \param __b
276	/// A 128-bit vector of [2 x double] containing one of the operands. The
277	/// lower 64 bits of this operand are used in the comparison.
278	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
279	/// minimum value between both operands. The upper 64 bits are copied from
280	/// the upper 64 bits of the first source operand.
281	static __inline__ __m128d __DEFAULT_FN_ATTRS
282	_mm_min_sd(__m128d __a, __m128d __b)
283	{
284	return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
285	}
286
287	/// Performs element-by-element comparison of the two 128-bit vectors of
288	/// [2 x double] and returns the vector containing the lesser of each pair of
289	/// values.
290	///
291	/// \headerfile <x86intrin.h>
292	///
293	/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
294	///
295	/// \param __a
296	/// A 128-bit vector of [2 x double] containing one of the operands.
297	/// \param __b
298	/// A 128-bit vector of [2 x double] containing one of the operands.
299	/// \returns A 128-bit vector of [2 x double] containing the minimum values
300	/// between both operands.
301	static __inline__ __m128d __DEFAULT_FN_ATTRS
302	_mm_min_pd(__m128d __a, __m128d __b)
303	{
304	return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
305	}
306
307	/// Compares lower 64-bit double-precision values of both operands, and
308	/// returns the greater of the pair of values in the lower 64-bits of the
309	/// result. The upper 64 bits of the result are copied from the upper
310	/// double-precision value of the first operand.
311	///
312	/// \headerfile <x86intrin.h>
313	///
314	/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
315	///
316	/// \param __a
317	/// A 128-bit vector of [2 x double] containing one of the operands. The
318	/// lower 64 bits of this operand are used in the comparison.
319	/// \param __b
320	/// A 128-bit vector of [2 x double] containing one of the operands. The
321	/// lower 64 bits of this operand are used in the comparison.
322	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
323	/// maximum value between both operands. The upper 64 bits are copied from
324	/// the upper 64 bits of the first source operand.
325	static __inline__ __m128d __DEFAULT_FN_ATTRS
326	_mm_max_sd(__m128d __a, __m128d __b)
327	{
328	return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
329	}
330
331	/// Performs element-by-element comparison of the two 128-bit vectors of
332	/// [2 x double] and returns the vector containing the greater of each pair
333	/// of values.
334	///
335	/// \headerfile <x86intrin.h>
336	///
337	/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
338	///
339	/// \param __a
340	/// A 128-bit vector of [2 x double] containing one of the operands.
341	/// \param __b
342	/// A 128-bit vector of [2 x double] containing one of the operands.
343	/// \returns A 128-bit vector of [2 x double] containing the maximum values
344	/// between both operands.
345	static __inline__ __m128d __DEFAULT_FN_ATTRS
346	_mm_max_pd(__m128d __a, __m128d __b)
347	{
348	return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
349	}
350
351	/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
352	///
353	/// \headerfile <x86intrin.h>
354	///
355	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
356	///
357	/// \param __a
358	/// A 128-bit vector of [2 x double] containing one of the source operands.
359	/// \param __b
360	/// A 128-bit vector of [2 x double] containing one of the source operands.
361	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
362	/// values between both operands.
363	static __inline__ __m128d __DEFAULT_FN_ATTRS
364	_mm_and_pd(__m128d __a, __m128d __b)
365	{
366	return (__m128d)((__v2du)__a & (__v2du)__b);
367	}
368
369	/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
370	/// the one's complement of the values contained in the first source operand.
371	///
372	/// \headerfile <x86intrin.h>
373	///
374	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
375	///
376	/// \param __a
377	/// A 128-bit vector of [2 x double] containing the left source operand. The
378	/// one's complement of this value is used in the bitwise AND.
379	/// \param __b
380	/// A 128-bit vector of [2 x double] containing the right source operand.
381	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
382	/// values in the second operand and the one's complement of the first
383	/// operand.
384	static __inline__ __m128d __DEFAULT_FN_ATTRS
385	_mm_andnot_pd(__m128d __a, __m128d __b)
386	{
387	return (__m128d)(~(__v2du)__a & (__v2du)__b);
388	}
389
390	/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
391	///
392	/// \headerfile <x86intrin.h>
393	///
394	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
395	///
396	/// \param __a
397	/// A 128-bit vector of [2 x double] containing one of the source operands.
398	/// \param __b
399	/// A 128-bit vector of [2 x double] containing one of the source operands.
400	/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
401	/// values between both operands.
402	static __inline__ __m128d __DEFAULT_FN_ATTRS
403	_mm_or_pd(__m128d __a, __m128d __b)
404	{
405	return (__m128d)((__v2du)__a \| (__v2du)__b);
406	}
407
408	/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
409	///
410	/// \headerfile <x86intrin.h>
411	///
412	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
413	///
414	/// \param __a
415	/// A 128-bit vector of [2 x double] containing one of the source operands.
416	/// \param __b
417	/// A 128-bit vector of [2 x double] containing one of the source operands.
418	/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
419	/// values between both operands.
420	static __inline__ __m128d __DEFAULT_FN_ATTRS
421	_mm_xor_pd(__m128d __a, __m128d __b)
422	{
423	return (__m128d)((__v2du)__a ^ (__v2du)__b);
424	}
425
426	/// Compares each of the corresponding double-precision values of the
427	/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
428	/// for false, 0xFFFFFFFFFFFFFFFF for true.
429	///
430	/// \headerfile <x86intrin.h>
431	///
432	/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
433	///
434	/// \param __a
435	/// A 128-bit vector of [2 x double].
436	/// \param __b
437	/// A 128-bit vector of [2 x double].
438	/// \returns A 128-bit vector containing the comparison results.
439	static __inline__ __m128d __DEFAULT_FN_ATTRS
440	_mm_cmpeq_pd(__m128d __a, __m128d __b)
441	{
442	return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
443	}
444
445	/// Compares each of the corresponding double-precision values of the
446	/// 128-bit vectors of [2 x double] to determine if the values in the first
447	/// operand are less than those in the second operand. Each comparison
448	/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
449	///
450	/// \headerfile <x86intrin.h>
451	///
452	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
453	///
454	/// \param __a
455	/// A 128-bit vector of [2 x double].
456	/// \param __b
457	/// A 128-bit vector of [2 x double].
458	/// \returns A 128-bit vector containing the comparison results.
459	static __inline__ __m128d __DEFAULT_FN_ATTRS
460	_mm_cmplt_pd(__m128d __a, __m128d __b)
461	{
462	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
463	}
464
465	/// Compares each of the corresponding double-precision values of the
466	/// 128-bit vectors of [2 x double] to determine if the values in the first
467	/// operand are less than or equal to those in the second operand.
468	///
469	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
470	///
471	/// \headerfile <x86intrin.h>
472	///
473	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
474	///
475	/// \param __a
476	/// A 128-bit vector of [2 x double].
477	/// \param __b
478	/// A 128-bit vector of [2 x double].
479	/// \returns A 128-bit vector containing the comparison results.
480	static __inline__ __m128d __DEFAULT_FN_ATTRS
481	_mm_cmple_pd(__m128d __a, __m128d __b)
482	{
483	return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
484	}
485
486	/// Compares each of the corresponding double-precision values of the
487	/// 128-bit vectors of [2 x double] to determine if the values in the first
488	/// operand are greater than those in the second operand.
489	///
490	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
491	///
492	/// \headerfile <x86intrin.h>
493	///
494	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
495	///
496	/// \param __a
497	/// A 128-bit vector of [2 x double].
498	/// \param __b
499	/// A 128-bit vector of [2 x double].
500	/// \returns A 128-bit vector containing the comparison results.
501	static __inline__ __m128d __DEFAULT_FN_ATTRS
502	_mm_cmpgt_pd(__m128d __a, __m128d __b)
503	{
504	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
505	}
506
507	/// Compares each of the corresponding double-precision values of the
508	/// 128-bit vectors of [2 x double] to determine if the values in the first
509	/// operand are greater than or equal to those in the second operand.
510	///
511	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
512	///
513	/// \headerfile <x86intrin.h>
514	///
515	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
516	///
517	/// \param __a
518	/// A 128-bit vector of [2 x double].
519	/// \param __b
520	/// A 128-bit vector of [2 x double].
521	/// \returns A 128-bit vector containing the comparison results.
522	static __inline__ __m128d __DEFAULT_FN_ATTRS
523	_mm_cmpge_pd(__m128d __a, __m128d __b)
524	{
525	return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
526	}
527
528	/// Compares each of the corresponding double-precision values of the
529	/// 128-bit vectors of [2 x double] to determine if the values in the first
530	/// operand are ordered with respect to those in the second operand.
531	///
532	/// A pair of double-precision values are "ordered" with respect to each
533	/// other if neither value is a NaN. Each comparison yields 0x0 for false,
534	/// 0xFFFFFFFFFFFFFFFF for true.
535	///
536	/// \headerfile <x86intrin.h>
537	///
538	/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
539	///
540	/// \param __a
541	/// A 128-bit vector of [2 x double].
542	/// \param __b
543	/// A 128-bit vector of [2 x double].
544	/// \returns A 128-bit vector containing the comparison results.
545	static __inline__ __m128d __DEFAULT_FN_ATTRS
546	_mm_cmpord_pd(__m128d __a, __m128d __b)
547	{
548	return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
549	}
550
551	/// Compares each of the corresponding double-precision values of the
552	/// 128-bit vectors of [2 x double] to determine if the values in the first
553	/// operand are unordered with respect to those in the second operand.
554	///
555	/// A pair of double-precision values are "unordered" with respect to each
556	/// other if one or both values are NaN. Each comparison yields 0x0 for
557	/// false, 0xFFFFFFFFFFFFFFFF for true.
558	///
559	/// \headerfile <x86intrin.h>
560	///
561	/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
562	/// instruction.
563	///
564	/// \param __a
565	/// A 128-bit vector of [2 x double].
566	/// \param __b
567	/// A 128-bit vector of [2 x double].
568	/// \returns A 128-bit vector containing the comparison results.
569	static __inline__ __m128d __DEFAULT_FN_ATTRS
570	_mm_cmpunord_pd(__m128d __a, __m128d __b)
571	{
572	return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
573	}
574
575	/// Compares each of the corresponding double-precision values of the
576	/// 128-bit vectors of [2 x double] to determine if the values in the first
577	/// operand are unequal to those in the second operand.
578	///
579	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
580	///
581	/// \headerfile <x86intrin.h>
582	///
583	/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
584	///
585	/// \param __a
586	/// A 128-bit vector of [2 x double].
587	/// \param __b
588	/// A 128-bit vector of [2 x double].
589	/// \returns A 128-bit vector containing the comparison results.
590	static __inline__ __m128d __DEFAULT_FN_ATTRS
591	_mm_cmpneq_pd(__m128d __a, __m128d __b)
592	{
593	return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
594	}
595
596	/// Compares each of the corresponding double-precision values of the
597	/// 128-bit vectors of [2 x double] to determine if the values in the first
598	/// operand are not less than those in the second operand.
599	///
600	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
601	///
602	/// \headerfile <x86intrin.h>
603	///
604	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
605	///
606	/// \param __a
607	/// A 128-bit vector of [2 x double].
608	/// \param __b
609	/// A 128-bit vector of [2 x double].
610	/// \returns A 128-bit vector containing the comparison results.
611	static __inline__ __m128d __DEFAULT_FN_ATTRS
612	_mm_cmpnlt_pd(__m128d __a, __m128d __b)
613	{
614	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
615	}
616
617	/// Compares each of the corresponding double-precision values of the
618	/// 128-bit vectors of [2 x double] to determine if the values in the first
619	/// operand are not less than or equal to those in the second operand.
620	///
621	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
622	///
623	/// \headerfile <x86intrin.h>
624	///
625	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
626	///
627	/// \param __a
628	/// A 128-bit vector of [2 x double].
629	/// \param __b
630	/// A 128-bit vector of [2 x double].
631	/// \returns A 128-bit vector containing the comparison results.
632	static __inline__ __m128d __DEFAULT_FN_ATTRS
633	_mm_cmpnle_pd(__m128d __a, __m128d __b)
634	{
635	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
636	}
637
638	/// Compares each of the corresponding double-precision values of the
639	/// 128-bit vectors of [2 x double] to determine if the values in the first
640	/// operand are not greater than those in the second operand.
641	///
642	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
643	///
644	/// \headerfile <x86intrin.h>
645	///
646	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
647	///
648	/// \param __a
649	/// A 128-bit vector of [2 x double].
650	/// \param __b
651	/// A 128-bit vector of [2 x double].
652	/// \returns A 128-bit vector containing the comparison results.
653	static __inline__ __m128d __DEFAULT_FN_ATTRS
654	_mm_cmpngt_pd(__m128d __a, __m128d __b)
655	{
656	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
657	}
658
659	/// Compares each of the corresponding double-precision values of the
660	/// 128-bit vectors of [2 x double] to determine if the values in the first
661	/// operand are not greater than or equal to those in the second operand.
662	///
663	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
664	///
665	/// \headerfile <x86intrin.h>
666	///
667	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
668	///
669	/// \param __a
670	/// A 128-bit vector of [2 x double].
671	/// \param __b
672	/// A 128-bit vector of [2 x double].
673	/// \returns A 128-bit vector containing the comparison results.
674	static __inline__ __m128d __DEFAULT_FN_ATTRS
675	_mm_cmpnge_pd(__m128d __a, __m128d __b)
676	{
677	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
678	}
679
680	/// Compares the lower double-precision floating-point values in each of
681	/// the two 128-bit floating-point vectors of [2 x double] for equality.
682	///
683	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
684	///
685	/// \headerfile <x86intrin.h>
686	///
687	/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
688	///
689	/// \param __a
690	/// A 128-bit vector of [2 x double]. The lower double-precision value is
691	/// compared to the lower double-precision value of \a __b.
692	/// \param __b
693	/// A 128-bit vector of [2 x double]. The lower double-precision value is
694	/// compared to the lower double-precision value of \a __a.
695	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
696	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
697	static __inline__ __m128d __DEFAULT_FN_ATTRS
698	_mm_cmpeq_sd(__m128d __a, __m128d __b)
699	{
700	return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
701	}
702
703	/// Compares the lower double-precision floating-point values in each of
704	/// the two 128-bit floating-point vectors of [2 x double] to determine if
705	/// the value in the first parameter is less than the corresponding value in
706	/// the second parameter.
707	///
708	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
709	///
710	/// \headerfile <x86intrin.h>
711	///
712	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
713	///
714	/// \param __a
715	/// A 128-bit vector of [2 x double]. The lower double-precision value is
716	/// compared to the lower double-precision value of \a __b.
717	/// \param __b
718	/// A 128-bit vector of [2 x double]. The lower double-precision value is
719	/// compared to the lower double-precision value of \a __a.
720	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
721	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
722	static __inline__ __m128d __DEFAULT_FN_ATTRS
723	_mm_cmplt_sd(__m128d __a, __m128d __b)
724	{
725	return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
726	}
727
728	/// Compares the lower double-precision floating-point values in each of
729	/// the two 128-bit floating-point vectors of [2 x double] to determine if
730	/// the value in the first parameter is less than or equal to the
731	/// corresponding value in the second parameter.
732	///
733	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
734	///
735	/// \headerfile <x86intrin.h>
736	///
737	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
738	///
739	/// \param __a
740	/// A 128-bit vector of [2 x double]. The lower double-precision value is
741	/// compared to the lower double-precision value of \a __b.
742	/// \param __b
743	/// A 128-bit vector of [2 x double]. The lower double-precision value is
744	/// compared to the lower double-precision value of \a __a.
745	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
746	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
747	static __inline__ __m128d __DEFAULT_FN_ATTRS
748	_mm_cmple_sd(__m128d __a, __m128d __b)
749	{
750	return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
751	}
752
753	/// Compares the lower double-precision floating-point values in each of
754	/// the two 128-bit floating-point vectors of [2 x double] to determine if
755	/// the value in the first parameter is greater than the corresponding value
756	/// in the second parameter.
757	///
758	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
759	///
760	/// \headerfile <x86intrin.h>
761	///
762	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction
763	/// with the operands swapped (\a __a > \a __b is tested as \a __b < \a __a).
764	/// \param __a
765	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
766	/// the left-hand operand, compared to the lower value of \a __b.
767	/// \param __b
768	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
769	/// the right-hand operand, compared to the lower value of \a __a.
770	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
771	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
772	static __inline__ __m128d __DEFAULT_FN_ATTRS
773	_mm_cmpgt_sd(__m128d __a, __m128d __b)
774	{
775	__m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
776	return __extension__ (__m128d) { __c[0], __a[1] };
777	}
778
779	/// Compares the lower double-precision floating-point values in each of
780	/// the two 128-bit floating-point vectors of [2 x double] to determine if
781	/// the value in the first parameter is greater than or equal to the
782	/// corresponding value in the second parameter.
783	///
784	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
785	///
786	/// \headerfile <x86intrin.h>
787	///
788	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction
789	/// with the operands swapped (\a __a >= \a __b is tested as \a __b <= \a __a).
790	/// \param __a
791	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
792	/// the left-hand operand, compared to the lower value of \a __b.
793	/// \param __b
794	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
795	/// the right-hand operand, compared to the lower value of \a __a.
796	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
797	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
798	static __inline__ __m128d __DEFAULT_FN_ATTRS
799	_mm_cmpge_sd(__m128d __a, __m128d __b)
800	{
801	__m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
802	return __extension__ (__m128d) { __c[0], __a[1] };
803	}
804
805	/// Compares the lower double-precision floating-point values in each of
806	/// the two 128-bit floating-point vectors of [2 x double] to determine if
807	/// the value in the first parameter is "ordered" with respect to the
808	/// corresponding value in the second parameter.
809	///
810	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
811	/// of double-precision values is "ordered" with respect to each other if
812	/// neither value is a NaN.
813	///
814	/// \headerfile <x86intrin.h>
815	///
816	/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
817	///
818	/// \param __a
819	/// A 128-bit vector of [2 x double]. The lower double-precision value is
820	/// compared to the lower double-precision value of \a __b.
821	/// \param __b
822	/// A 128-bit vector of [2 x double]. The lower double-precision value is
823	/// compared to the lower double-precision value of \a __a.
824	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
825	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
826	static __inline__ __m128d __DEFAULT_FN_ATTRS
827	_mm_cmpord_sd(__m128d __a, __m128d __b)
828	{
829	return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
830	}
831
832	/// Compares the lower double-precision floating-point values in each of
833	/// the two 128-bit floating-point vectors of [2 x double] to determine if
834	/// the value in the first parameter is "unordered" with respect to the
835	/// corresponding value in the second parameter.
836	///
837	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
838	/// of double-precision values is "unordered" with respect to each other if
839	/// one or both values are NaN.
840	///
841	/// \headerfile <x86intrin.h>
842	///
843	/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
844	/// instruction.
845	///
846	/// \param __a
847	/// A 128-bit vector of [2 x double]. The lower double-precision value is
848	/// compared to the lower double-precision value of \a __b.
849	/// \param __b
850	/// A 128-bit vector of [2 x double]. The lower double-precision value is
851	/// compared to the lower double-precision value of \a __a.
852	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
853	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
854	static __inline__ __m128d __DEFAULT_FN_ATTRS
855	_mm_cmpunord_sd(__m128d __a, __m128d __b)
856	{
857	return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
858	}
859
860	/// Compares the lower double-precision floating-point values in each of
861	/// the two 128-bit floating-point vectors of [2 x double] to determine if
862	/// the value in the first parameter is unequal to the corresponding value in
863	/// the second parameter.
864	///
865	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
866	///
867	/// \headerfile <x86intrin.h>
868	///
869	/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
870	///
871	/// \param __a
872	/// A 128-bit vector of [2 x double]. The lower double-precision value is
873	/// compared to the lower double-precision value of \a __b.
874	/// \param __b
875	/// A 128-bit vector of [2 x double]. The lower double-precision value is
876	/// compared to the lower double-precision value of \a __a.
877	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
878	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
879	static __inline__ __m128d __DEFAULT_FN_ATTRS
880	_mm_cmpneq_sd(__m128d __a, __m128d __b)
881	{
882	return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
883	}
884
885	/// Compares the lower double-precision floating-point values in each of
886	/// the two 128-bit floating-point vectors of [2 x double] to determine if
887	/// the value in the first parameter is not less than the corresponding
888	/// value in the second parameter.
889	///
890	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
891	///
892	/// \headerfile <x86intrin.h>
893	///
894	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
895	///
896	/// \param __a
897	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
898	/// the left-hand operand, compared to the lower value of \a __b.
899	/// \param __b
900	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
901	/// the right-hand operand, compared to the lower value of \a __a.
902	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
903	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
904	static __inline__ __m128d __DEFAULT_FN_ATTRS
905	_mm_cmpnlt_sd(__m128d __a, __m128d __b)
906	{
907	return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
908	}
909
910	/// Compares the lower double-precision floating-point values in each of
911	/// the two 128-bit floating-point vectors of [2 x double] to determine if
912	/// the value in the first parameter is not less than or equal to the
913	/// corresponding value in the second parameter.
914	///
915	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
916	///
917	/// \headerfile <x86intrin.h>
918	///
919	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
920	///
921	/// \param __a
922	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
923	/// the left-hand operand, compared to the lower value of \a __b.
924	/// \param __b
925	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
926	/// the right-hand operand, compared to the lower value of \a __a.
927	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
928	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
929	static __inline__ __m128d __DEFAULT_FN_ATTRS
930	_mm_cmpnle_sd(__m128d __a, __m128d __b)
931	{
932	return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
933	}
934
935	/// Compares the lower double-precision floating-point values in each of
936	/// the two 128-bit floating-point vectors of [2 x double] to determine if
937	/// the value in the first parameter is not greater than the corresponding
938	/// value in the second parameter.
939	///
940	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
941	///
942	/// \headerfile <x86intrin.h>
943	///
944	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction
945	/// with the operands swapped (the test performed is !(\a __b < \a __a)).
946	/// \param __a
947	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
948	/// the left-hand operand, compared to the lower value of \a __b.
949	/// \param __b
950	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
951	/// the right-hand operand, compared to the lower value of \a __a.
952	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
953	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
954	static __inline__ __m128d __DEFAULT_FN_ATTRS
955	_mm_cmpngt_sd(__m128d __a, __m128d __b)
956	{
957	__m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
958	return __extension__ (__m128d) { __c[0], __a[1] };
959	}
960
961	/// Compares the lower double-precision floating-point values in each of
962	/// the two 128-bit floating-point vectors of [2 x double] to determine if
963	/// the value in the first parameter is not greater than or equal to the
964	/// corresponding value in the second parameter.
965	///
966	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
967	///
968	/// \headerfile <x86intrin.h>
969	///
970	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction
971	/// with the operands swapped (the test performed is !(\a __b <= \a __a)).
972	/// \param __a
973	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
974	/// the left-hand operand, compared to the lower value of \a __b.
975	/// \param __b
976	/// A 128-bit vector of [2 x double]. Its lower double-precision value is
977	/// the right-hand operand, compared to the lower value of \a __a.
978	/// \returns A 128-bit vector. The lower 64 bits contain the comparison
979	/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
980	static __inline__ __m128d __DEFAULT_FN_ATTRS
981	_mm_cmpnge_sd(__m128d __a, __m128d __b)
982	{
983	__m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
984	return __extension__ (__m128d) { __c[0], __a[1] };
985	}
986
987	/// Compares the lower double-precision floating-point values in each of
988	/// the two 128-bit floating-point vectors of [2 x double] for equality.
989	///
990	/// The comparison yields 0 for false, 1 for true. If either of the two
991	/// lower double-precision values is NaN, 0 is returned.
992	///
993	/// \headerfile <x86intrin.h>
994	///
995	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
996	///
997	/// \param __a
998	/// A 128-bit vector of [2 x double]. The lower double-precision value is
999	/// compared to the lower double-precision value of \a __b.
1000	/// \param __b
1001	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1002	/// compared to the lower double-precision value of \a __a.
1003	/// \returns 1 if the lower values of \a __a and \a __b are equal, 0
1004	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1005	static __inline__ int __DEFAULT_FN_ATTRS
1006	_mm_comieq_sd(__m128d __a, __m128d __b)
1007	{
1008	return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1009	}
1010
1011	/// Compares the lower double-precision floating-point values in each of
1012	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1013	/// the value in the first parameter is less than the corresponding value in
1014	/// the second parameter.
1015	///
1016	/// The comparison yields 0 for false, 1 for true. If either of the two
1017	/// lower double-precision values is NaN, 0 is returned.
1018	///
1019	/// \headerfile <x86intrin.h>
1020	///
1021	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1022	///
1023	/// \param __a
1024	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1025	/// compared to the lower double-precision value of \a __b.
1026	/// \param __b
1027	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1028	/// compared to the lower double-precision value of \a __a.
1029	/// \returns 1 if the lower value of \a __a is less than that of \a __b, 0
1030	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1031	static __inline__ int __DEFAULT_FN_ATTRS
1032	_mm_comilt_sd(__m128d __a, __m128d __b)
1033	{
1034	return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1035	}
1036
1037	/// Compares the lower double-precision floating-point values in each of
1038	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1039	/// the value in the first parameter is less than or equal to the
1040	/// corresponding value in the second parameter.
1041	///
1042	/// The comparison yields 0 for false, 1 for true. If either of the two
1043	/// lower double-precision values is NaN, 0 is returned.
1044	///
1045	/// \headerfile <x86intrin.h>
1046	///
1047	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1048	///
1049	/// \param __a
1050	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1051	/// compared to the lower double-precision value of \a __b.
1052	/// \param __b
1053	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1054	/// compared to the lower double-precision value of \a __a.
1055	/// \returns 1 if the lower value of \a __a is less than or equal to that of
1056	/// \a __b, 0 otherwise. If either lower value is NaN, 0 is returned.
1057	static __inline__ int __DEFAULT_FN_ATTRS
1058	_mm_comile_sd(__m128d __a, __m128d __b)
1059	{
1060	return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1061	}
1062
1063	/// Compares the lower double-precision floating-point values in each of
1064	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1065	/// the value in the first parameter is greater than the corresponding value
1066	/// in the second parameter.
1067	///
1068	/// The comparison yields 0 for false, 1 for true. If either of the two
1069	/// lower double-precision values is NaN, 0 is returned.
1070	///
1071	/// \headerfile <x86intrin.h>
1072	///
1073	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1074	///
1075	/// \param __a
1076	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1077	/// compared to the lower double-precision value of \a __b.
1078	/// \param __b
1079	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1080	/// compared to the lower double-precision value of \a __a.
1081	/// \returns 1 if the lower value of \a __a is greater than that of \a __b, 0
1082	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1083	static __inline__ int __DEFAULT_FN_ATTRS
1084	_mm_comigt_sd(__m128d __a, __m128d __b)
1085	{
1086	return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1087	}
1088
1089	/// Compares the lower double-precision floating-point values in each of
1090	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1091	/// the value in the first parameter is greater than or equal to the
1092	/// corresponding value in the second parameter.
1093	///
1094	/// The comparison yields 0 for false, 1 for true. If either of the two
1095	/// lower double-precision values is NaN, 0 is returned.
1096	///
1097	/// \headerfile <x86intrin.h>
1098	///
1099	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1100	///
1101	/// \param __a
1102	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1103	/// compared to the lower double-precision value of \a __b.
1104	/// \param __b
1105	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1106	/// compared to the lower double-precision value of \a __a.
1107	/// \returns 1 if the lower value of \a __a is greater than or equal to that
1108	/// of \a __b, 0 otherwise. If either lower value is NaN, 0 is returned.
1109	static __inline__ int __DEFAULT_FN_ATTRS
1110	_mm_comige_sd(__m128d __a, __m128d __b)
1111	{
1112	return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1113	}
1114
1115	/// Compares the lower double-precision floating-point values in each of
1116	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1117	/// the value in the first parameter is unequal to the corresponding value in
1118	/// the second parameter.
1119	///
1120	/// The comparison yields 0 for false, 1 for true. If either of the two
1121	/// lower double-precision values is NaN, 1 is returned.
1122	///
1123	/// \headerfile <x86intrin.h>
1124	///
1125	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1126	///
1127	/// \param __a
1128	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1129	/// compared to the lower double-precision value of \a __b.
1130	/// \param __b
1131	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1132	/// compared to the lower double-precision value of \a __a.
1133	/// \returns 1 if the lower values of \a __a and \a __b are unequal, 0
1134	/// otherwise. If either lower double-precision value is NaN, 1 is returned.
1135	static __inline__ int __DEFAULT_FN_ATTRS
1136	_mm_comineq_sd(__m128d __a, __m128d __b)
1137	{
1138	return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1139	}
1140
1141	/// Compares the lower double-precision floating-point values in each of
1142	/// the two 128-bit floating-point vectors of [2 x double] for equality. The
1143	/// comparison yields 0 for false, 1 for true.
1144	///
1145	/// If either of the two lower double-precision values is NaN, 0 is returned.
1146	///
1147	/// \headerfile <x86intrin.h>
1148	///
1149	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1150	///
1151	/// \param __a
1152	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1153	/// compared to the lower double-precision value of \a __b.
1154	/// \param __b
1155	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1156	/// compared to the lower double-precision value of \a __a.
1157	/// \returns 1 if the lower values of \a __a and \a __b are equal, 0
1158	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1159	static __inline__ int __DEFAULT_FN_ATTRS
1160	_mm_ucomieq_sd(__m128d __a, __m128d __b)
1161	{
1162	return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1163	}
1164
1165	/// Compares the lower double-precision floating-point values in each of
1166	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1167	/// the value in the first parameter is less than the corresponding value in
1168	/// the second parameter.
1169	///
1170	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1171	/// double-precision values is NaN, 0 is returned.
1172	///
1173	/// \headerfile <x86intrin.h>
1174	///
1175	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1176	///
1177	/// \param __a
1178	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1179	/// compared to the lower double-precision value of \a __b.
1180	/// \param __b
1181	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1182	/// compared to the lower double-precision value of \a __a.
1183	/// \returns 1 if the lower value of \a __a is less than that of \a __b, 0
1184	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1185	static __inline__ int __DEFAULT_FN_ATTRS
1186	_mm_ucomilt_sd(__m128d __a, __m128d __b)
1187	{
1188	return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1189	}
1190
1191	/// Compares the lower double-precision floating-point values in each of
1192	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1193	/// the value in the first parameter is less than or equal to the
1194	/// corresponding value in the second parameter.
1195	///
1196	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1197	/// double-precision values is NaN, 0 is returned.
1198	///
1199	/// \headerfile <x86intrin.h>
1200	///
1201	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202	///
1203	/// \param __a
1204	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1205	/// compared to the lower double-precision value of \a __b.
1206	/// \param __b
1207	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1208	/// compared to the lower double-precision value of \a __a.
1209	/// \returns 1 if the lower value of \a __a is less than or equal to that of
1210	/// \a __b, 0 otherwise. If either lower value is NaN, 0 is returned.
1211	static __inline__ int __DEFAULT_FN_ATTRS
1212	_mm_ucomile_sd(__m128d __a, __m128d __b)
1213	{
1214	return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1215	}
1216
1217	/// Compares the lower double-precision floating-point values in each of
1218	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1219	/// the value in the first parameter is greater than the corresponding value
1220	/// in the second parameter.
1221	///
1222	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1223	/// double-precision values is NaN, 0 is returned.
1224	///
1225	/// \headerfile <x86intrin.h>
1226	///
1227	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1228	///
1229	/// \param __a
1230	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1231	/// compared to the lower double-precision value of \a __b.
1232	/// \param __b
1233	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1234	/// compared to the lower double-precision value of \a __a.
1235	/// \returns 1 if the lower value of \a __a is greater than that of \a __b, 0
1236	/// otherwise. If either lower double-precision value is NaN, 0 is returned.
1237	static __inline__ int __DEFAULT_FN_ATTRS
1238	_mm_ucomigt_sd(__m128d __a, __m128d __b)
1239	{
1240	return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1241	}
1242
1243	/// Compares the lower double-precision floating-point values in each of
1244	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1245	/// the value in the first parameter is greater than or equal to the
1246	/// corresponding value in the second parameter.
1247	///
1248	/// The comparison yields 0 for false, 1 for true. If either of the two
1249	/// lower double-precision values is NaN, 0 is returned.
1250	///
1251	/// \headerfile <x86intrin.h>
1252	///
1253	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1254	///
1255	/// \param __a
1256	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1257	/// compared to the lower double-precision value of \a __b.
1258	/// \param __b
1259	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1260	/// compared to the lower double-precision value of \a __a.
1261	/// \returns 1 if the lower value of \a __a is greater than or equal to that
1262	/// of \a __b, 0 otherwise. If either lower value is NaN, 0 is returned.
1263	static __inline__ int __DEFAULT_FN_ATTRS
1264	_mm_ucomige_sd(__m128d __a, __m128d __b)
1265	{
1266	return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1267	}
1268
1269	/// Compares the lower double-precision floating-point values in each of
1270	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1271	/// the value in the first parameter is unequal to the corresponding value in
1272	/// the second parameter.
1273	///
1274	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1275	/// double-precision values is NaN, 1 is returned.
1276	///
1277	/// \headerfile <x86intrin.h>
1278	///
1279	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1280	///
1281	/// \param __a
1282	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1283	/// compared to the lower double-precision value of \a __b.
1284	/// \param __b
1285	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1286	/// compared to the lower double-precision value of \a __a.
1287	/// \returns 1 if the lower values of \a __a and \a __b are unequal, 0
1288	/// otherwise. If either lower double-precision value is NaN, 1 is returned.
1289	static __inline__ int __DEFAULT_FN_ATTRS
1290	_mm_ucomineq_sd(__m128d __a, __m128d __b)
1291	{
1292	return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1293	}
1294
1295	/// Converts the two double-precision floating-point elements of \a __a, a
1296	/// 128-bit vector of [2 x double], into two single-precision floating-point
1297	/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1298	/// The upper 64 bits of the result vector are set to zero.
1299	///
1300	/// \headerfile <x86intrin.h>
1301	///
1302	/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1303	///
1304	/// \param __a
1305	/// A 128-bit vector of [2 x double] holding the two values to convert.
1306	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1307	/// two converted values. The upper 64 bits are set to zero.
1308	static __inline__ __m128 __DEFAULT_FN_ATTRS
1309	_mm_cvtpd_ps(__m128d __a)
1310	{
1311	return __builtin_ia32_cvtpd2ps((__v2df)__a);
1312	}
1313
1314	/// Converts the lower two single-precision floating-point elements of a
1315	/// 128-bit vector of [4 x float] into two double-precision floating-point
1316	/// values, returned in a 128-bit vector of [2 x double]. Only elements 0
1317	/// and 1 of the input vector are read; the upper two elements are unused.
1318	///
1319	/// \headerfile <x86intrin.h>
1320	///
1321	/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1322	///
1323	/// \param __a
1324	/// A 128-bit vector of [4 x float]. The lower two single-precision
1325	/// floating-point elements are converted to double-precision values. The
1326	/// upper two elements are unused.
1327	/// \returns A 128-bit vector of [2 x double] containing the two converted
1328	static __inline__ __m128d __DEFAULT_FN_ATTRS
1329	_mm_cvtps_pd(__m128 __a)
1330	{
1331	return (__m128d) __builtin_convertvector(
1332	__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1333	}
1334
1335	/// Converts the lower two integer elements of a 128-bit vector of
1336	/// [4 x i32] into two double-precision floating-point values, returned in a
1337	/// 128-bit vector of [2 x double].
1338	///
1339	/// Only elements 0 and 1 of the input vector are read; the upper two
1340	/// elements are unused.
1341	/// \headerfile <x86intrin.h>
1342	///
1343	/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1344	///
1345	/// \param __a
1346	/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1347	/// converted to double-precision values.
1348	///
1349	/// The upper two elements are unused.
1350	/// \returns A 128-bit vector of [2 x double] containing the converted values.
1351	static __inline__ __m128d __DEFAULT_FN_ATTRS
1352	_mm_cvtepi32_pd(__m128i __a)
1353	{
1354	return (__m128d) __builtin_convertvector(
1355	__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1356	}
1357
1358	/// Converts the two double-precision floating-point elements of \a __a, a
1359	/// 128-bit vector of [2 x double], into two signed 32-bit integer values,
1360	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1361	/// 64 bits of the result vector are set to zero.
1362	///
1363	/// \headerfile <x86intrin.h>
1364	///
1365	/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1366	///
1367	/// \param __a
1368	/// A 128-bit vector of [2 x double] holding the two values to convert.
1369	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1370	/// two converted values. The upper 64 bits are set to zero.
1371	static __inline__ __m128i __DEFAULT_FN_ATTRS
1372	_mm_cvtpd_epi32(__m128d __a)
1373	{
1374	return __builtin_ia32_cvtpd2dq((__v2df)__a);
1375	}
1376
1377	/// Converts the low-order element of \a __a, a 128-bit vector of
1378	/// [2 x double], into a 32-bit signed integer value.
1379	///
1380	/// \headerfile <x86intrin.h>
1381	///
1382	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1383	///
1384	/// \param __a
1385	/// A 128-bit vector of [2 x double]. The lower 64 bits (element 0) are
1386	/// used in the conversion.
1387	/// \returns A 32-bit signed integer containing the converted value.
1388	static __inline__ int __DEFAULT_FN_ATTRS
1389	_mm_cvtsd_si32(__m128d __a)
1390	{
1391	return __builtin_ia32_cvtsd2si((__v2df)__a);
1392	}
1393
1394	/// Converts the lower double-precision floating-point element of a
1395	/// 128-bit vector of [2 x double], in the second parameter, into a
1396	/// single-precision floating-point value, returned in the lower 32 bits of a
1397	/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1398	/// copied from the upper 96 bits of the first parameter.
1399	///
1400	/// \headerfile <x86intrin.h>
1401	///
1402	/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1403	///
1404	/// \param __a
1405	/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1406	/// copied to the upper 96 bits of the result.
1407	/// \param __b
1408	/// A 128-bit vector of [2 x double]. The lower double-precision
1409	/// floating-point element is used in the conversion.
1410	/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1411	/// converted lower element of \a __b. The upper 96 bits are copied from the
1412	/// upper 96 bits of \a __a.
1413	static __inline__ __m128 __DEFAULT_FN_ATTRS
1414	_mm_cvtsd_ss(__m128 __a, __m128d __b)
1415	{
1416	return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1417	}
1418
1419	/// Converts a 32-bit signed integer value, in the second parameter, into
1420	/// a double-precision floating-point value, returned in the lower 64 bits of
1421	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1422	/// are copied from the upper 64 bits of the first parameter.
1423	///
1424	/// \headerfile <x86intrin.h>
1425	///
1426	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1427	///
1428	/// \param __a
1429	/// A 128-bit vector of [2 x double]. Element 0 of this by-value copy is
1430	/// overwritten; its upper 64 bits pass through to the result unchanged.
1431	/// \param __b
1432	/// A 32-bit signed integer; it is converted to double on assignment.
1433	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1434	/// converted value of \a __b. The upper 64 bits are copied from the upper
1435	/// 64 bits of \a __a.
1436	static __inline__ __m128d __DEFAULT_FN_ATTRS
1437	_mm_cvtsi32_sd(__m128d __a, int __b)
1438	{
1439	__a[0] = __b;
1440	return __a;
1441	}
1442
1443	/// Converts the lower single-precision floating-point element of a
1444	/// 128-bit vector of [4 x float], in the second parameter, into a
1445	/// double-precision floating-point value, returned in the lower 64 bits of
1446	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1447	/// are copied from the upper 64 bits of the first parameter.
1448	///
1449	/// \headerfile <x86intrin.h>
1450	///
1451	/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1452	///
1453	/// \param __a
1454	/// A 128-bit vector of [2 x double]. Element 0 of this by-value copy is
1455	/// overwritten; its upper 64 bits pass through to the result unchanged.
1456	/// \param __b
1457	/// A 128-bit vector of [4 x float]. Only element 0 is read; it is
1458	/// converted to double precision on assignment.
1459	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1460	/// converted lower element of \a __b. The upper 64 bits are copied from the
1461	/// upper 64 bits of \a __a.
1462	static __inline__ __m128d __DEFAULT_FN_ATTRS
1463	_mm_cvtss_sd(__m128d __a, __m128 __b)
1464	{
1465	__a[0] = __b[0];
1466	return __a;
1467	}
1468
1469	/// Converts the two double-precision floating-point elements of \a __a, a
1470	/// 128-bit vector of [2 x double], into two signed 32-bit integer values,
1471	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1472	///
1473	/// If the result of either conversion is inexact, the result is truncated
1474	/// (rounded towards zero) regardless of the current MXCSR setting. The upper
1475	/// 64 bits of the result vector are set to zero.
1476	///
1477	/// \headerfile <x86intrin.h>
1478	///
1479	/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1480	/// instruction.
1481	///
1482	/// \param __a
1483	/// A 128-bit vector of [2 x double] holding the two values to convert.
1484	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1485	/// two converted values. The upper 64 bits are set to zero.
1486	static __inline__ __m128i __DEFAULT_FN_ATTRS
1487	_mm_cvttpd_epi32(__m128d __a)
1488	{
1489	return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1490	}
1491
1492	/// Converts the low-order element of a [2 x double] vector into a 32-bit
1493	/// signed integer value, truncating the result when it is inexact.
1494	///
1495	/// \headerfile <x86intrin.h>
1496	///
1497	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1498	/// instruction.
1499	///
1500	/// \param __a
1501	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1502	/// conversion.
1503	/// \returns A 32-bit signed integer containing the converted value.
1504	static __inline__ int __DEFAULT_FN_ATTRS
1505	_mm_cvttsd_si32(__m128d __a)
1506	{
1507	return __builtin_ia32_cvttsd2si((__v2df)__a);
1508	}
1509
1510	/// Converts the two double-precision floating-point elements of a
1511	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1512	/// returned in a 64-bit vector of [2 x i32].
1513	///
1514	/// \headerfile <x86intrin.h>
1515	///
1516	/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1517	///
1518	/// \param __a
1519	/// A 128-bit vector of [2 x double].
1520	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1521	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1522	_mm_cvtpd_pi32(__m128d __a)
1523	{
1524	return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1525	}
1526
1527	/// Converts the two double-precision floating-point elements of a
1528	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1529	/// returned in a 64-bit vector of [2 x i32].
1530	///
1531	/// If the result of either conversion is inexact, the result is truncated
1532	/// (rounded towards zero) regardless of the current MXCSR setting.
1533	///
1534	/// \headerfile <x86intrin.h>
1535	///
1536	/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1537	///
1538	/// \param __a
1539	/// A 128-bit vector of [2 x double].
1540	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1541	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1542	_mm_cvttpd_pi32(__m128d __a)
1543	{
1544	return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1545	}
1546
1547	/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1548	/// [2 x i32] into two double-precision floating-point values, returned in a
1549	/// 128-bit vector of [2 x double].
1550	///
1551	/// \headerfile <x86intrin.h>
1552	///
1553	/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1554	///
1555	/// \param __a
1556	/// A 64-bit vector of [2 x i32].
1557	/// \returns A 128-bit vector of [2 x double] containing the converted values.
1558	static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
1559	_mm_cvtpi32_pd(__m64 __a)
1560	{
1561	return __builtin_ia32_cvtpi2pd((__v2si)__a);
1562	}
1563
1564	/// Returns the low-order element of a 128-bit vector of [2 x double] as
1565	/// a double-precision floating-point value.
1566	///
1567	/// \headerfile <x86intrin.h>
1568	///
1569	/// This intrinsic has no corresponding instruction.
1570	///
1571	/// \param __a
1572	/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1573	/// \returns A double-precision floating-point value copied from the lower 64
1574	/// bits of \a __a.
1575	static __inline__ double __DEFAULT_FN_ATTRS
1576	_mm_cvtsd_f64(__m128d __a)
1577	{
1578	return __a[0];
1579	}
1580
1581	/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1582	/// memory location.
1583	///
1584	/// \headerfile <x86intrin.h>
1585	///
1586	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1587	///
1588	/// \param __dp
1589	/// A pointer to a 128-bit memory location. The address of the memory
1590	/// location has to be 16-byte aligned.
1591	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1592	static __inline__ __m128d __DEFAULT_FN_ATTRS
1593	_mm_load_pd(double const *__dp)
1594	{
1595	return (__m128d)__dp;
1596	}
1597
1598	/// Loads a double-precision floating-point value from a specified memory
1599	/// location and duplicates it to both vector elements of a 128-bit vector of
1600	/// [2 x double].
1601	///
1602	/// \headerfile <x86intrin.h>
1603	///
1604	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1605	///
1606	/// \param __dp
1607	/// A pointer to a memory location containing a double-precision value.
1608	/// \returns A 128-bit vector of [2 x double] containing the loaded and
1609	/// duplicated values.
1610	static __inline__ __m128d __DEFAULT_FN_ATTRS
1611	_mm_load1_pd(double const *__dp)
1612	{
1613	struct __mm_load1_pd_struct {
1614	double __u;
1615	} __attribute__((__packed__, __may_alias__));
1616	double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1617	return __extension__ (__m128d){ __u, __u };
1618	}
1619
/// Alternate spelling of _mm_load1_pd(); the argument is forwarded unchanged.
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1621
1622	/// Loads two double-precision values, in reverse order, from an aligned
1623	/// memory location into a 128-bit vector of [2 x double].
1624	///
1625	/// \headerfile <x86intrin.h>
1626	///
1627	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1628	/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1629	/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1630	///
1631	/// \param __dp
1632	/// A 16-byte aligned pointer to an array of double-precision values to be
1633	/// loaded in reverse order.
1634	/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1635	/// values.
1636	static __inline__ __m128d __DEFAULT_FN_ATTRS
1637	_mm_loadr_pd(double const *__dp)
1638	{
1639	__m128d __u = (__m128d)__dp;
1640	return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1641	}
1642
1643	/// Loads a 128-bit floating-point vector of [2 x double] from an
1644	/// unaligned memory location.
1645	///
1646	/// \headerfile <x86intrin.h>
1647	///
1648	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1649	///
1650	/// \param __dp
1651	/// A pointer to a 128-bit memory location. The address of the memory
1652	/// location does not have to be aligned.
1653	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1654	static __inline__ __m128d __DEFAULT_FN_ATTRS
1655	_mm_loadu_pd(double const *__dp)
1656	{
1657	struct __loadu_pd {
1658	__m128d_u __v;
1659	} __attribute__((__packed__, __may_alias__));
1660	return ((struct __loadu_pd*)__dp)->__v;
1661	}
1662
1663	/// Loads a 64-bit integer value to the low element of a 128-bit integer
1664	/// vector and clears the upper element.
1665	///
1666	/// \headerfile <x86intrin.h>
1667	///
1668	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1669	///
1670	/// \param __a
1671	/// A pointer to a 64-bit memory location. The address of the memory
1672	/// location does not have to be aligned.
1673	/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1674	static __inline__ __m128i __DEFAULT_FN_ATTRS
1675	_mm_loadu_si64(void const *__a)
1676	{
1677	struct __loadu_si64 {
1678	long long __v;
1679	} __attribute__((__packed__, __may_alias__));
1680	long long __u = ((struct __loadu_si64*)__a)->__v;
1681	return __extension__ (__m128i)(__v2di){__u, 0LL};
1682	}
1683
1684	/// Loads a 32-bit integer value to the low element of a 128-bit integer
1685	/// vector and clears the upper element.
1686	///
1687	/// \headerfile <x86intrin.h>
1688	///
1689	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1690	///
1691	/// \param __a
1692	/// A pointer to a 32-bit memory location. The address of the memory
1693	/// location does not have to be aligned.
1694	/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1695	static __inline__ __m128i __DEFAULT_FN_ATTRS
1696	_mm_loadu_si32(void const *__a)
1697	{
1698	struct __loadu_si32 {
1699	int __v;
1700	} __attribute__((__packed__, __may_alias__));
1701	int __u = ((struct __loadu_si32*)__a)->__v;
1702	return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
1703	}
1704
1705	/// Loads a 16-bit integer value to the low element of a 128-bit integer
1706	/// vector and clears the upper element.
1707	///
1708	/// \headerfile <x86intrin.h>
1709	///
1710	/// This intrinsic does not correspond to a specific instruction.
1711	///
1712	/// \param __a
1713	/// A pointer to a 16-bit memory location. The address of the memory
1714	/// location does not have to be aligned.
1715	/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1716	static __inline__ __m128i __DEFAULT_FN_ATTRS
1717	_mm_loadu_si16(void const *__a)
1718	{
1719	struct __loadu_si16 {
1720	short __v;
1721	} __attribute__((__packed__, __may_alias__));
1722	short __u = ((struct __loadu_si16*)__a)->__v;
1723	return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1724	}
1725
1726	/// Loads a 64-bit double-precision value to the low element of a
1727	/// 128-bit integer vector and clears the upper element.
1728	///
1729	/// \headerfile <x86intrin.h>
1730	///
1731	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1732	///
1733	/// \param __dp
1734	/// A pointer to a memory location containing a double-precision value.
1735	/// The address of the memory location does not have to be aligned.
1736	/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1737	static __inline__ __m128d __DEFAULT_FN_ATTRS
1738	_mm_load_sd(double const *__dp)
1739	{
1740	struct __mm_load_sd_struct {
1741	double __u;
1742	} __attribute__((__packed__, __may_alias__));
1743	double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1744	return __extension__ (__m128d){ __u, 0 };
1745	}
1746
1747	/// Loads a double-precision value into the high-order bits of a 128-bit
1748	/// vector of [2 x double]. The low-order bits are copied from the low-order
1749	/// bits of the first operand.
1750	///
1751	/// \headerfile <x86intrin.h>
1752	///
1753	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1754	///
1755	/// \param __a
1756	/// A 128-bit vector of [2 x double]. \n
1757	/// Bits [63:0] are written to bits [63:0] of the result.
1758	/// \param __dp
1759	/// A pointer to a 64-bit memory location containing a double-precision
1760	/// floating-point value that is loaded. The loaded value is written to bits
1761	/// [127:64] of the result. The address of the memory location does not have
1762	/// to be aligned.
1763	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1764	static __inline__ __m128d __DEFAULT_FN_ATTRS
1765	_mm_loadh_pd(__m128d __a, double const *__dp)
1766	{
1767	struct __mm_loadh_pd_struct {
1768	double __u;
1769	} __attribute__((__packed__, __may_alias__));
1770	double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1771	return __extension__ (__m128d){ __a[0], __u };
1772	}
1773
1774	/// Loads a double-precision value into the low-order bits of a 128-bit
1775	/// vector of [2 x double]. The high-order bits are copied from the
1776	/// high-order bits of the first operand.
1777	///
1778	/// \headerfile <x86intrin.h>
1779	///
1780	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1781	///
1782	/// \param __a
1783	/// A 128-bit vector of [2 x double]. \n
1784	/// Bits [127:64] are written to bits [127:64] of the result.
1785	/// \param __dp
1786	/// A pointer to a 64-bit memory location containing a double-precision
1787	/// floating-point value that is loaded. The loaded value is written to bits
1788	/// [63:0] of the result. The address of the memory location does not have to
1789	/// be aligned.
1790	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1791	static __inline__ __m128d __DEFAULT_FN_ATTRS
1792	_mm_loadl_pd(__m128d __a, double const *__dp)
1793	{
1794	struct __mm_loadl_pd_struct {
1795	double __u;
1796	} __attribute__((__packed__, __may_alias__));
1797	double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1798	return __extension__ (__m128d){ __u, __a[1] };
1799	}
1800
1801	/// Constructs a 128-bit floating-point vector of [2 x double] with
1802	/// unspecified content. This could be used as an argument to another
1803	/// intrinsic function where the argument is required but the value is not
1804	/// actually used.
1805	///
1806	/// \headerfile <x86intrin.h>
1807	///
1808	/// This intrinsic has no corresponding instruction.
1809	///
1810	/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1811	/// content.
1812	static __inline__ __m128d __DEFAULT_FN_ATTRS
1813	_mm_undefined_pd(void)
1814	{
1815	return (__m128d)__builtin_ia32_undef128();
1816	}
1817
1818	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1819	/// 64 bits of the vector are initialized with the specified double-precision
1820	/// floating-point value. The upper 64 bits are set to zero.
1821	///
1822	/// \headerfile <x86intrin.h>
1823	///
1824	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1825	///
1826	/// \param __w
1827	/// A double-precision floating-point value used to initialize the lower 64
1828	/// bits of the result.
1829	/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1830	/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1831	/// set to zero.
1832	static __inline__ __m128d __DEFAULT_FN_ATTRS
1833	_mm_set_sd(double __w)
1834	{
1835	return __extension__ (__m128d){ __w, 0 };
1836	}
1837
1838	/// Constructs a 128-bit floating-point vector of [2 x double], with each
1839	/// of the two double-precision floating-point vector elements set to the
1840	/// specified double-precision floating-point value.
1841	///
1842	/// \headerfile <x86intrin.h>
1843	///
1844	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1845	///
1846	/// \param __w
1847	/// A double-precision floating-point value used to initialize each vector
1848	/// element of the result.
1849	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1850	static __inline__ __m128d __DEFAULT_FN_ATTRS
1851	_mm_set1_pd(double __w)
1852	{
1853	return __extension__ (__m128d){ __w, __w };
1854	}
1855
1856	/// Constructs a 128-bit floating-point vector of [2 x double], with each
1857	/// of the two double-precision floating-point vector elements set to the
1858	/// specified double-precision floating-point value.
1859	///
1860	/// \headerfile <x86intrin.h>
1861	///
1862	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1863	///
1864	/// \param __w
1865	/// A double-precision floating-point value used to initialize each vector
1866	/// element of the result.
1867	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1868	static __inline__ __m128d __DEFAULT_FN_ATTRS
1869	_mm_set_pd1(double __w)
1870	{
1871	return _mm_set1_pd(__w);
1872	}
1873
1874	/// Constructs a 128-bit floating-point vector of [2 x double]
1875	/// initialized with the specified double-precision floating-point values.
1876	///
1877	/// \headerfile <x86intrin.h>
1878	///
1879	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1880	///
1881	/// \param __w
1882	/// A double-precision floating-point value used to initialize the upper 64
1883	/// bits of the result.
1884	/// \param __x
1885	/// A double-precision floating-point value used to initialize the lower 64
1886	/// bits of the result.
1887	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1888	static __inline__ __m128d __DEFAULT_FN_ATTRS
1889	_mm_set_pd(double __w, double __x)
1890	{
1891	return __extension__ (__m128d){ __x, __w };
1892	}
1893
1894	/// Constructs a 128-bit floating-point vector of [2 x double],
1895	/// initialized in reverse order with the specified double-precision
1896	/// floating-point values.
1897	///
1898	/// \headerfile <x86intrin.h>
1899	///
1900	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1901	///
1902	/// \param __w
1903	/// A double-precision floating-point value used to initialize the lower 64
1904	/// bits of the result.
1905	/// \param __x
1906	/// A double-precision floating-point value used to initialize the upper 64
1907	/// bits of the result.
1908	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1909	static __inline__ __m128d __DEFAULT_FN_ATTRS
1910	_mm_setr_pd(double __w, double __x)
1911	{
1912	return __extension__ (__m128d){ __w, __x };
1913	}
1914
1915	/// Constructs a 128-bit floating-point vector of [2 x double]
1916	/// initialized to zero.
1917	///
1918	/// \headerfile <x86intrin.h>
1919	///
1920	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1921	///
1922	/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1923	/// all elements set to zero.
1924	static __inline__ __m128d __DEFAULT_FN_ATTRS
1925	_mm_setzero_pd(void)
1926	{
1927	return __extension__ (__m128d){ 0, 0 };
1928	}
1929
1930	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1931	/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1932	/// 64 bits are set to the upper 64 bits of the first parameter.
1933	///
1934	/// \headerfile <x86intrin.h>
1935	///
1936	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1937	///
1938	/// \param __a
1939	/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1940	/// upper 64 bits of the result.
1941	/// \param __b
1942	/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1943	/// lower 64 bits of the result.
1944	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1945	static __inline__ __m128d __DEFAULT_FN_ATTRS
1946	_mm_move_sd(__m128d __a, __m128d __b)
1947	{
1948	__a[0] = __b[0];
1949	return __a;
1950	}
1951
1952	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1953	/// memory location.
1954	///
1955	/// \headerfile <x86intrin.h>
1956	///
1957	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1958	///
1959	/// \param __dp
1960	/// A pointer to a 64-bit memory location.
1961	/// \param __a
1962	/// A 128-bit vector of [2 x double] containing the value to be stored.
1963	static __inline__ void __DEFAULT_FN_ATTRS
1964	_mm_store_sd(double *__dp, __m128d __a)
1965	{
1966	struct __mm_store_sd_struct {
1967	double __u;
1968	} __attribute__((__packed__, __may_alias__));
1969	((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1970	}
1971
1972	/// Moves packed double-precision values from a 128-bit vector of
1973	/// [2 x double] to a memory location.
1974	///
1975	/// \headerfile <x86intrin.h>
1976	///
1977	/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1978	///
1979	/// \param __dp
1980	/// A pointer to an aligned memory location that can store two
1981	/// double-precision values.
1982	/// \param __a
1983	/// A packed 128-bit vector of [2 x double] containing the values to be
1984	/// moved.
1985	static __inline__ void __DEFAULT_FN_ATTRS
1986	_mm_store_pd(double *__dp, __m128d __a)
1987	{
1988	(__m128d)__dp = __a;
1989	}
1990
1991	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1992	/// the upper and lower 64 bits of a memory location.
1993	///
1994	/// \headerfile <x86intrin.h>
1995	///
1996	/// This intrinsic corresponds to the
1997	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1998	///
1999	/// \param __dp
2000	/// A pointer to a memory location that can store two double-precision
2001	/// values.
2002	/// \param __a
2003	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2004	/// of the values in \a __dp.
2005	static __inline__ void __DEFAULT_FN_ATTRS
2006	_mm_store1_pd(double *__dp, __m128d __a)
2007	{
2008	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2009	_mm_store_pd(__dp, __a);
2010	}
2011
2012	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
2013	/// the upper and lower 64 bits of a memory location.
2014	///
2015	/// \headerfile <x86intrin.h>
2016	///
2017	/// This intrinsic corresponds to the
2018	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
2019	///
2020	/// \param __dp
2021	/// A pointer to a memory location that can store two double-precision
2022	/// values.
2023	/// \param __a
2024	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2025	/// of the values in \a __dp.
2026	static __inline__ void __DEFAULT_FN_ATTRS
2027	_mm_store_pd1(double *__dp, __m128d __a)
2028	{
2029	_mm_store1_pd(__dp, __a);
2030	}
2031
2032	/// Stores a 128-bit vector of [2 x double] into an unaligned memory
2033	/// location.
2034	///
2035	/// \headerfile <x86intrin.h>
2036	///
2037	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
2038	///
2039	/// \param __dp
2040	/// A pointer to a 128-bit memory location. The address of the memory
2041	/// location does not have to be aligned.
2042	/// \param __a
2043	/// A 128-bit vector of [2 x double] containing the values to be stored.
2044	static __inline__ void __DEFAULT_FN_ATTRS
2045	_mm_storeu_pd(double *__dp, __m128d __a)
2046	{
2047	struct __storeu_pd {
2048	__m128d_u __v;
2049	} __attribute__((__packed__, __may_alias__));
2050	((struct __storeu_pd*)__dp)->__v = __a;
2051	}
2052
2053	/// Stores two double-precision values, in reverse order, from a 128-bit
2054	/// vector of [2 x double] to a 16-byte aligned memory location.
2055	///
2056	/// \headerfile <x86intrin.h>
2057	///
2058	/// This intrinsic corresponds to a shuffling instruction followed by a
2059	/// <c> VMOVAPD / MOVAPD </c> instruction.
2060	///
2061	/// \param __dp
2062	/// A pointer to a 16-byte aligned memory location that can store two
2063	/// double-precision values.
2064	/// \param __a
2065	/// A 128-bit vector of [2 x double] containing the values to be reversed and
2066	/// stored.
2067	static __inline__ void __DEFAULT_FN_ATTRS
2068	_mm_storer_pd(double *__dp, __m128d __a)
2069	{
2070	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2071	(__m128d )__dp = __a;
2072	}
2073
2074	/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2075	/// memory location.
2076	///
2077	/// \headerfile <x86intrin.h>
2078	///
2079	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2080	///
2081	/// \param __dp
2082	/// A pointer to a 64-bit memory location.
2083	/// \param __a
2084	/// A 128-bit vector of [2 x double] containing the value to be stored.
2085	static __inline__ void __DEFAULT_FN_ATTRS
2086	_mm_storeh_pd(double *__dp, __m128d __a)
2087	{
2088	struct __mm_storeh_pd_struct {
2089	double __u;
2090	} __attribute__((__packed__, __may_alias__));
2091	((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2092	}
2093
2094	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2095	/// memory location.
2096	///
2097	/// \headerfile <x86intrin.h>
2098	///
2099	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2100	///
2101	/// \param __dp
2102	/// A pointer to a 64-bit memory location.
2103	/// \param __a
2104	/// A 128-bit vector of [2 x double] containing the value to be stored.
2105	static __inline__ void __DEFAULT_FN_ATTRS
2106	_mm_storel_pd(double *__dp, __m128d __a)
2107	{
2108	struct __mm_storeh_pd_struct {
2109	double __u;
2110	} __attribute__((__packed__, __may_alias__));
2111	((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2112	}
2113
2114	/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2115	/// saving the lower 8 bits of each sum in the corresponding element of a
2116	/// 128-bit result vector of [16 x i8].
2117	///
2118	/// The integer elements of both parameters can be either signed or unsigned.
2119	///
2120	/// \headerfile <x86intrin.h>
2121	///
2122	/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2123	///
2124	/// \param __a
2125	/// A 128-bit vector of [16 x i8].
2126	/// \param __b
2127	/// A 128-bit vector of [16 x i8].
2128	/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2129	/// parameters.
2130	static __inline__ __m128i __DEFAULT_FN_ATTRS
2131	_mm_add_epi8(__m128i __a, __m128i __b)
2132	{
2133	return (__m128i)((__v16qu)__a + (__v16qu)__b);
2134	}
2135
2136	/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2137	/// saving the lower 16 bits of each sum in the corresponding element of a
2138	/// 128-bit result vector of [8 x i16].
2139	///
2140	/// The integer elements of both parameters can be either signed or unsigned.
2141	///
2142	/// \headerfile <x86intrin.h>
2143	///
2144	/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2145	///
2146	/// \param __a
2147	/// A 128-bit vector of [8 x i16].
2148	/// \param __b
2149	/// A 128-bit vector of [8 x i16].
2150	/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2151	/// parameters.
2152	static __inline__ __m128i __DEFAULT_FN_ATTRS
2153	_mm_add_epi16(__m128i __a, __m128i __b)
2154	{
2155	return (__m128i)((__v8hu)__a + (__v8hu)__b);
2156	}
2157
2158	/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2159	/// saving the lower 32 bits of each sum in the corresponding element of a
2160	/// 128-bit result vector of [4 x i32].
2161	///
2162	/// The integer elements of both parameters can be either signed or unsigned.
2163	///
2164	/// \headerfile <x86intrin.h>
2165	///
2166	/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2167	///
2168	/// \param __a
2169	/// A 128-bit vector of [4 x i32].
2170	/// \param __b
2171	/// A 128-bit vector of [4 x i32].
2172	/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2173	/// parameters.
2174	static __inline__ __m128i __DEFAULT_FN_ATTRS
2175	_mm_add_epi32(__m128i __a, __m128i __b)
2176	{
2177	return (__m128i)((__v4su)__a + (__v4su)__b);
2178	}
2179
2180	/// Adds two signed or unsigned 64-bit integer values, returning the
2181	/// lower 64 bits of the sum.
2182	///
2183	/// \headerfile <x86intrin.h>
2184	///
2185	/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2186	///
2187	/// \param __a
2188	/// A 64-bit integer.
2189	/// \param __b
2190	/// A 64-bit integer.
2191	/// \returns A 64-bit integer containing the sum of both parameters.
2192	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2193	_mm_add_si64(__m64 __a, __m64 __b)
2194	{
2195	return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2196	}
2197
2198	/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2199	/// saving the lower 64 bits of each sum in the corresponding element of a
2200	/// 128-bit result vector of [2 x i64].
2201	///
2202	/// The integer elements of both parameters can be either signed or unsigned.
2203	///
2204	/// \headerfile <x86intrin.h>
2205	///
2206	/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2207	///
2208	/// \param __a
2209	/// A 128-bit vector of [2 x i64].
2210	/// \param __b
2211	/// A 128-bit vector of [2 x i64].
2212	/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2213	/// parameters.
2214	static __inline__ __m128i __DEFAULT_FN_ATTRS
2215	_mm_add_epi64(__m128i __a, __m128i __b)
2216	{
2217	return (__m128i)((__v2du)__a + (__v2du)__b);
2218	}
2219
2220	/// Adds, with saturation, the corresponding elements of two 128-bit
2221	/// signed [16 x i8] vectors, saving each sum in the corresponding element of
2222	/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2223	/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2224	///
2225	/// \headerfile <x86intrin.h>
2226	///
2227	/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2228	///
2229	/// \param __a
2230	/// A 128-bit signed [16 x i8] vector.
2231	/// \param __b
2232	/// A 128-bit signed [16 x i8] vector.
2233	/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2234	/// both parameters.
2235	static __inline__ __m128i __DEFAULT_FN_ATTRS
2236	_mm_adds_epi8(__m128i __a, __m128i __b)
2237	{
2238	return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2239	}
2240
2241	/// Adds, with saturation, the corresponding elements of two 128-bit
2242	/// signed [8 x i16] vectors, saving each sum in the corresponding element of
2243	/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2244	/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2245	/// 0x8000.
2246	///
2247	/// \headerfile <x86intrin.h>
2248	///
2249	/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2250	///
2251	/// \param __a
2252	/// A 128-bit signed [8 x i16] vector.
2253	/// \param __b
2254	/// A 128-bit signed [8 x i16] vector.
2255	/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2256	/// both parameters.
2257	static __inline__ __m128i __DEFAULT_FN_ATTRS
2258	_mm_adds_epi16(__m128i __a, __m128i __b)
2259	{
2260	return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2261	}
2262
2263	/// Adds, with saturation, the corresponding elements of two 128-bit
2264	/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2265	/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2266	/// are saturated to 0xFF. Negative sums are saturated to 0x00.
2267	///
2268	/// \headerfile <x86intrin.h>
2269	///
2270	/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2271	///
2272	/// \param __a
2273	/// A 128-bit unsigned [16 x i8] vector.
2274	/// \param __b
2275	/// A 128-bit unsigned [16 x i8] vector.
2276	/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2277	/// of both parameters.
2278	static __inline__ __m128i __DEFAULT_FN_ATTRS
2279	_mm_adds_epu8(__m128i __a, __m128i __b)
2280	{
2281	return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2282	}
2283
2284	/// Adds, with saturation, the corresponding elements of two 128-bit
2285	/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2286	/// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2287	/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2288	///
2289	/// \headerfile <x86intrin.h>
2290	///
2291	/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2292	///
2293	/// \param __a
2294	/// A 128-bit unsigned [8 x i16] vector.
2295	/// \param __b
2296	/// A 128-bit unsigned [8 x i16] vector.
2297	/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2298	/// of both parameters.
2299	static __inline__ __m128i __DEFAULT_FN_ATTRS
2300	_mm_adds_epu16(__m128i __a, __m128i __b)
2301	{
2302	return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2303	}
2304
2305	/// Computes the rounded avarages of corresponding elements of two
2306	/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2307	/// corresponding element of a 128-bit result vector of [16 x i8].
2308	///
2309	/// \headerfile <x86intrin.h>
2310	///
2311	/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2312	///
2313	/// \param __a
2314	/// A 128-bit unsigned [16 x i8] vector.
2315	/// \param __b
2316	/// A 128-bit unsigned [16 x i8] vector.
2317	/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2318	/// averages of both parameters.
2319	static __inline__ __m128i __DEFAULT_FN_ATTRS
2320	_mm_avg_epu8(__m128i __a, __m128i __b)
2321	{
2322	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
2323	return (__m128i)__builtin_convertvector(
2324	((__builtin_convertvector((__v16qu)__a, __v16hu) +
2325	__builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
2326	>> 1, __v16qu);
2327	}
2328
2329	/// Computes the rounded avarages of corresponding elements of two
2330	/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2331	/// corresponding element of a 128-bit result vector of [8 x i16].
2332	///
2333	/// \headerfile <x86intrin.h>
2334	///
2335	/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2336	///
2337	/// \param __a
2338	/// A 128-bit unsigned [8 x i16] vector.
2339	/// \param __b
2340	/// A 128-bit unsigned [8 x i16] vector.
2341	/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2342	/// averages of both parameters.
2343	static __inline__ __m128i __DEFAULT_FN_ATTRS
2344	_mm_avg_epu16(__m128i __a, __m128i __b)
2345	{
2346	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
2347	return (__m128i)__builtin_convertvector(
2348	((__builtin_convertvector((__v8hu)__a, __v8su) +
2349	__builtin_convertvector((__v8hu)__b, __v8su)) + 1)
2350	>> 1, __v8hu);
2351	}
2352
2353	/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2354	/// vectors, producing eight intermediate 32-bit signed integer products, and
2355	/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2356	/// [4 x i32] vector.
2357	///
2358	/// For example, bits [15:0] of both parameters are multiplied producing a
2359	/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2360	/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2361	/// of the result.
2362	///
2363	/// \headerfile <x86intrin.h>
2364	///
2365	/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2366	///
2367	/// \param __a
2368	/// A 128-bit signed [8 x i16] vector.
2369	/// \param __b
2370	/// A 128-bit signed [8 x i16] vector.
2371	/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2372	/// of both parameters.
2373	static __inline__ __m128i __DEFAULT_FN_ATTRS
2374	_mm_madd_epi16(__m128i __a, __m128i __b)
2375	{
2376	return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2377	}
2378
2379	/// Compares corresponding elements of two 128-bit signed [8 x i16]
2380	/// vectors, saving the greater value from each comparison in the
2381	/// corresponding element of a 128-bit result vector of [8 x i16].
2382	///
2383	/// \headerfile <x86intrin.h>
2384	///
2385	/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2386	///
2387	/// \param __a
2388	/// A 128-bit signed [8 x i16] vector.
2389	/// \param __b
2390	/// A 128-bit signed [8 x i16] vector.
2391	/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2392	/// each comparison.
2393	static __inline__ __m128i __DEFAULT_FN_ATTRS
2394	_mm_max_epi16(__m128i __a, __m128i __b)
2395	{
2396	return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2397	}
2398
2399	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2400	/// vectors, saving the greater value from each comparison in the
2401	/// corresponding element of a 128-bit result vector of [16 x i8].
2402	///
2403	/// \headerfile <x86intrin.h>
2404	///
2405	/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2406	///
2407	/// \param __a
2408	/// A 128-bit unsigned [16 x i8] vector.
2409	/// \param __b
2410	/// A 128-bit unsigned [16 x i8] vector.
2411	/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2412	/// each comparison.
2413	static __inline__ __m128i __DEFAULT_FN_ATTRS
2414	_mm_max_epu8(__m128i __a, __m128i __b)
2415	{
2416	return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2417	}
2418
2419	/// Compares corresponding elements of two 128-bit signed [8 x i16]
2420	/// vectors, saving the smaller value from each comparison in the
2421	/// corresponding element of a 128-bit result vector of [8 x i16].
2422	///
2423	/// \headerfile <x86intrin.h>
2424	///
2425	/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2426	///
2427	/// \param __a
2428	/// A 128-bit signed [8 x i16] vector.
2429	/// \param __b
2430	/// A 128-bit signed [8 x i16] vector.
2431	/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2432	/// each comparison.
2433	static __inline__ __m128i __DEFAULT_FN_ATTRS
2434	_mm_min_epi16(__m128i __a, __m128i __b)
2435	{
2436	return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2437	}
2438
2439	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2440	/// vectors, saving the smaller value from each comparison in the
2441	/// corresponding element of a 128-bit result vector of [16 x i8].
2442	///
2443	/// \headerfile <x86intrin.h>
2444	///
2445	/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2446	///
2447	/// \param __a
2448	/// A 128-bit unsigned [16 x i8] vector.
2449	/// \param __b
2450	/// A 128-bit unsigned [16 x i8] vector.
2451	/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2452	/// each comparison.
2453	static __inline__ __m128i __DEFAULT_FN_ATTRS
2454	_mm_min_epu8(__m128i __a, __m128i __b)
2455	{
2456	return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2457	}
2458
2459	/// Multiplies the corresponding elements of two signed [8 x i16]
2460	/// vectors, saving the upper 16 bits of each 32-bit product in the
2461	/// corresponding element of a 128-bit signed [8 x i16] result vector.
2462	///
2463	/// \headerfile <x86intrin.h>
2464	///
2465	/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2466	///
2467	/// \param __a
2468	/// A 128-bit signed [8 x i16] vector.
2469	/// \param __b
2470	/// A 128-bit signed [8 x i16] vector.
2471	/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2472	/// each of the eight 32-bit products.
2473	static __inline__ __m128i __DEFAULT_FN_ATTRS
2474	_mm_mulhi_epi16(__m128i __a, __m128i __b)
2475	{
2476	return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2477	}
2478
2479	/// Multiplies the corresponding elements of two unsigned [8 x i16]
2480	/// vectors, saving the upper 16 bits of each 32-bit product in the
2481	/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2482	///
2483	/// \headerfile <x86intrin.h>
2484	///
2485	/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2486	///
2487	/// \param __a
2488	/// A 128-bit unsigned [8 x i16] vector.
2489	/// \param __b
2490	/// A 128-bit unsigned [8 x i16] vector.
2491	/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2492	/// of each of the eight 32-bit products.
2493	static __inline__ __m128i __DEFAULT_FN_ATTRS
2494	_mm_mulhi_epu16(__m128i __a, __m128i __b)
2495	{
2496	return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2497	}
2498
2499	/// Multiplies the corresponding elements of two signed [8 x i16]
2500	/// vectors, saving the lower 16 bits of each 32-bit product in the
2501	/// corresponding element of a 128-bit signed [8 x i16] result vector.
2502	///
2503	/// \headerfile <x86intrin.h>
2504	///
2505	/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2506	///
2507	/// \param __a
2508	/// A 128-bit signed [8 x i16] vector.
2509	/// \param __b
2510	/// A 128-bit signed [8 x i16] vector.
2511	/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2512	/// each of the eight 32-bit products.
2513	static __inline__ __m128i __DEFAULT_FN_ATTRS
2514	_mm_mullo_epi16(__m128i __a, __m128i __b)
2515	{
2516	return (__m128i)((__v8hu)__a * (__v8hu)__b);
2517	}
2518
2519	/// Multiplies 32-bit unsigned integer values contained in the lower bits
2520	/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2521	/// product.
2522	///
2523	/// \headerfile <x86intrin.h>
2524	///
2525	/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2526	///
2527	/// \param __a
2528	/// A 64-bit integer containing one of the source operands.
2529	/// \param __b
2530	/// A 64-bit integer containing one of the source operands.
2531	/// \returns A 64-bit integer vector containing the product of both operands.
2532	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2533	_mm_mul_su32(__m64 __a, __m64 __b)
2534	{
2535	return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2536	}
2537
2538	/// Multiplies 32-bit unsigned integer values contained in the lower
2539	/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2540	/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2541	///
2542	/// \headerfile <x86intrin.h>
2543	///
2544	/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2545	///
2546	/// \param __a
2547	/// A [2 x i64] vector containing one of the source operands.
2548	/// \param __b
2549	/// A [2 x i64] vector containing one of the source operands.
2550	/// \returns A [2 x i64] vector containing the product of both operands.
2551	static __inline__ __m128i __DEFAULT_FN_ATTRS
2552	_mm_mul_epu32(__m128i __a, __m128i __b)
2553	{
2554	return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2555	}
2556
2557	/// Computes the absolute differences of corresponding 8-bit integer
2558	/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2559	/// separately sums the second 8 absolute differences. Packs these two
2560	/// unsigned 16-bit integer sums into the upper and lower elements of a
2561	/// [2 x i64] vector.
2562	///
2563	/// \headerfile <x86intrin.h>
2564	///
2565	/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2566	///
2567	/// \param __a
2568	/// A 128-bit integer vector containing one of the source operands.
2569	/// \param __b
2570	/// A 128-bit integer vector containing one of the source operands.
2571	/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2572	/// differences between both operands.
2573	static __inline__ __m128i __DEFAULT_FN_ATTRS
2574	_mm_sad_epu8(__m128i __a, __m128i __b)
2575	{
2576	return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2577	}
2578
2579	/// Subtracts the corresponding 8-bit integer values in the operands.
2580	///
2581	/// \headerfile <x86intrin.h>
2582	///
2583	/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2584	///
2585	/// \param __a
2586	/// A 128-bit integer vector containing the minuends.
2587	/// \param __b
2588	/// A 128-bit integer vector containing the subtrahends.
2589	/// \returns A 128-bit integer vector containing the differences of the values
2590	/// in the operands.
2591	static __inline__ __m128i __DEFAULT_FN_ATTRS
2592	_mm_sub_epi8(__m128i __a, __m128i __b)
2593	{
2594	return (__m128i)((__v16qu)__a - (__v16qu)__b);
2595	}
2596
2597	/// Subtracts the corresponding 16-bit integer values in the operands.
2598	///
2599	/// \headerfile <x86intrin.h>
2600	///
2601	/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2602	///
2603	/// \param __a
2604	/// A 128-bit integer vector containing the minuends.
2605	/// \param __b
2606	/// A 128-bit integer vector containing the subtrahends.
2607	/// \returns A 128-bit integer vector containing the differences of the values
2608	/// in the operands.
2609	static __inline__ __m128i __DEFAULT_FN_ATTRS
2610	_mm_sub_epi16(__m128i __a, __m128i __b)
2611	{
2612	return (__m128i)((__v8hu)__a - (__v8hu)__b);
2613	}
2614
2615	/// Subtracts the corresponding 32-bit integer values in the operands.
2616	///
2617	/// \headerfile <x86intrin.h>
2618	///
2619	/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2620	///
2621	/// \param __a
2622	/// A 128-bit integer vector containing the minuends.
2623	/// \param __b
2624	/// A 128-bit integer vector containing the subtrahends.
2625	/// \returns A 128-bit integer vector containing the differences of the values
2626	/// in the operands.
2627	static __inline__ __m128i __DEFAULT_FN_ATTRS
2628	_mm_sub_epi32(__m128i __a, __m128i __b)
2629	{
2630	return (__m128i)((__v4su)__a - (__v4su)__b);
2631	}
2632
2633	/// Subtracts signed or unsigned 64-bit integer values and writes the
2634	/// difference to the corresponding bits in the destination.
2635	///
2636	/// \headerfile <x86intrin.h>
2637	///
2638	/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2639	///
2640	/// \param __a
2641	/// A 64-bit integer vector containing the minuend.
2642	/// \param __b
2643	/// A 64-bit integer vector containing the subtrahend.
2644	/// \returns A 64-bit integer vector containing the difference of the values in
2645	/// the operands.
2646	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2647	_mm_sub_si64(__m64 __a, __m64 __b)
2648	{
2649	return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2650	}
2651
2652	/// Subtracts the corresponding elements of two [2 x i64] vectors.
2653	///
2654	/// \headerfile <x86intrin.h>
2655	///
2656	/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2657	///
2658	/// \param __a
2659	/// A 128-bit integer vector containing the minuends.
2660	/// \param __b
2661	/// A 128-bit integer vector containing the subtrahends.
2662	/// \returns A 128-bit integer vector containing the differences of the values
2663	/// in the operands.
2664	static __inline__ __m128i __DEFAULT_FN_ATTRS
2665	_mm_sub_epi64(__m128i __a, __m128i __b)
2666	{
2667	return (__m128i)((__v2du)__a - (__v2du)__b);
2668	}
2669
2670	/// Subtracts corresponding 8-bit signed integer values in the input and
2671	/// returns the differences in the corresponding bytes in the destination.
2672	/// Differences greater than 0x7F are saturated to 0x7F, and differences less
2673	/// than 0x80 are saturated to 0x80.
2674	///
2675	/// \headerfile <x86intrin.h>
2676	///
2677	/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2678	///
2679	/// \param __a
2680	/// A 128-bit integer vector containing the minuends.
2681	/// \param __b
2682	/// A 128-bit integer vector containing the subtrahends.
2683	/// \returns A 128-bit integer vector containing the differences of the values
2684	/// in the operands.
2685	static __inline__ __m128i __DEFAULT_FN_ATTRS
2686	_mm_subs_epi8(__m128i __a, __m128i __b)
2687	{
2688	return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2689	}
2690
2691	/// Subtracts corresponding 16-bit signed integer values in the input and
2692	/// returns the differences in the corresponding bytes in the destination.
2693	/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2694	/// than 0x8000 are saturated to 0x8000.
2695	///
2696	/// \headerfile <x86intrin.h>
2697	///
2698	/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2699	///
2700	/// \param __a
2701	/// A 128-bit integer vector containing the minuends.
2702	/// \param __b
2703	/// A 128-bit integer vector containing the subtrahends.
2704	/// \returns A 128-bit integer vector containing the differences of the values
2705	/// in the operands.
2706	static __inline__ __m128i __DEFAULT_FN_ATTRS
2707	_mm_subs_epi16(__m128i __a, __m128i __b)
2708	{
2709	return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2710	}
2711
2712	/// Subtracts corresponding 8-bit unsigned integer values in the input
2713	/// and returns the differences in the corresponding bytes in the
2714	/// destination. Differences less than 0x00 are saturated to 0x00.
2715	///
2716	/// \headerfile <x86intrin.h>
2717	///
2718	/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2719	///
2720	/// \param __a
2721	/// A 128-bit integer vector containing the minuends.
2722	/// \param __b
2723	/// A 128-bit integer vector containing the subtrahends.
2724	/// \returns A 128-bit integer vector containing the unsigned integer
2725	/// differences of the values in the operands.
2726	static __inline__ __m128i __DEFAULT_FN_ATTRS
2727	_mm_subs_epu8(__m128i __a, __m128i __b)
2728	{
2729	return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2730	}
2731
2732	/// Subtracts corresponding 16-bit unsigned integer values in the input
2733	/// and returns the differences in the corresponding bytes in the
2734	/// destination. Differences less than 0x0000 are saturated to 0x0000.
2735	///
2736	/// \headerfile <x86intrin.h>
2737	///
2738	/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2739	///
2740	/// \param __a
2741	/// A 128-bit integer vector containing the minuends.
2742	/// \param __b
2743	/// A 128-bit integer vector containing the subtrahends.
2744	/// \returns A 128-bit integer vector containing the unsigned integer
2745	/// differences of the values in the operands.
2746	static __inline__ __m128i __DEFAULT_FN_ATTRS
2747	_mm_subs_epu16(__m128i __a, __m128i __b)
2748	{
2749	return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2750	}
2751
2752	/// Performs a bitwise AND of two 128-bit integer vectors.
2753	///
2754	/// \headerfile <x86intrin.h>
2755	///
2756	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2757	///
2758	/// \param __a
2759	/// A 128-bit integer vector containing one of the source operands.
2760	/// \param __b
2761	/// A 128-bit integer vector containing one of the source operands.
2762	/// \returns A 128-bit integer vector containing the bitwise AND of the values
2763	/// in both operands.
2764	static __inline__ __m128i __DEFAULT_FN_ATTRS
2765	_mm_and_si128(__m128i __a, __m128i __b)
2766	{
2767	return (__m128i)((__v2du)__a & (__v2du)__b);
2768	}
2769
2770	/// Performs a bitwise AND of two 128-bit integer vectors, using the
2771	/// one's complement of the values contained in the first source operand.
2772	///
2773	/// \headerfile <x86intrin.h>
2774	///
2775	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2776	///
2777	/// \param __a
2778	/// A 128-bit vector containing the left source operand. The one's complement
2779	/// of this value is used in the bitwise AND.
2780	/// \param __b
2781	/// A 128-bit vector containing the right source operand.
2782	/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2783	/// complement of the first operand and the values in the second operand.
2784	static __inline__ __m128i __DEFAULT_FN_ATTRS
2785	_mm_andnot_si128(__m128i __a, __m128i __b)
2786	{
2787	return (__m128i)(~(__v2du)__a & (__v2du)__b);
2788	}
2789	/// Performs a bitwise OR of two 128-bit integer vectors.
2790	///
2791	/// \headerfile <x86intrin.h>
2792	///
2793	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2794	///
2795	/// \param __a
2796	/// A 128-bit integer vector containing one of the source operands.
2797	/// \param __b
2798	/// A 128-bit integer vector containing one of the source operands.
2799	/// \returns A 128-bit integer vector containing the bitwise OR of the values
2800	/// in both operands.
2801	static __inline__ __m128i __DEFAULT_FN_ATTRS
2802	_mm_or_si128(__m128i __a, __m128i __b)
2803	{
2804	return (__m128i)((__v2du)__a \| (__v2du)__b);
2805	}
2806
2807	/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2808	///
2809	/// \headerfile <x86intrin.h>
2810	///
2811	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2812	///
2813	/// \param __a
2814	/// A 128-bit integer vector containing one of the source operands.
2815	/// \param __b
2816	/// A 128-bit integer vector containing one of the source operands.
2817	/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2818	/// values in both operands.
2819	static __inline__ __m128i __DEFAULT_FN_ATTRS
2820	_mm_xor_si128(__m128i __a, __m128i __b)
2821	{
2822	return (__m128i)((__v2du)__a ^ (__v2du)__b);
2823	}
2824
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
///
/// The expansion is fully parenthesized so that it behaves as a single
/// primary expression wherever the macro is used.
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2844
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Expands to the same builtin call as _mm_slli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
///
/// The expansion is fully parenthesized so that it behaves as a single
/// primary expression wherever the macro is used.
#define _mm_bslli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2847
2848	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2849	/// by the specified number of bits. Low-order bits are cleared.
2850	///
2851	/// \headerfile <x86intrin.h>
2852	///
2853	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2854	///
2855	/// \param __a
2856	/// A 128-bit integer vector containing the source operand.
2857	/// \param __count
2858	/// An integer value specifying the number of bits to left-shift each value
2859	/// in operand \a __a.
2860	/// \returns A 128-bit integer vector containing the left-shifted values.
2861	static __inline__ __m128i __DEFAULT_FN_ATTRS
2862	_mm_slli_epi16(__m128i __a, int __count)
2863	{
2864	return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2865	}
2866
2867	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2868	/// by the specified number of bits. Low-order bits are cleared.
2869	///
2870	/// \headerfile <x86intrin.h>
2871	///
2872	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2873	///
2874	/// \param __a
2875	/// A 128-bit integer vector containing the source operand.
2876	/// \param __count
2877	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2878	/// to left-shift each value in operand \a __a.
2879	/// \returns A 128-bit integer vector containing the left-shifted values.
2880	static __inline__ __m128i __DEFAULT_FN_ATTRS
2881	_mm_sll_epi16(__m128i __a, __m128i __count)
2882	{
2883	return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2884	}
2885
2886	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2887	/// by the specified number of bits. Low-order bits are cleared.
2888	///
2889	/// \headerfile <x86intrin.h>
2890	///
2891	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2892	///
2893	/// \param __a
2894	/// A 128-bit integer vector containing the source operand.
2895	/// \param __count
2896	/// An integer value specifying the number of bits to left-shift each value
2897	/// in operand \a __a.
2898	/// \returns A 128-bit integer vector containing the left-shifted values.
2899	static __inline__ __m128i __DEFAULT_FN_ATTRS
2900	_mm_slli_epi32(__m128i __a, int __count)
2901	{
2902	return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2903	}
2904
2905	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2906	/// by the specified number of bits. Low-order bits are cleared.
2907	///
2908	/// \headerfile <x86intrin.h>
2909	///
2910	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2911	///
2912	/// \param __a
2913	/// A 128-bit integer vector containing the source operand.
2914	/// \param __count
2915	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2916	/// to left-shift each value in operand \a __a.
2917	/// \returns A 128-bit integer vector containing the left-shifted values.
2918	static __inline__ __m128i __DEFAULT_FN_ATTRS
2919	_mm_sll_epi32(__m128i __a, __m128i __count)
2920	{
2921	return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2922	}
2923
2924	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2925	/// by the specified number of bits. Low-order bits are cleared.
2926	///
2927	/// \headerfile <x86intrin.h>
2928	///
2929	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2930	///
2931	/// \param __a
2932	/// A 128-bit integer vector containing the source operand.
2933	/// \param __count
2934	/// An integer value specifying the number of bits to left-shift each value
2935	/// in operand \a __a.
2936	/// \returns A 128-bit integer vector containing the left-shifted values.
2937	static __inline__ __m128i __DEFAULT_FN_ATTRS
2938	_mm_slli_epi64(__m128i __a, int __count)
2939	{
2940	return __builtin_ia32_psllqi128((__v2di)__a, __count);
2941	}
2942
2943	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2944	/// by the specified number of bits. Low-order bits are cleared.
2945	///
2946	/// \headerfile <x86intrin.h>
2947	///
2948	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2949	///
2950	/// \param __a
2951	/// A 128-bit integer vector containing the source operand.
2952	/// \param __count
2953	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2954	/// to left-shift each value in operand \a __a.
2955	/// \returns A 128-bit integer vector containing the left-shifted values.
2956	static __inline__ __m128i __DEFAULT_FN_ATTRS
2957	_mm_sll_epi64(__m128i __a, __m128i __count)
2958	{
2959	return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2960	}
2961
2962	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2963	/// by the specified number of bits. High-order bits are filled with the sign
2964	/// bit of the initial value.
2965	///
2966	/// \headerfile <x86intrin.h>
2967	///
2968	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2969	///
2970	/// \param __a
2971	/// A 128-bit integer vector containing the source operand.
2972	/// \param __count
2973	/// An integer value specifying the number of bits to right-shift each value
2974	/// in operand \a __a.
2975	/// \returns A 128-bit integer vector containing the right-shifted values.
2976	static __inline__ __m128i __DEFAULT_FN_ATTRS
2977	_mm_srai_epi16(__m128i __a, int __count)
2978	{
2979	return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2980	}
2981
2982	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2983	/// by the specified number of bits. High-order bits are filled with the sign
2984	/// bit of the initial value.
2985	///
2986	/// \headerfile <x86intrin.h>
2987	///
2988	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2989	///
2990	/// \param __a
2991	/// A 128-bit integer vector containing the source operand.
2992	/// \param __count
2993	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2994	/// to right-shift each value in operand \a __a.
2995	/// \returns A 128-bit integer vector containing the right-shifted values.
2996	static __inline__ __m128i __DEFAULT_FN_ATTRS
2997	_mm_sra_epi16(__m128i __a, __m128i __count)
2998	{
2999	return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
3000	}
3001
3002	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
3003	/// by the specified number of bits. High-order bits are filled with the sign
3004	/// bit of the initial value.
3005	///
3006	/// \headerfile <x86intrin.h>
3007	///
3008	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3009	///
3010	/// \param __a
3011	/// A 128-bit integer vector containing the source operand.
3012	/// \param __count
3013	/// An integer value specifying the number of bits to right-shift each value
3014	/// in operand \a __a.
3015	/// \returns A 128-bit integer vector containing the right-shifted values.
3016	static __inline__ __m128i __DEFAULT_FN_ATTRS
3017	_mm_srai_epi32(__m128i __a, int __count)
3018	{
3019	return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
3020	}
3021
3022	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
3023	/// by the specified number of bits. High-order bits are filled with the sign
3024	/// bit of the initial value.
3025	///
3026	/// \headerfile <x86intrin.h>
3027	///
3028	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3029	///
3030	/// \param __a
3031	/// A 128-bit integer vector containing the source operand.
3032	/// \param __count
3033	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3034	/// to right-shift each value in operand \a __a.
3035	/// \returns A 128-bit integer vector containing the right-shifted values.
3036	static __inline__ __m128i __DEFAULT_FN_ATTRS
3037	_mm_sra_epi32(__m128i __a, __m128i __count)
3038	{
3039	return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
3040	}
3041
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
///
/// The expansion is fully parenthesized so that it behaves as a single
/// primary expression wherever the macro is used.
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3061
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Expands to the same builtin call as _mm_srli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
///
/// The expansion is fully parenthesized so that it behaves as a single
/// primary expression wherever the macro is used.
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3064
3065	/// Right-shifts each of 16-bit values in the 128-bit integer vector
3066	/// operand by the specified number of bits. High-order bits are cleared.
3067	///
3068	/// \headerfile <x86intrin.h>
3069	///
3070	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3071	///
3072	/// \param __a
3073	/// A 128-bit integer vector containing the source operand.
3074	/// \param __count
3075	/// An integer value specifying the number of bits to right-shift each value
3076	/// in operand \a __a.
3077	/// \returns A 128-bit integer vector containing the right-shifted values.
3078	static __inline__ __m128i __DEFAULT_FN_ATTRS
3079	_mm_srli_epi16(__m128i __a, int __count)
3080	{
3081	return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3082	}
3083
3084	/// Right-shifts each of 16-bit values in the 128-bit integer vector
3085	/// operand by the specified number of bits. High-order bits are cleared.
3086	///
3087	/// \headerfile <x86intrin.h>
3088	///
3089	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3090	///
3091	/// \param __a
3092	/// A 128-bit integer vector containing the source operand.
3093	/// \param __count
3094	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3095	/// to right-shift each value in operand \a __a.
3096	/// \returns A 128-bit integer vector containing the right-shifted values.
3097	static __inline__ __m128i __DEFAULT_FN_ATTRS
3098	_mm_srl_epi16(__m128i __a, __m128i __count)
3099	{
3100	return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3101	}
3102
3103	/// Right-shifts each of 32-bit values in the 128-bit integer vector
3104	/// operand by the specified number of bits. High-order bits are cleared.
3105	///
3106	/// \headerfile <x86intrin.h>
3107	///
3108	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3109	///
3110	/// \param __a
3111	/// A 128-bit integer vector containing the source operand.
3112	/// \param __count
3113	/// An integer value specifying the number of bits to right-shift each value
3114	/// in operand \a __a.
3115	/// \returns A 128-bit integer vector containing the right-shifted values.
3116	static __inline__ __m128i __DEFAULT_FN_ATTRS
3117	_mm_srli_epi32(__m128i __a, int __count)
3118	{
3119	return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3120	}
3121
3122	/// Right-shifts each of 32-bit values in the 128-bit integer vector
3123	/// operand by the specified number of bits. High-order bits are cleared.
3124	///
3125	/// \headerfile <x86intrin.h>
3126	///
3127	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3128	///
3129	/// \param __a
3130	/// A 128-bit integer vector containing the source operand.
3131	/// \param __count
3132	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3133	/// to right-shift each value in operand \a __a.
3134	/// \returns A 128-bit integer vector containing the right-shifted values.
3135	static __inline__ __m128i __DEFAULT_FN_ATTRS
3136	_mm_srl_epi32(__m128i __a, __m128i __count)
3137	{
3138	return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3139	}
3140
3141	/// Right-shifts each of 64-bit values in the 128-bit integer vector
3142	/// operand by the specified number of bits. High-order bits are cleared.
3143	///
3144	/// \headerfile <x86intrin.h>
3145	///
3146	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3147	///
3148	/// \param __a
3149	/// A 128-bit integer vector containing the source operand.
3150	/// \param __count
3151	/// An integer value specifying the number of bits to right-shift each value
3152	/// in operand \a __a.
3153	/// \returns A 128-bit integer vector containing the right-shifted values.
3154	static __inline__ __m128i __DEFAULT_FN_ATTRS
3155	_mm_srli_epi64(__m128i __a, int __count)
3156	{
3157	return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3158	}
3159
3160	/// Right-shifts each of 64-bit values in the 128-bit integer vector
3161	/// operand by the specified number of bits. High-order bits are cleared.
3162	///
3163	/// \headerfile <x86intrin.h>
3164	///
3165	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3166	///
3167	/// \param __a
3168	/// A 128-bit integer vector containing the source operand.
3169	/// \param __count
3170	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3171	/// to right-shift each value in operand \a __a.
3172	/// \returns A 128-bit integer vector containing the right-shifted values.
3173	static __inline__ __m128i __DEFAULT_FN_ATTRS
3174	_mm_srl_epi64(__m128i __a, __m128i __count)
3175	{
3176	return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3177	}
3178
3179	/// Compares each of the corresponding 8-bit values of the 128-bit
3180	/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3181	/// for true.
3182	///
3183	/// \headerfile <x86intrin.h>
3184	///
3185	/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3186	///
3187	/// \param __a
3188	/// A 128-bit integer vector.
3189	/// \param __b
3190	/// A 128-bit integer vector.
3191	/// \returns A 128-bit integer vector containing the comparison results.
3192	static __inline__ __m128i __DEFAULT_FN_ATTRS
3193	_mm_cmpeq_epi8(__m128i __a, __m128i __b)
3194	{
3195	return (__m128i)((__v16qi)__a == (__v16qi)__b);
3196	}
3197
3198	/// Compares each of the corresponding 16-bit values of the 128-bit
3199	/// integer vectors for equality. Each comparison yields 0x0 for false,
3200	/// 0xFFFF for true.
3201	///
3202	/// \headerfile <x86intrin.h>
3203	///
3204	/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3205	///
3206	/// \param __a
3207	/// A 128-bit integer vector.
3208	/// \param __b
3209	/// A 128-bit integer vector.
3210	/// \returns A 128-bit integer vector containing the comparison results.
3211	static __inline__ __m128i __DEFAULT_FN_ATTRS
3212	_mm_cmpeq_epi16(__m128i __a, __m128i __b)
3213	{
3214	return (__m128i)((__v8hi)__a == (__v8hi)__b);
3215	}
3216
3217	/// Compares each of the corresponding 32-bit values of the 128-bit
3218	/// integer vectors for equality. Each comparison yields 0x0 for false,
3219	/// 0xFFFFFFFF for true.
3220	///
3221	/// \headerfile <x86intrin.h>
3222	///
3223	/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3224	///
3225	/// \param __a
3226	/// A 128-bit integer vector.
3227	/// \param __b
3228	/// A 128-bit integer vector.
3229	/// \returns A 128-bit integer vector containing the comparison results.
3230	static __inline__ __m128i __DEFAULT_FN_ATTRS
3231	_mm_cmpeq_epi32(__m128i __a, __m128i __b)
3232	{
3233	return (__m128i)((__v4si)__a == (__v4si)__b);
3234	}
3235
3236	/// Compares each of the corresponding signed 8-bit values of the 128-bit
3237	/// integer vectors to determine if the values in the first operand are
3238	/// greater than those in the second operand. Each comparison yields 0x0 for
3239	/// false, 0xFF for true.
3240	///
3241	/// \headerfile <x86intrin.h>
3242	///
3243	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3244	///
3245	/// \param __a
3246	/// A 128-bit integer vector.
3247	/// \param __b
3248	/// A 128-bit integer vector.
3249	/// \returns A 128-bit integer vector containing the comparison results.
3250	static __inline__ __m128i __DEFAULT_FN_ATTRS
3251	_mm_cmpgt_epi8(__m128i __a, __m128i __b)
3252	{
3253	/* This function always performs a signed comparison, but __v16qi is a char
3254	which may be signed or unsigned, so use __v16qs. */
3255	return (__m128i)((__v16qs)__a > (__v16qs)__b);
3256	}
3257
3258	/// Compares each of the corresponding signed 16-bit values of the
3259	/// 128-bit integer vectors to determine if the values in the first operand
3260	/// are greater than those in the second operand.
3261	///
3262	/// Each comparison yields 0x0 for false, 0xFFFF for true.
3263	///
3264	/// \headerfile <x86intrin.h>
3265	///
3266	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3267	///
3268	/// \param __a
3269	/// A 128-bit integer vector.
3270	/// \param __b
3271	/// A 128-bit integer vector.
3272	/// \returns A 128-bit integer vector containing the comparison results.
3273	static __inline__ __m128i __DEFAULT_FN_ATTRS
3274	_mm_cmpgt_epi16(__m128i __a, __m128i __b)
3275	{
3276	return (__m128i)((__v8hi)__a > (__v8hi)__b);
3277	}
3278
3279	/// Compares each of the corresponding signed 32-bit values of the
3280	/// 128-bit integer vectors to determine if the values in the first operand
3281	/// are greater than those in the second operand.
3282	///
3283	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3284	///
3285	/// \headerfile <x86intrin.h>
3286	///
3287	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3288	///
3289	/// \param __a
3290	/// A 128-bit integer vector.
3291	/// \param __b
3292	/// A 128-bit integer vector.
3293	/// \returns A 128-bit integer vector containing the comparison results.
3294	static __inline__ __m128i __DEFAULT_FN_ATTRS
3295	_mm_cmpgt_epi32(__m128i __a, __m128i __b)
3296	{
3297	return (__m128i)((__v4si)__a > (__v4si)__b);
3298	}
3299
3300	/// Compares each of the corresponding signed 8-bit values of the 128-bit
3301	/// integer vectors to determine if the values in the first operand are less
3302	/// than those in the second operand.
3303	///
3304	/// Each comparison yields 0x0 for false, 0xFF for true.
3305	///
3306	/// \headerfile <x86intrin.h>
3307	///
3308	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3309	///
3310	/// \param __a
3311	/// A 128-bit integer vector.
3312	/// \param __b
3313	/// A 128-bit integer vector.
3314	/// \returns A 128-bit integer vector containing the comparison results.
3315	static __inline__ __m128i __DEFAULT_FN_ATTRS
3316	_mm_cmplt_epi8(__m128i __a, __m128i __b)
3317	{
3318	return _mm_cmpgt_epi8(__b, __a);
3319	}
3320
3321	/// Compares each of the corresponding signed 16-bit values of the
3322	/// 128-bit integer vectors to determine if the values in the first operand
3323	/// are less than those in the second operand.
3324	///
3325	/// Each comparison yields 0x0 for false, 0xFFFF for true.
3326	///
3327	/// \headerfile <x86intrin.h>
3328	///
3329	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3330	///
3331	/// \param __a
3332	/// A 128-bit integer vector.
3333	/// \param __b
3334	/// A 128-bit integer vector.
3335	/// \returns A 128-bit integer vector containing the comparison results.
3336	static __inline__ __m128i __DEFAULT_FN_ATTRS
3337	_mm_cmplt_epi16(__m128i __a, __m128i __b)
3338	{
3339	return _mm_cmpgt_epi16(__b, __a);
3340	}
3341
3342	/// Compares each of the corresponding signed 32-bit values of the
3343	/// 128-bit integer vectors to determine if the values in the first operand
3344	/// are less than those in the second operand.
3345	///
3346	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3347	///
3348	/// \headerfile <x86intrin.h>
3349	///
3350	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3351	///
3352	/// \param __a
3353	/// A 128-bit integer vector.
3354	/// \param __b
3355	/// A 128-bit integer vector.
3356	/// \returns A 128-bit integer vector containing the comparison results.
3357	static __inline__ __m128i __DEFAULT_FN_ATTRS
3358	_mm_cmplt_epi32(__m128i __a, __m128i __b)
3359	{
3360	return _mm_cmpgt_epi32(__b, __a);
3361	}
3362
3363	#ifdef __x86_64__
3364	/// Converts a 64-bit signed integer value from the second operand into a
3365	/// double-precision value and returns it in the lower element of a [2 x
3366	/// double] vector; the upper element of the returned vector is copied from
3367	/// the upper element of the first operand.
3368	///
3369	/// \headerfile <x86intrin.h>
3370	///
3371	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3372	///
3373	/// \param __a
3374	/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3375	/// copied to the upper 64 bits of the destination.
3376	/// \param __b
3377	/// A 64-bit signed integer operand containing the value to be converted.
3378	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3379	/// converted value of the second operand. The upper 64 bits are copied from
3380	/// the upper 64 bits of the first operand.
3381	static __inline__ __m128d __DEFAULT_FN_ATTRS
3382	_mm_cvtsi64_sd(__m128d __a, long long __b)
3383	{
3384	__a[0] = __b;
3385	return __a;
3386	}
3387
3388	/// Converts the first (lower) element of a vector of [2 x double] into a
3389	/// 64-bit signed integer value, according to the current rounding mode.
3390	///
3391	/// \headerfile <x86intrin.h>
3392	///
3393	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3394	///
3395	/// \param __a
3396	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3397	/// conversion.
3398	/// \returns A 64-bit signed integer containing the converted value.
3399	static __inline__ long long __DEFAULT_FN_ATTRS
3400	_mm_cvtsd_si64(__m128d __a)
3401	{
3402	return __builtin_ia32_cvtsd2si64((__v2df)__a);
3403	}
3404
3405	/// Converts the first (lower) element of a vector of [2 x double] into a
3406	/// 64-bit signed integer value, truncating the result when it is inexact.
3407	///
3408	/// \headerfile <x86intrin.h>
3409	///
3410	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3411	/// instruction.
3412	///
3413	/// \param __a
3414	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3415	/// conversion.
3416	/// \returns A 64-bit signed integer containing the converted value.
3417	static __inline__ long long __DEFAULT_FN_ATTRS
3418	_mm_cvttsd_si64(__m128d __a)
3419	{
3420	return __builtin_ia32_cvttsd2si64((__v2df)__a);
3421	}
3422	#endif
3423
3424	/// Converts a vector of [4 x i32] into a vector of [4 x float].
3425	///
3426	/// \headerfile <x86intrin.h>
3427	///
3428	/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3429	///
3430	/// \param __a
3431	/// A 128-bit integer vector.
3432	/// \returns A 128-bit vector of [4 x float] containing the converted values.
3433	static __inline__ __m128 __DEFAULT_FN_ATTRS
3434	_mm_cvtepi32_ps(__m128i __a)
3435	{
3436	return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3437	}
3438
3439	/// Converts a vector of [4 x float] into a vector of [4 x i32].
3440	///
3441	/// \headerfile <x86intrin.h>
3442	///
3443	/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3444	///
3445	/// \param __a
3446	/// A 128-bit vector of [4 x float].
3447	/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3448	/// values.
3449	static __inline__ __m128i __DEFAULT_FN_ATTRS
3450	_mm_cvtps_epi32(__m128 __a)
3451	{
3452	return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3453	}
3454
3455	/// Converts a vector of [4 x float] into a vector of [4 x i32],
3456	/// truncating the result when it is inexact.
3457	///
3458	/// \headerfile <x86intrin.h>
3459	///
3460	/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3461	/// instruction.
3462	///
3463	/// \param __a
3464	/// A 128-bit vector of [4 x float].
3465	/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3466	static __inline__ __m128i __DEFAULT_FN_ATTRS
3467	_mm_cvttps_epi32(__m128 __a)
3468	{
3469	return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3470	}
3471
3472	/// Returns a vector of [4 x i32] where the lowest element is the input
3473	/// operand and the remaining elements are zero.
3474	///
3475	/// \headerfile <x86intrin.h>
3476	///
3477	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3478	///
3479	/// \param __a
3480	/// A 32-bit signed integer operand.
3481	/// \returns A 128-bit vector of [4 x i32].
3482	static __inline__ __m128i __DEFAULT_FN_ATTRS
3483	_mm_cvtsi32_si128(int __a)
3484	{
3485	return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3486	}
3487
3488	#ifdef __x86_64__
3489	/// Returns a vector of [2 x i64] where the lower element is the input
3490	/// operand and the upper element is zero.
3491	///
3492	/// \headerfile <x86intrin.h>
3493	///
3494	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3495	///
3496	/// \param __a
3497	/// A 64-bit signed integer operand containing the value to be converted.
3498	/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3499	static __inline__ __m128i __DEFAULT_FN_ATTRS
3500	_mm_cvtsi64_si128(long long __a)
3501	{
3502	return __extension__ (__m128i)(__v2di){ __a, 0 };
3503	}
3504	#endif
3505
3506	/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3507	/// 32-bit signed integer value.
3508	///
3509	/// \headerfile <x86intrin.h>
3510	///
3511	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3512	///
3513	/// \param __a
3514	/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3515	/// destination.
3516	/// \returns A 32-bit signed integer containing the moved value.
3517	static __inline__ int __DEFAULT_FN_ATTRS
3518	_mm_cvtsi128_si32(__m128i __a)
3519	{
3520	__v4si __b = (__v4si)__a;
3521	return __b[0];
3522	}
3523
3524	#ifdef __x86_64__
3525	/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3526	/// 64-bit signed integer value.
3527	///
3528	/// \headerfile <x86intrin.h>
3529	///
3530	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3531	///
3532	/// \param __a
3533	/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3534	/// destination.
3535	/// \returns A 64-bit signed integer containing the moved value.
3536	static __inline__ long long __DEFAULT_FN_ATTRS
3537	_mm_cvtsi128_si64(__m128i __a)
3538	{
3539	return __a[0];
3540	}
3541	#endif
3542
3543	/// Moves packed integer values from an aligned 128-bit memory location
3544	/// to elements in a 128-bit integer vector.
3545	///
3546	/// \headerfile <x86intrin.h>
3547	///
3548	/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3549	///
3550	/// \param __p
3551	/// An aligned pointer to a memory location containing integer values.
3552	/// \returns A 128-bit integer vector containing the moved values.
3553	static __inline__ __m128i __DEFAULT_FN_ATTRS
3554	_mm_load_si128(__m128i const *__p)
3555	{
3556	return *__p;
3557	}
3558
3559	/// Moves packed integer values from an unaligned 128-bit memory location
3560	/// to elements in a 128-bit integer vector.
3561	///
3562	/// \headerfile <x86intrin.h>
3563	///
3564	/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3565	///
3566	/// \param __p
3567	/// A pointer to a memory location containing integer values.
3568	/// \returns A 128-bit integer vector containing the moved values.
3569	static __inline__ __m128i __DEFAULT_FN_ATTRS
3570	_mm_loadu_si128(__m128i_u const *__p)
3571	{
3572	struct __loadu_si128 {
3573	__m128i_u __v;
3574	} __attribute__((__packed__, __may_alias__));
3575	return ((struct __loadu_si128*)__p)->__v;
3576	}
3577
3578	/// Returns a vector of [2 x i64] where the lower element is taken from
3579	/// the lower element of the operand, and the upper element is zero.
3580	///
3581	/// \headerfile <x86intrin.h>
3582	///
3583	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3584	///
3585	/// \param __p
3586	/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3587	/// the destination.
3588	/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3589	/// moved value. The higher order bits are cleared.
3590	static __inline__ __m128i __DEFAULT_FN_ATTRS
3591	_mm_loadl_epi64(__m128i_u const *__p)
3592	{
3593	struct __mm_loadl_epi64_struct {
3594	long long __u;
3595	} __attribute__((__packed__, __may_alias__));
3596	return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3597	}
3598
3599	/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3600	/// This could be used as an argument to another intrinsic function where the
3601	/// argument is required but the value is not actually used.
3602	///
3603	/// \headerfile <x86intrin.h>
3604	///
3605	/// This intrinsic has no corresponding instruction.
3606	///
3607	/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3608	static __inline__ __m128i __DEFAULT_FN_ATTRS
3609	_mm_undefined_si128(void)
3610	{
3611	return (__m128i)__builtin_ia32_undef128();
3612	}
3613
3614	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3615	/// the specified 64-bit integer values.
3616	///
3617	/// \headerfile <x86intrin.h>
3618	///
3619	/// This intrinsic is a utility function and does not correspond to a specific
3620	/// instruction.
3621	///
3622	/// \param __q1
3623	/// A 64-bit integer value used to initialize the upper 64 bits of the
3624	/// destination vector of [2 x i64].
3625	/// \param __q0
3626	/// A 64-bit integer value used to initialize the lower 64 bits of the
3627	/// destination vector of [2 x i64].
3628	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3629	/// provided in the operands.
3630	static __inline__ __m128i __DEFAULT_FN_ATTRS
3631	_mm_set_epi64x(long long __q1, long long __q0)
3632	{
3633	return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3634	}
3635
3636	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3637	/// the specified 64-bit integer values.
3638	///
3639	/// \headerfile <x86intrin.h>
3640	///
3641	/// This intrinsic is a utility function and does not correspond to a specific
3642	/// instruction.
3643	///
3644	/// \param __q1
3645	/// A 64-bit integer value used to initialize the upper 64 bits of the
3646	/// destination vector of [2 x i64].
3647	/// \param __q0
3648	/// A 64-bit integer value used to initialize the lower 64 bits of the
3649	/// destination vector of [2 x i64].
3650	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3651	/// provided in the operands.
3652	static __inline__ __m128i __DEFAULT_FN_ATTRS
3653	_mm_set_epi64(__m64 __q1, __m64 __q0)
3654	{
3655	return _mm_set_epi64x((long long)__q1, (long long)__q0);
3656	}
3657
3658	/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3659	/// the specified 32-bit integer values.
3660	///
3661	/// \headerfile <x86intrin.h>
3662	///
3663	/// This intrinsic is a utility function and does not correspond to a specific
3664	/// instruction.
3665	///
3666	/// \param __i3
3667	/// A 32-bit integer value used to initialize bits [127:96] of the
3668	/// destination vector.
3669	/// \param __i2
3670	/// A 32-bit integer value used to initialize bits [95:64] of the destination
3671	/// vector.
3672	/// \param __i1
3673	/// A 32-bit integer value used to initialize bits [63:32] of the destination
3674	/// vector.
3675	/// \param __i0
3676	/// A 32-bit integer value used to initialize bits [31:0] of the destination
3677	/// vector.
3678	/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3679	/// provided in the operands.
3680	static __inline__ __m128i __DEFAULT_FN_ATTRS
3681	_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3682	{
3683	return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3684	}
3685
3686	/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3687	/// the specified 16-bit integer values.
3688	///
3689	/// \headerfile <x86intrin.h>
3690	///
3691	/// This intrinsic is a utility function and does not correspond to a specific
3692	/// instruction.
3693	///
3694	/// \param __w7
3695	/// A 16-bit integer value used to initialize bits [127:112] of the
3696	/// destination vector.
3697	/// \param __w6
3698	/// A 16-bit integer value used to initialize bits [111:96] of the
3699	/// destination vector.
3700	/// \param __w5
3701	/// A 16-bit integer value used to initialize bits [95:80] of the destination
3702	/// vector.
3703	/// \param __w4
3704	/// A 16-bit integer value used to initialize bits [79:64] of the destination
3705	/// vector.
3706	/// \param __w3
3707	/// A 16-bit integer value used to initialize bits [63:48] of the destination
3708	/// vector.
3709	/// \param __w2
3710	/// A 16-bit integer value used to initialize bits [47:32] of the destination
3711	/// vector.
3712	/// \param __w1
3713	/// A 16-bit integer value used to initialize bits [31:16] of the destination
3714	/// vector.
3715	/// \param __w0
3716	/// A 16-bit integer value used to initialize bits [15:0] of the destination
3717	/// vector.
3718	/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3719	/// provided in the operands.
3720	static __inline__ __m128i __DEFAULT_FN_ATTRS
3721	_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3722	{
3723	return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3724	}
3725
3726	/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3727	/// the specified 8-bit integer values.
3728	///
3729	/// \headerfile <x86intrin.h>
3730	///
3731	/// This intrinsic is a utility function and does not correspond to a specific
3732	/// instruction.
3733	///
3734	/// \param __b15
3735	/// Initializes bits [127:120] of the destination vector.
3736	/// \param __b14
3737	/// Initializes bits [119:112] of the destination vector.
3738	/// \param __b13
3739	/// Initializes bits [111:104] of the destination vector.
3740	/// \param __b12
3741	/// Initializes bits [103:96] of the destination vector.
3742	/// \param __b11
3743	/// Initializes bits [95:88] of the destination vector.
3744	/// \param __b10
3745	/// Initializes bits [87:80] of the destination vector.
3746	/// \param __b9
3747	/// Initializes bits [79:72] of the destination vector.
3748	/// \param __b8
3749	/// Initializes bits [71:64] of the destination vector.
3750	/// \param __b7
3751	/// Initializes bits [63:56] of the destination vector.
3752	/// \param __b6
3753	/// Initializes bits [55:48] of the destination vector.
3754	/// \param __b5
3755	/// Initializes bits [47:40] of the destination vector.
3756	/// \param __b4
3757	/// Initializes bits [39:32] of the destination vector.
3758	/// \param __b3
3759	/// Initializes bits [31:24] of the destination vector.
3760	/// \param __b2
3761	/// Initializes bits [23:16] of the destination vector.
3762	/// \param __b1
3763	/// Initializes bits [15:8] of the destination vector.
3764	/// \param __b0
3765	/// Initializes bits [7:0] of the destination vector.
3766	/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3767	/// provided in the operands.
3768	static __inline__ __m128i __DEFAULT_FN_ATTRS
3769	_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3770	{
3771	return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3772	}
3773
3774	/// Initializes both values in a 128-bit integer vector with the
3775	/// specified 64-bit integer value.
3776	///
3777	/// \headerfile <x86intrin.h>
3778	///
3779	/// This intrinsic is a utility function and does not correspond to a specific
3780	/// instruction.
3781	///
3782	/// \param __q
3783	/// Integer value used to initialize the elements of the destination integer
3784	/// vector.
3785	/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3786	/// elements containing the value provided in the operand.
3787	static __inline__ __m128i __DEFAULT_FN_ATTRS
3788	_mm_set1_epi64x(long long __q)
3789	{
3790	return _mm_set_epi64x(__q, __q);
3791	}
3792
3793	/// Initializes both values in a 128-bit vector of [2 x i64] with the
3794	/// specified 64-bit value.
3795	///
3796	/// \headerfile <x86intrin.h>
3797	///
3798	/// This intrinsic is a utility function and does not correspond to a specific
3799	/// instruction.
3800	///
3801	/// \param __q
3802	/// A 64-bit value used to initialize the elements of the destination integer
3803	/// vector.
3804	/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3805	/// containing the value provided in the operand.
3806	static __inline__ __m128i __DEFAULT_FN_ATTRS
3807	_mm_set1_epi64(__m64 __q)
3808	{
3809	return _mm_set_epi64(__q, __q);
3810	}
3811
3812	/// Initializes all values in a 128-bit vector of [4 x i32] with the
3813	/// specified 32-bit value.
3814	///
3815	/// \headerfile <x86intrin.h>
3816	///
3817	/// This intrinsic is a utility function and does not correspond to a specific
3818	/// instruction.
3819	///
3820	/// \param __i
3821	/// A 32-bit value used to initialize the elements of the destination integer
3822	/// vector.
3823	/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3824	/// containing the value provided in the operand.
3825	static __inline__ __m128i __DEFAULT_FN_ATTRS
3826	_mm_set1_epi32(int __i)
3827	{
3828	return _mm_set_epi32(__i, __i, __i, __i);
3829	}
3830
3831	/// Initializes all values in a 128-bit vector of [8 x i16] with the
3832	/// specified 16-bit value.
3833	///
3834	/// \headerfile <x86intrin.h>
3835	///
3836	/// This intrinsic is a utility function and does not correspond to a specific
3837	/// instruction.
3838	///
3839	/// \param __w
3840	/// A 16-bit value used to initialize the elements of the destination integer
3841	/// vector.
3842	/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3843	/// containing the value provided in the operand.
3844	static __inline__ __m128i __DEFAULT_FN_ATTRS
3845	_mm_set1_epi16(short __w)
3846	{
3847	return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3848	}
3849
3850	/// Initializes all values in a 128-bit vector of [16 x i8] with the
3851	/// specified 8-bit value.
3852	///
3853	/// \headerfile <x86intrin.h>
3854	///
3855	/// This intrinsic is a utility function and does not correspond to a specific
3856	/// instruction.
3857	///
3858	/// \param __b
3859	/// An 8-bit value used to initialize the elements of the destination integer
3860	/// vector.
3861	/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3862	/// containing the value provided in the operand.
3863	static __inline__ __m128i __DEFAULT_FN_ATTRS
3864	_mm_set1_epi8(char __b)
3865	{
3866	return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3867	}
3868
3869	/// Constructs a 128-bit integer vector, initialized in reverse order
3870	/// with the specified 64-bit integral values.
3871	///
3872	/// \headerfile <x86intrin.h>
3873	///
3874	/// This intrinsic does not correspond to a specific instruction.
3875	///
3876	/// \param __q0
3877	/// A 64-bit integral value used to initialize the lower 64 bits of the
3878	/// result.
3879	/// \param __q1
3880	/// A 64-bit integral value used to initialize the upper 64 bits of the
3881	/// result.
3882	/// \returns An initialized 128-bit integer vector.
3883	static __inline__ __m128i __DEFAULT_FN_ATTRS
3884	_mm_setr_epi64(__m64 __q0, __m64 __q1)
3885	{
3886	return _mm_set_epi64(__q1, __q0);
3887	}
3888
3889	/// Constructs a 128-bit integer vector, initialized in reverse order
3890	/// with the specified 32-bit integral values.
3891	///
3892	/// \headerfile <x86intrin.h>
3893	///
3894	/// This intrinsic is a utility function and does not correspond to a specific
3895	/// instruction.
3896	///
3897	/// \param __i0
3898	/// A 32-bit integral value used to initialize bits [31:0] of the result.
3899	/// \param __i1
3900	/// A 32-bit integral value used to initialize bits [63:32] of the result.
3901	/// \param __i2
3902	/// A 32-bit integral value used to initialize bits [95:64] of the result.
3903	/// \param __i3
3904	/// A 32-bit integral value used to initialize bits [127:96] of the result.
3905	/// \returns An initialized 128-bit integer vector.
3906	static __inline__ __m128i __DEFAULT_FN_ATTRS
3907	_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3908	{
3909	return _mm_set_epi32(__i3, __i2, __i1, __i0);
3910	}
3911
3912	/// Builds a 128-bit integer vector of [8 x i16] from eight 16-bit values
3913	/// listed from the least significant element to the most significant one.
3914	///
3915	/// \headerfile <x86intrin.h>
3916	///
3917	/// This is a utility intrinsic; it does not map onto one particular
3918	/// instruction.
3919	///
3920	/// \param __w0
3921	/// The 16-bit value placed in bits [15:0] of the result.
3922	/// \param __w1
3923	/// The 16-bit value placed in bits [31:16] of the result.
3924	/// \param __w2
3925	/// The 16-bit value placed in bits [47:32] of the result.
3926	/// \param __w3
3927	/// The 16-bit value placed in bits [63:48] of the result.
3928	/// \param __w4
3929	/// The 16-bit value placed in bits [79:64] of the result.
3930	/// \param __w5
3931	/// The 16-bit value placed in bits [95:80] of the result.
3932	/// \param __w6
3933	/// The 16-bit value placed in bits [111:96] of the result.
3934	/// \param __w7
3935	/// The 16-bit value placed in bits [127:112] of the result.
3936	/// \returns The 128-bit integer vector assembled from the eight values.
3937	static __inline__ __m128i __DEFAULT_FN_ATTRS
3938	_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3939	{
3940	return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3941	}
3942
3943	/// Builds a 128-bit integer vector of [16 x i8] from sixteen 8-bit values
3944	/// listed from the least significant element to the most significant one.
3945	///
3946	/// \headerfile <x86intrin.h>
3947	///
3948	/// This is a utility intrinsic; it does not map onto one particular
3949	/// instruction.
3950	///
3951	/// \param __b0
3952	/// The 8-bit value placed in bits [7:0] of the result.
3953	/// \param __b1
3954	/// The 8-bit value placed in bits [15:8] of the result.
3955	/// \param __b2
3956	/// The 8-bit value placed in bits [23:16] of the result.
3957	/// \param __b3
3958	/// The 8-bit value placed in bits [31:24] of the result.
3959	/// \param __b4
3960	/// The 8-bit value placed in bits [39:32] of the result.
3961	/// \param __b5
3962	/// The 8-bit value placed in bits [47:40] of the result.
3963	/// \param __b6
3964	/// The 8-bit value placed in bits [55:48] of the result.
3965	/// \param __b7
3966	/// The 8-bit value placed in bits [63:56] of the result.
3967	/// \param __b8
3968	/// The 8-bit value placed in bits [71:64] of the result.
3969	/// \param __b9
3970	/// The 8-bit value placed in bits [79:72] of the result.
3971	/// \param __b10
3972	/// The 8-bit value placed in bits [87:80] of the result.
3973	/// \param __b11
3974	/// The 8-bit value placed in bits [95:88] of the result.
3975	/// \param __b12
3976	/// The 8-bit value placed in bits [103:96] of the result.
3977	/// \param __b13
3978	/// The 8-bit value placed in bits [111:104] of the result.
3979	/// \param __b14
3980	/// The 8-bit value placed in bits [119:112] of the result.
3981	/// \param __b15
3982	/// The 8-bit value placed in bits [127:120] of the result.
3983	/// \returns The 128-bit integer vector assembled from the sixteen values.
3984	static __inline__ __m128i __DEFAULT_FN_ATTRS
3985	_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3986	{
3987	return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3988	}
3989
3990	/// Returns a 128-bit integer vector in which every element is zero.
3991	///
3992	/// \headerfile <x86intrin.h>
3993	///
3994	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3995	///
3996	/// \returns A 128-bit integer vector whose elements are all initialized to
3997	/// zero.
3998	static __inline__ __m128i __DEFAULT_FN_ATTRS
3999	_mm_setzero_si128(void)
4000	{
4001	return __extension__ (__m128i){ 0LL, 0LL };
4002	}
4003
4004	/// Writes a 128-bit integer vector to memory. The destination must be
4005	/// aligned on a 128-bit boundary.
4006	///
4007	/// \headerfile <x86intrin.h>
4008	///
4009	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
4010	///
4011	/// \param __p
4012	/// A pointer to the aligned memory location that receives the integer
4013	/// values.
4014	/// \param __b
4015	/// The 128-bit integer vector whose contents are written to memory.
4016	static __inline__ void __DEFAULT_FN_ATTRS
4017	_mm_store_si128(__m128i *__p, __m128i __b)
4018	{
4019	__p[0] = __b;
4020	}
4021
4022	/// Writes a 128-bit integer vector to a memory location of any alignment.
4023	///
4024	/// \headerfile <x86intrin.h>
4025	///
4026	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
4027	///
4028	/// \param __p
4029	/// A pointer to the (possibly unaligned) memory location that receives the integer values.
4030	/// \param __b
4031	/// The 128-bit integer vector whose contents are written to memory.
4032	static __inline__ void __DEFAULT_FN_ATTRS
4033	_mm_storeu_si128(__m128i_u *__p, __m128i __b)
4034	{
4035	struct __unaligned_si128 {
4036	__m128i_u __val;
4037	} __attribute__((__packed__, __may_alias__));
4038	(*(struct __unaligned_si128 *)__p).__val = __b;
4039	}
4040
4041	/// Writes the low 64-bit element of a 128-bit integer vector to a memory
4042	/// location of any alignment.
4043	///
4044	/// \headerfile <x86intrin.h>
4045	///
4046	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4047	///
4048	/// \param __p
4049	/// A pointer to the 64-bit memory location that receives the value. The
4050	/// address does not have to be aligned. Although the parameter is declared pointer-to-const, the function writes through it.
4051	/// \param __b
4052	/// The 128-bit integer vector whose bits [63:0] are stored.
4053	static __inline__ void __DEFAULT_FN_ATTRS
4054	_mm_storeu_si64(void const *__p, __m128i __b)
4055	{
4056	struct __unaligned_i64 {
4057	long long __val;
4058	} __attribute__((__packed__, __may_alias__));
4059	(*(struct __unaligned_i64 *)__p).__val = ((__v2di)__b)[0];
4060	}
4061
4062	/// Writes the low 32-bit element of a 128-bit integer vector to a memory
4063	/// location of any alignment.
4064	///
4065	/// \headerfile <x86intrin.h>
4066	///
4067	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
4068	///
4069	/// \param __p
4070	/// A pointer to the 32-bit memory location that receives the value. The
4071	/// address does not have to be aligned. Although the parameter is declared pointer-to-const, the function writes through it.
4072	/// \param __b
4073	/// The 128-bit integer vector whose bits [31:0] are stored.
4074	static __inline__ void __DEFAULT_FN_ATTRS
4075	_mm_storeu_si32(void const *__p, __m128i __b)
4076	{
4077	struct __unaligned_i32 {
4078	int __val;
4079	} __attribute__((__packed__, __may_alias__));
4080	(*(struct __unaligned_i32 *)__p).__val = ((__v4si)__b)[0];
4081	}
4082
4083	/// Writes the low 16-bit element of a 128-bit integer vector to a memory
4084	/// location of any alignment.
4085	///
4086	/// \headerfile <x86intrin.h>
4087	///
4088	/// This intrinsic does not correspond to a specific instruction.
4089	///
4090	/// \param __p
4091	/// A pointer to the 16-bit memory location that receives the value. The
4092	/// address does not have to be aligned. Although the parameter is declared pointer-to-const, the function writes through it.
4093	/// \param __b
4094	/// The 128-bit integer vector whose bits [15:0] are stored.
4095	static __inline__ void __DEFAULT_FN_ATTRS
4096	_mm_storeu_si16(void const *__p, __m128i __b)
4097	{
4098	struct __unaligned_i16 {
4099	short __val;
4100	} __attribute__((__packed__, __may_alias__));
4101	(*(struct __unaligned_i16 *)__p).__val = ((__v8hi)__b)[0];
4102	}
4103
4104	/// Moves the bytes selected by the mask from the first operand (\a __d) to
4105	/// the specified unaligned memory location. When a mask bit is 1, the
4106	/// corresponding byte is written; otherwise it is not written.
4107	///
4108	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4109	/// used again soon). Exception and trap behavior for elements not selected
4110	/// for storage to memory is implementation dependent.
4111	///
4112	/// \headerfile <x86intrin.h>
4113	///
4114	/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4115	/// instruction.
4116	///
4117	/// \param __d
4118	/// A 128-bit integer vector of [16 x i8] containing the values to be moved.
4119	/// \param __n
4120	/// A 128-bit integer vector containing the mask. The most significant bit of
4121	/// each byte is the mask bit for the corresponding byte of \a __d.
4122	/// \param __p
4123	/// A pointer to an unaligned 128-bit memory location where the selected
4124	/// bytes are stored.
4125	static __inline__ void __DEFAULT_FN_ATTRS
4126	_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4127	{
4128	__builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4129	}
4130
4131	/// Writes the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4132	/// memory.
4133	///
4134	/// \headerfile <x86intrin.h>
4135	///
4136	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4137	///
4138	/// \param __p
4139	/// A pointer to the 64-bit memory location that receives the lower 64 bits
4140	/// of \a __a.
4141	/// \param __a
4142	/// A 128-bit integer vector of [2 x i64] whose lower 64 bits hold the
4143	/// value to be stored.
4144	static __inline__ void __DEFAULT_FN_ATTRS
4145	_mm_storel_epi64(__m128i_u *__p, __m128i __a)
4146	{
4147	struct __unaligned_lo64 {
4148	long long __val;
4149	} __attribute__((__packed__, __may_alias__));
4150	(*(struct __unaligned_lo64 *)__p).__val = ((__v2di)__a)[0];
4151	}
4152
4153	/// Stores a 128-bit floating-point vector of [2 x double] to a 128-bit
4154	/// aligned memory location.
4155	///
4156	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4157	/// used again soon).
4158	///
4159	/// \headerfile <x86intrin.h>
4160	///
4161	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4162	///
4163	/// \param __p
4164	/// A pointer to the 128-bit aligned memory location that receives \a __a; it is accessed as a whole [2 x double] vector.
4165	/// \param __a
4166	/// A 128-bit vector of [2 x double] containing the two 64-bit values to be stored.
4167	static __inline__ void __DEFAULT_FN_ATTRS
4168	_mm_stream_pd(double *__p, __m128d __a)
4169	{
4170	__builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
4171	}
4172
4173	/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4174	///
4175	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4176	/// used again soon).
4177	///
4178	/// \headerfile <x86intrin.h>
4179	///
4180	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4181	///
4182	/// \param __p
4183	/// A pointer to the 128-bit aligned memory location that receives \a __a.
4184	/// \param __a
4185	/// A 128-bit integer vector containing the values to be stored.
4186	static __inline__ void __DEFAULT_FN_ATTRS
4187	_mm_stream_si128(__m128i *__p, __m128i __a)
4188	{
4189	__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
4190	}
4191
4192	/// Stores a 32-bit integer value in the specified memory location.
4193	///
4194	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4195	/// used again soon).
4196	///
4197	/// \headerfile <x86intrin.h>
4198	///
4199	/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4200	///
4201	/// \param __p
4202	/// A pointer to the 32-bit memory location that receives \a __a.
4203	/// \param __a
4204	/// The 32-bit integer value to be stored.
4205	static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4206	_mm_stream_si32(int *__p, int __a)
4207	{
4208	__builtin_ia32_movnti(__p, __a);
4209	}
4210
4211	#ifdef __x86_64__
4212	/// Stores a 64-bit integer value in the specified memory location. This intrinsic is only defined when __x86_64__ is defined.
4213	///
4214	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4215	/// used again soon).
4216	///
4217	/// \headerfile <x86intrin.h>
4218	///
4219	/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4220	///
4221	/// \param __p
4222	/// A pointer to the 64-bit memory location that receives \a __a.
4223	/// \param __a
4224	/// The 64-bit integer value to be stored.
4225	static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4226	_mm_stream_si64(long long *__p, long long __a)
4227	{
4228	__builtin_ia32_movnti64(__p, __a);
4229	}
4230	#endif
4231
4232	#if defined(__cplusplus)
4233	extern "C" {
4234	#endif
4235
4236	/// Flushes the cache line containing \a __p and invalidates it in all
4237	/// caches in the coherency domain.
4238	///
4239	/// \headerfile <x86intrin.h>
4240	///
4241	/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4242	///
4243	/// \param __p
4244	/// A pointer to the memory location used to identify the cache line to be
4245	/// flushed.
4246	void _mm_clflush(void const * __p);
4247
4248	/// Forces strong memory ordering (serialization) between the load
4249	/// instructions preceding this instruction and the load instructions following
4250	/// this instruction, ensuring the system completes all previous loads before
4251	/// executing subsequent loads.
4252	///
4253	/// \headerfile <x86intrin.h>
4254	///
4255	/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4256	///
4257	void _mm_lfence(void);
4258
4259	/// Forces strong memory ordering (serialization) between the load and store
4260	/// instructions preceding this instruction and the load and store instructions
4261	/// following this instruction, ensuring that the system completes all
4262	/// previous memory accesses before executing subsequent memory accesses.
4263	///
4264	/// \headerfile <x86intrin.h>
4265	///
4266	/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4267	///
4268	void _mm_mfence(void);
4269
4270	#if defined(__cplusplus)
4271	} // extern "C"
4272	#endif
4273
4274	/// Converts 16-bit signed integers from both 128-bit integer vector
4275	/// operands into 8-bit signed integers, and packs the results into the
4276	/// destination. Positive values greater than 0x7F are saturated to 0x7F.
4277	/// Negative values less than 0x80 (that is, below -128) are saturated to 0x80.
4278	///
4279	/// \headerfile <x86intrin.h>
4280	///
4281	/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4282	///
4283	/// \param __a
4284	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4285	/// a signed integer and is converted to an 8-bit signed integer with
4286	/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4287	/// than 0x80 (-128) are saturated to 0x80. The converted [8 x i8] values are
4288	/// written to the lower 64 bits of the result.
4289	/// \param __b
4290	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4291	/// a signed integer and is converted to an 8-bit signed integer with
4292	/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4293	/// than 0x80 (-128) are saturated to 0x80. The converted [8 x i8] values are
4294	/// written to the upper 64 bits of the result.
4295	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4296	static __inline__ __m128i __DEFAULT_FN_ATTRS
4297	_mm_packs_epi16(__m128i __a, __m128i __b)
4298	{
4299	return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4300	}
4301
4302	/// Converts 32-bit signed integers from both 128-bit integer vector
4303	/// operands into 16-bit signed integers, and packs the results into the
4304	/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4305	/// Negative values less than 0x8000 (that is, below -32768) are saturated to 0x8000.
4306	///
4307	/// \headerfile <x86intrin.h>
4308	///
4309	/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4310	///
4311	/// \param __a
4312	/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4313	/// a signed integer and is converted to a 16-bit signed integer with
4314	/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4315	/// less than 0x8000 (-32768) are saturated to 0x8000. The converted [4 x i16] values
4316	/// are written to the lower 64 bits of the result.
4317	/// \param __b
4318	/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4319	/// a signed integer and is converted to a 16-bit signed integer with
4320	/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4321	/// less than 0x8000 (-32768) are saturated to 0x8000. The converted [4 x i16] values
4322	/// are written to the upper 64 bits of the result.
4323	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4324	static __inline__ __m128i __DEFAULT_FN_ATTRS
4325	_mm_packs_epi32(__m128i __a, __m128i __b)
4326	{
4327	return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4328	}
4329
4330	/// Converts 16-bit signed integers from both 128-bit integer vector
4331	/// operands into 8-bit unsigned integers, and packs the results into the
4332	/// destination. Values greater than 0xFF are saturated to 0xFF. Negative
4333	/// values are saturated to 0x00.
4334	///
4335	/// \headerfile <x86intrin.h>
4336	///
4337	/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4338	///
4339	/// \param __a
4340	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4341	/// a signed integer and is converted to an 8-bit unsigned integer with
4342	/// saturation. Values greater than 0xFF are saturated to 0xFF. Negative
4343	/// values are saturated to 0x00. The converted [8 x i8] values are
4344	/// written to the lower 64 bits of the result.
4345	/// \param __b
4346	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4347	/// a signed integer and is converted to an 8-bit unsigned integer with
4348	/// saturation. Values greater than 0xFF are saturated to 0xFF. Negative
4349	/// values are saturated to 0x00. The converted [8 x i8] values are
4350	/// written to the upper 64 bits of the result.
4351	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4352	static __inline__ __m128i __DEFAULT_FN_ATTRS
4353	_mm_packus_epi16(__m128i __a, __m128i __b)
4354	{
4355	return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4356	}
4357
4358	/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4359	/// the immediate-value parameter as a selector.
4360	///
4361	/// \headerfile <x86intrin.h>
4362	///
4363	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4364	///
4365	/// \param a
4366	/// A 128-bit integer vector of [8 x i16].
4367	/// \param imm
4368	/// An immediate value. Bits [2:0] select the element of \a a that is assigned
4369	/// to bits [15:0] of the result. \n
4370	/// 000: assign values from bits [15:0] of \a a. \n
4371	/// 001: assign values from bits [31:16] of \a a. \n
4372	/// 010: assign values from bits [47:32] of \a a. \n
4373	/// 011: assign values from bits [63:48] of \a a. \n
4374	/// 100: assign values from bits [79:64] of \a a. \n
4375	/// 101: assign values from bits [95:80] of \a a. \n
4376	/// 110: assign values from bits [111:96] of \a a. \n
4377	/// 111: assign values from bits [127:112] of \a a.
4378	/// \returns An integer whose lower 16 bits are selected from the 128-bit
4379	/// integer vector parameter and whose remaining bits are assigned zeros.
4380	#define _mm_extract_epi16(a, imm) \
4381	((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4382	(int)(imm)))
4383
4384	/// Constructs a 128-bit integer vector by first making a copy of the
4385	/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4386	/// of an integer parameter into the element selected by the immediate-value
4387	/// parameter.
4388	///
4389	/// \headerfile <x86intrin.h>
4390	///
4391	/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4392	///
4393	/// \param a
4394	/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4395	/// result and then one of the eight elements in the result is replaced by
4396	/// the lower 16 bits of \a b.
4397	/// \param b
4398	/// An integer. The lower 16 bits of this parameter are written to the
4399	/// result at the element selected by \a imm.
4400	/// \param imm
4401	/// An immediate value selecting which of the eight 16-bit elements of the
4402	/// result receives the lower 16 bits of \a b.
4403	/// \returns A 128-bit integer vector containing the constructed values.
4404	#define _mm_insert_epi16(a, b, imm) \
4405	((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4406	(int)(imm)))
4407
4408	/// Copies the most significant bit of each 8-bit
4409	/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4410	/// value, zero-extends the value, and writes it to the destination.
4411	///
4412	/// \headerfile <x86intrin.h>
4413	///
4414	/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4415	///
4416	/// \param __a
4417	/// A 128-bit integer vector of [16 x i8] containing the values with bits to be extracted.
4418	/// \returns The most significant bits from each 8-bit element in \a __a,
4419	/// written to bits [15:0]. The other bits of the returned integer are assigned zeros.
4420	static __inline__ int __DEFAULT_FN_ATTRS
4421	_mm_movemask_epi8(__m128i __a)
4422	{
4423	return __builtin_ia32_pmovmskb128((__v16qi)__a);
4424	}
4425
4426	/// Constructs a 128-bit integer vector by shuffling four 32-bit
4427	/// elements of a 128-bit integer vector parameter, using the immediate-value
4428	/// parameter as a specifier.
4429	///
4430	/// \headerfile <x86intrin.h>
4431	///
4432	/// \code
4433	/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4434	/// \endcode
4435	///
4436	/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4437	///
4438	/// \param a
4439	/// A 128-bit integer vector containing the values to be copied.
4440	/// \param imm
4441	/// An immediate value containing an 8-bit value specifying which elements to
4442	/// copy from \a a. The destinations within the 128-bit destination are assigned
4443	/// values as follows: \n
4444	/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4445	/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4446	/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4447	/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4448	/// Bit value assignments: \n
4449	/// 00: assign values from bits [31:0] of \a a. \n
4450	/// 01: assign values from bits [63:32] of \a a. \n
4451	/// 10: assign values from bits [95:64] of \a a. \n
4452	/// 11: assign values from bits [127:96] of \a a.
4453	/// \returns A 128-bit integer vector containing the shuffled values.
4454	#define _mm_shuffle_epi32(a, imm) \
4455	((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4456
4457	/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4458	/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4459	/// value parameter as a specifier.
4460	///
4461	/// \headerfile <x86intrin.h>
4462	///
4463	/// \code
4464	/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4465	/// \endcode
4466	///
4467	/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4468	///
4469	/// \param a
4470	/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4471	/// [127:64] of the result.
4472	/// \param imm
4473	/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4474	/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4475	/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4476	/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4477	/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4478	/// Bit value assignments: \n
4479	/// 00: assign values from bits [15:0] of \a a. \n
4480	/// 01: assign values from bits [31:16] of \a a. \n
4481	/// 10: assign values from bits [47:32] of \a a. \n
4482	/// 11: assign values from bits [63:48] of \a a.
4483	/// \returns A 128-bit integer vector containing the shuffled values.
4484	#define _mm_shufflelo_epi16(a, imm) \
4485	((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4486
4487	/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4488	/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4489	/// value parameter as a specifier.
4490	///
4491	/// \headerfile <x86intrin.h>
4492	///
4493	/// \code
4494	/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4495	/// \endcode
4496	///
4497	/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4498	///
4499	/// \param a
4500	/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4501	/// [63:0] of the result.
4502	/// \param imm
4503	/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4504	/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4505	/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4506	/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4507	/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4508	/// Bit value assignments: \n
4509	/// 00: assign values from bits [79:64] of \a a. \n
4510	/// 01: assign values from bits [95:80] of \a a. \n
4511	/// 10: assign values from bits [111:96] of \a a. \n
4512	/// 11: assign values from bits [127:112] of \a a.
4513	/// \returns A 128-bit integer vector containing the shuffled values.
4514	#define _mm_shufflehi_epi16(a, imm) \
4515	((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4516
4517	/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4518	/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8], taking the element from \a __a first in each pair.
4519	///
4520	/// \headerfile <x86intrin.h>
4521	///
4522	/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4523	/// instruction.
4524	///
4525	/// \param __a
4526	/// A 128-bit vector of [16 x i8]. \n
4527	/// Bits [71:64] are written to bits [7:0] of the result. \n
4528	/// Bits [79:72] are written to bits [23:16] of the result. \n
4529	/// Bits [87:80] are written to bits [39:32] of the result. \n
4530	/// Bits [95:88] are written to bits [55:48] of the result. \n
4531	/// Bits [103:96] are written to bits [71:64] of the result. \n
4532	/// Bits [111:104] are written to bits [87:80] of the result. \n
4533	/// Bits [119:112] are written to bits [103:96] of the result. \n
4534	/// Bits [127:120] are written to bits [119:112] of the result.
4535	/// \param __b
4536	/// A 128-bit vector of [16 x i8]. \n
4537	/// Bits [71:64] are written to bits [15:8] of the result. \n
4538	/// Bits [79:72] are written to bits [31:24] of the result. \n
4539	/// Bits [87:80] are written to bits [47:40] of the result. \n
4540	/// Bits [95:88] are written to bits [63:56] of the result. \n
4541	/// Bits [103:96] are written to bits [79:72] of the result. \n
4542	/// Bits [111:104] are written to bits [95:88] of the result. \n
4543	/// Bits [119:112] are written to bits [111:104] of the result. \n
4544	/// Bits [127:120] are written to bits [127:120] of the result.
4545	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4546	static __inline__ __m128i __DEFAULT_FN_ATTRS
4547	_mm_unpackhi_epi8(__m128i __a, __m128i __b)
4548	{
4549	return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4550	}
4551
4552	/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4553	/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16], taking the element from \a __a first in each pair.
4554	///
4555	/// \headerfile <x86intrin.h>
4556	///
4557	/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4558	/// instruction.
4559	///
4560	/// \param __a
4561	/// A 128-bit vector of [8 x i16]. \n
4562	/// Bits [79:64] are written to bits [15:0] of the result. \n
4563	/// Bits [95:80] are written to bits [47:32] of the result. \n
4564	/// Bits [111:96] are written to bits [79:64] of the result. \n
4565	/// Bits [127:112] are written to bits [111:96] of the result.
4566	/// \param __b
4567	/// A 128-bit vector of [8 x i16]. \n
4568	/// Bits [79:64] are written to bits [31:16] of the result. \n
4569	/// Bits [95:80] are written to bits [63:48] of the result. \n
4570	/// Bits [111:96] are written to bits [95:80] of the result. \n
4571	/// Bits [127:112] are written to bits [127:112] of the result.
4572	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4573	static __inline__ __m128i __DEFAULT_FN_ATTRS
4574	_mm_unpackhi_epi16(__m128i __a, __m128i __b)
4575	{
4576	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4577	}
4578
4579	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4580	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32], taking the element from \a __a first in each pair.
4581	///
4582	/// \headerfile <x86intrin.h>
4583	///
4584	/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4585	/// instruction.
4586	///
4587	/// \param __a
4588	/// A 128-bit vector of [4 x i32]. \n
4589	/// Bits [95:64] are written to bits [31:0] of the destination. \n
4590	/// Bits [127:96] are written to bits [95:64] of the destination.
4591	/// \param __b
4592	/// A 128-bit vector of [4 x i32]. \n
4593	/// Bits [95:64] are written to bits [63:32] of the destination. \n
4594	/// Bits [127:96] are written to bits [127:96] of the destination.
4595	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4596	static __inline__ __m128i __DEFAULT_FN_ATTRS
4597	_mm_unpackhi_epi32(__m128i __a, __m128i __b)
4598	{
4599	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4600	}
4601
4602	/// Unpacks the high-order (index 1) 64-bit elements from two 128-bit vectors of
4603	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64], with the element from \a __a in the low half.
4604	///
4605	/// \headerfile <x86intrin.h>
4606	///
4607	/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4608	/// instruction.
4609	///
4610	/// \param __a
4611	/// A 128-bit vector of [2 x i64]. \n
4612	/// Bits [127:64] are written to bits [63:0] of the destination.
4613	/// \param __b
4614	/// A 128-bit vector of [2 x i64]. \n
4615	/// Bits [127:64] are written to bits [127:64] of the destination.
4616	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4617	static __inline__ __m128i __DEFAULT_FN_ATTRS
4618	_mm_unpackhi_epi64(__m128i __a, __m128i __b)
4619	{
4620	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4621	}
4622
4623	/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4624	/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8], taking the element from \a __a first in each pair.
4625	///
4626	/// \headerfile <x86intrin.h>
4627	///
4628	/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4629	/// instruction.
4630	///
4631	/// \param __a
4632	/// A 128-bit vector of [16 x i8]. \n
4633	/// Bits [7:0] are written to bits [7:0] of the result. \n
4634	/// Bits [15:8] are written to bits [23:16] of the result. \n
4635	/// Bits [23:16] are written to bits [39:32] of the result. \n
4636	/// Bits [31:24] are written to bits [55:48] of the result. \n
4637	/// Bits [39:32] are written to bits [71:64] of the result. \n
4638	/// Bits [47:40] are written to bits [87:80] of the result. \n
4639	/// Bits [55:48] are written to bits [103:96] of the result. \n
4640	/// Bits [63:56] are written to bits [119:112] of the result.
4641	/// \param __b
4642	/// A 128-bit vector of [16 x i8]. \n
4643	/// Bits [7:0] are written to bits [15:8] of the result. \n
4644	/// Bits [15:8] are written to bits [31:24] of the result. \n
4645	/// Bits [23:16] are written to bits [47:40] of the result. \n
4646	/// Bits [31:24] are written to bits [63:56] of the result. \n
4647	/// Bits [39:32] are written to bits [79:72] of the result. \n
4648	/// Bits [47:40] are written to bits [95:88] of the result. \n
4649	/// Bits [55:48] are written to bits [111:104] of the result. \n
4650	/// Bits [63:56] are written to bits [127:120] of the result.
4651	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4652	static __inline__ __m128i __DEFAULT_FN_ATTRS
4653	_mm_unpacklo_epi8(__m128i __a, __m128i __b)
4654	{
4655	return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4656	}
4657
4658	/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4659	/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4660	/// [8 x i16], taking the element from \a __a first in each pair.
4661	///
4662	/// \headerfile <x86intrin.h>
4663	///
4664	/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4665	/// instruction.
4666	///
4667	/// \param __a
4668	/// A 128-bit vector of [8 x i16]. \n
4669	/// Bits [15:0] are written to bits [15:0] of the result. \n
4670	/// Bits [31:16] are written to bits [47:32] of the result. \n
4671	/// Bits [47:32] are written to bits [79:64] of the result. \n
4672	/// Bits [63:48] are written to bits [111:96] of the result.
4673	/// \param __b
4674	/// A 128-bit vector of [8 x i16]. \n
4675	/// Bits [15:0] are written to bits [31:16] of the result. \n
4676	/// Bits [31:16] are written to bits [63:48] of the result. \n
4677	/// Bits [47:32] are written to bits [95:80] of the result. \n
4678	/// Bits [63:48] are written to bits [127:112] of the result.
4679	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4680	static __inline__ __m128i __DEFAULT_FN_ATTRS
4681	_mm_unpacklo_epi16(__m128i __a, __m128i __b)
4682	{
4683	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4684	}
4685
4686	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4687	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4688	///
4689	/// \headerfile <x86intrin.h>
4690	///
4691	/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4692	/// instruction.
4693	///
4694	/// \param __a
4695	/// A 128-bit vector of [4 x i32]. \n
4696	/// Bits [31:0] are written to bits [31:0] of the destination. \n
4697	/// Bits [63:32] are written to bits [95:64] of the destination.
4698	/// \param __b
4699	/// A 128-bit vector of [4 x i32]. \n
4700	/// Bits [31:0] are written to bits [64:32] of the destination. \n
4701	/// Bits [63:32] are written to bits [127:96] of the destination.
4702	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4703	static __inline__ __m128i __DEFAULT_FN_ATTRS
4704	_mm_unpacklo_epi32(__m128i __a, __m128i __b)
4705	{
4706	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4707	}
4708
4709	/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4710	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4711	///
4712	/// \headerfile <x86intrin.h>
4713	///
4714	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4715	/// instruction.
4716	///
4717	/// \param __a
4718	/// A 128-bit vector of [2 x i64]. \n
4719	/// Bits [63:0] are written to bits [63:0] of the destination. \n
4720	/// \param __b
4721	/// A 128-bit vector of [2 x i64]. \n
4722	/// Bits [63:0] are written to bits [127:64] of the destination. \n
4723	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4724	static __inline__ __m128i __DEFAULT_FN_ATTRS
4725	_mm_unpacklo_epi64(__m128i __a, __m128i __b)
4726	{
4727	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4728	}
4729
4730	/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4731	/// integer.
4732	///
4733	/// \headerfile <x86intrin.h>
4734	///
4735	/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4736	///
4737	/// \param __a
4738	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4739	/// destination.
4740	/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4741	static __inline__ __m64 __DEFAULT_FN_ATTRS
4742	_mm_movepi64_pi64(__m128i __a)
4743	{
4744	return (__m64)__a[0];
4745	}
4746
4747	/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4748	/// upper bits.
4749	///
4750	/// \headerfile <x86intrin.h>
4751	///
4752	/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4753	///
4754	/// \param __a
4755	/// A 64-bit value.
4756	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4757	/// the operand. The upper 64 bits are assigned zeros.
4758	static __inline__ __m128i __DEFAULT_FN_ATTRS
4759	_mm_movpi64_epi64(__m64 __a)
4760	{
4761	return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4762	}
4763
4764	/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4765	/// integer vector, zeroing the upper bits.
4766	///
4767	/// \headerfile <x86intrin.h>
4768	///
4769	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4770	///
4771	/// \param __a
4772	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4773	/// destination.
4774	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4775	/// the operand. The upper 64 bits are assigned zeros.
4776	static __inline__ __m128i __DEFAULT_FN_ATTRS
4777	_mm_move_epi64(__m128i __a)
4778	{
4779	return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4780	}
4781
4782	/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4783	/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4784	/// double].
4785	///
4786	/// \headerfile <x86intrin.h>
4787	///
4788	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4789	///
4790	/// \param __a
4791	/// A 128-bit vector of [2 x double]. \n
4792	/// Bits [127:64] are written to bits [63:0] of the destination.
4793	/// \param __b
4794	/// A 128-bit vector of [2 x double]. \n
4795	/// Bits [127:64] are written to bits [127:64] of the destination.
4796	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4797	static __inline__ __m128d __DEFAULT_FN_ATTRS
4798	_mm_unpackhi_pd(__m128d __a, __m128d __b)
4799	{
4800	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4801	}
4802
4803	/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4804	/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4805	/// double].
4806	///
4807	/// \headerfile <x86intrin.h>
4808	///
4809	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4810	///
4811	/// \param __a
4812	/// A 128-bit vector of [2 x double]. \n
4813	/// Bits [63:0] are written to bits [63:0] of the destination.
4814	/// \param __b
4815	/// A 128-bit vector of [2 x double]. \n
4816	/// Bits [63:0] are written to bits [127:64] of the destination.
4817	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4818	static __inline__ __m128d __DEFAULT_FN_ATTRS
4819	_mm_unpacklo_pd(__m128d __a, __m128d __b)
4820	{
4821	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4822	}
4823
4824	/// Extracts the sign bits of the double-precision values in the 128-bit
4825	/// vector of [2 x double], zero-extends the value, and writes it to the
4826	/// low-order bits of the destination.
4827	///
4828	/// \headerfile <x86intrin.h>
4829	///
4830	/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4831	///
4832	/// \param __a
4833	/// A 128-bit vector of [2 x double] containing the values with sign bits to
4834	/// be extracted.
4835	/// \returns The sign bits from each of the double-precision elements in \a __a,
4836	/// written to bits [1:0]. The remaining bits are assigned values of zero.
4837	static __inline__ int __DEFAULT_FN_ATTRS
4838	_mm_movemask_pd(__m128d __a)
4839	{
4840	return __builtin_ia32_movmskpd((__v2df)__a);
4841	}
4842
4843
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// The expansion is wrapped in parentheses so that an operator applied to
/// the macro invocation (for example a subscript) acts on the complete
/// result rather than binding to the builtin call ahead of the cast.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), \
                                  (__v2df)(__m128d)(b), (int)(i)))
4871
4872	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4873	/// floating-point vector of [4 x float].
4874	///
4875	/// \headerfile <x86intrin.h>
4876	///
4877	/// This intrinsic has no corresponding instruction.
4878	///
4879	/// \param __a
4880	/// A 128-bit floating-point vector of [2 x double].
4881	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4882	/// bitwise pattern as the parameter.
4883	static __inline__ __m128 __DEFAULT_FN_ATTRS
4884	_mm_castpd_ps(__m128d __a)
4885	{
4886	return (__m128)__a;
4887	}
4888
4889	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4890	/// integer vector.
4891	///
4892	/// \headerfile <x86intrin.h>
4893	///
4894	/// This intrinsic has no corresponding instruction.
4895	///
4896	/// \param __a
4897	/// A 128-bit floating-point vector of [2 x double].
4898	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4899	/// parameter.
4900	static __inline__ __m128i __DEFAULT_FN_ATTRS
4901	_mm_castpd_si128(__m128d __a)
4902	{
4903	return (__m128i)__a;
4904	}
4905
4906	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4907	/// floating-point vector of [2 x double].
4908	///
4909	/// \headerfile <x86intrin.h>
4910	///
4911	/// This intrinsic has no corresponding instruction.
4912	///
4913	/// \param __a
4914	/// A 128-bit floating-point vector of [4 x float].
4915	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4916	/// bitwise pattern as the parameter.
4917	static __inline__ __m128d __DEFAULT_FN_ATTRS
4918	_mm_castps_pd(__m128 __a)
4919	{
4920	return (__m128d)__a;
4921	}
4922
4923	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4924	/// integer vector.
4925	///
4926	/// \headerfile <x86intrin.h>
4927	///
4928	/// This intrinsic has no corresponding instruction.
4929	///
4930	/// \param __a
4931	/// A 128-bit floating-point vector of [4 x float].
4932	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4933	/// parameter.
4934	static __inline__ __m128i __DEFAULT_FN_ATTRS
4935	_mm_castps_si128(__m128 __a)
4936	{
4937	return (__m128i)__a;
4938	}
4939
4940	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4941	/// of [4 x float].
4942	///
4943	/// \headerfile <x86intrin.h>
4944	///
4945	/// This intrinsic has no corresponding instruction.
4946	///
4947	/// \param __a
4948	/// A 128-bit integer vector.
4949	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4950	/// bitwise pattern as the parameter.
4951	static __inline__ __m128 __DEFAULT_FN_ATTRS
4952	_mm_castsi128_ps(__m128i __a)
4953	{
4954	return (__m128)__a;
4955	}
4956
4957	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4958	/// of [2 x double].
4959	///
4960	/// \headerfile <x86intrin.h>
4961	///
4962	/// This intrinsic has no corresponding instruction.
4963	///
4964	/// \param __a
4965	/// A 128-bit integer vector.
4966	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4967	/// bitwise pattern as the parameter.
4968	static __inline__ __m128d __DEFAULT_FN_ATTRS
4969	_mm_castsi128_pd(__m128i __a)
4970	{
4971	return (__m128d)__a;
4972	}
4973
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is inside a spin loop, so that power
///    consumption can be optimized while the loop runs.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
4990	#undef __DEFAULT_FN_ATTRS
4991	#undef __DEFAULT_FN_ATTRS_MMX
4992
/* Builds a two-bit selector value: x is placed in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4994
/* Values for the denormals-are-zero mode bit (0x0040) of the control/status
   word accessed through _mm_getcsr() and _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040)
#define _MM_DENORMALS_ZERO_OFF (0x0000)

/* Mask isolating the denormals-are-zero mode bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040)

/* Reads the current mode: yields _MM_DENORMALS_ZERO_ON or
   _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Clears the mode bit, then ORs in x; every other bit of the control/status
   word is written back unchanged. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
5002
5003	#endif /* __EMMINTRIN_H */
5004

Clang Project