// core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    unsafe { pause() }
}
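
// Example: a spin-wait loop that polls a flag while easing pressure on the
// pipeline (a minimal sketch; `spin_until` and `ready` are hypothetical).
//
// ```rust
// use core::arch::x86_64::_mm_pause;
// use core::sync::atomic::{AtomicBool, Ordering};
//
// fn spin_until(ready: &AtomicBool) {
//     while !ready.load(Ordering::Acquire) {
//         // Hints that this is a spin-wait loop, reducing power usage and
//         // contention with the other hardware thread on the core.
//         _mm_pause();
//     }
// }
// ```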

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes the fence in program
/// order is globally visible before any load instruction that follows the
/// fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_lfence() {
    unsafe { lfence() }
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes the memory fence
/// instruction in program order is globally visible before any memory
/// instruction that follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mfence() {
    unsafe { mfence() }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}
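
// Example: unsigned saturating addition clamps at 255 instead of wrapping (a
// minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn saturate() -> i32 {
//     let a = _mm_set1_epi8(200u8 as i8);
//     let b = _mm_set1_epi8(100u8 as i8);
//     // 200 + 100 saturates to 255 in every lane; a wrapping add would give 44.
//     _mm_cvtsi128_si32(_mm_adds_epu8(a, b)) & 0xff // 255
// }
// ```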

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}
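
// Example: `pavgb` computes the rounding average `(a + b + 1) >> 1` in each
// lane (a minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn rounding_avg() -> i32 {
//     let a = _mm_set1_epi8(7);
//     let b = _mm_set1_epi8(8);
//     // (7 + 8 + 1) >> 1 == 8: the average rounds up, not down.
//     _mm_cvtsi128_si32(_mm_avg_epu8(a, b)) & 0xff // 8
// }
// ```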

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
    //
    // ```rust
    // #[target_feature(enable = "sse2")]
    // unsafe fn widening_add(mad: __m128i) -> __m128i {
    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
    // }
    // ```
    //
    // If we implement this using generic vector intrinsics, the optimizer
    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
    // For this reason, we use x86 intrinsics.
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}
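
// Example: one step of a 16-bit dot product (a minimal sketch, assuming an
// SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn dot_step() -> i32 {
//     let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//     let b = _mm_set1_epi16(1);
//     // Each 32-bit lane holds the sum of one adjacent pair of products;
//     // the lowest lane is 8*1 + 7*1 == 15.
//     _mm_cvtsi128_si32(_mm_madd_epi16(a, b)) // 15
// }
// ```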

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imax(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imax(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imin(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imin(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
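
// Example: a full 32x32 -> 64-bit unsigned multiply of the even lanes (a
// minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn widening_mul() -> __m128i {
//     let a = _mm_set_epi32(0, 0x8000_0000u32 as i32, 0, 3);
//     let b = _mm_set_epi32(0, 2, 0, 5);
//     // 64-bit lane 0: 3 * 5 == 15; 64-bit lane 1: 0x8000_0000 * 2 ==
//     // 0x1_0000_0000, which would overflow a 32-bit result.
//     _mm_mul_epu32(a, b)
// }
// ```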

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive eight differences to
/// produce two unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers into the low 16 bits of the returned 64-bit elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}
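
// Example: summing all 16 bytes of a vector via `psadbw` against zero (a
// minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn byte_sum() -> u32 {
//     let a = _mm_set1_epi8(3);
//     // |3 - 0| summed over each 8-byte half: two 16-bit partial sums of 24.
//     let sums = _mm_sad_epu8(a, _mm_setzero_si128());
//     let lo = _mm_cvtsi128_si32(sums) as u32;
//     let hi = _mm_cvtsi128_si32(_mm_srli_si128::<8>(sums)) as u32;
//     lo + hi // 48
// }
// ```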

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
/// 16-bit integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}
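
// Example: a byte-granular shift, in contrast to the per-element bit shifts
// below (a minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn shift_in_zero_bytes() -> i32 {
//     let a = _mm_set1_epi8(1);
//     // Shifting left by 4 bytes leaves zeros in the low 4 bytes.
//     _mm_cvtsi128_si32(_mm_slli_si128::<4>(a)) // 0
// }
// ```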

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}
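
// Example: a shift count of 16 or more zeroes the whole vector instead of
// wrapping the count (a minimal sketch, assuming an SSE2-enabled x86_64
// target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn oversized_shift() -> i32 {
//     let a = _mm_set1_epi16(-1);
//     _mm_cvtsi128_si32(_mm_slli_epi16::<16>(a)) // 0
// }
// ```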

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}
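
// Example: arithmetic right shifts replicate the sign bit (a minimal sketch,
// assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn sign_extending_shift() -> i32 {
//     let a = _mm_set1_epi16(-16);
//     // -16 >> 2 == -4 in every lane; counts of 16 or more leave only sign bits.
//     _mm_cvtsi128_si32(_mm_srai_epi16::<2>(a)) & 0xffff // 0xfffc, i.e. -4
// }
// ```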

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}
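
// Example: the classic mask-select idiom `(mask & a) | (!mask & b)` (a minimal
// sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn select(mask: __m128i, a: __m128i, b: __m128i) -> __m128i {
//     // Lanes where `mask` is all-ones take `a`; all other lanes take `b`.
//     _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
// }
// ```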

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}
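
// Example: combining a comparison mask with the select idiom above to emulate
// a per-lane 32-bit `max` (a minimal sketch, assuming an SSE2-enabled x86_64
// target; `_mm_max_epi32` itself requires SSE4.1).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn max_epi32(a: __m128i, b: __m128i) -> __m128i {
//     let gt = _mm_cmpgt_epi32(a, b); // all-ones where a > b, zero elsewhere
//     _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b))
// }
// ```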

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}
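
// Example: the `set` constructors take arguments from the highest lane down to
// the lowest (a minimal sketch, assuming an SSE2-enabled x86_64 target).
//
// ```rust
// use core::arch::x86_64::*;
//
// #[target_feature(enable = "sse2")]
// unsafe fn lane_order() -> i32 {
//     let v = _mm_set_epi32(3, 2, 1, 0);
//     // `e0`, the last argument, lands in the lowest lane.
//     _mm_cvtsi128_si32(v) // 0
// }
// ```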

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128 bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128 bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}
1342
1343/// Conditionally store 8-bit integer elements from `a` into memory using
1344/// `mask` flagged as non-temporal (unlikely to be used again soon).
1345///
1346/// Elements are not stored when the highest bit is not set in the
1347/// corresponding element.
1348///
1349/// `mem_addr` should correspond to a 128-bit memory location and does not need
1350/// to be aligned on any particular boundary.
1351///
1352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
1353///
1354/// # Safety of non-temporal stores
1355///
1356/// After using this intrinsic, but before any other access to the memory that this intrinsic
1357/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1358/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1359/// return.
1360///
1361/// See [`_mm_sfence`] for details.
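///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): only the
/// bytes whose mask element has its high bit set are written, and the store is
/// followed by the required `_mm_sfence`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let mut buf = [0i8; 16];
///     let a = _mm_set1_epi8(7);
///     // High bit set in element 0 only: store just the first byte.
///     let mask = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///     unsafe { _mm_maskmoveu_si128(a, mask, buf.as_mut_ptr()) };
///     _mm_sfence();
///     assert_eq!(buf[0], 7);
///     assert_eq!(buf[1], 0);
/// }
/// ```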
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}

/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}

/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
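///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): a
/// non-temporal store of one `i32`, fenced before the value is read back.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let mut x = 0i32;
///     unsafe { _mm_stream_si32(&mut x, 42) };
///     _mm_sfence();
///     assert_eq!(x, 42);
/// }
/// ```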
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}

/// Returns a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on msvc, movd on i686
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
    unsafe {
        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
        transmute(r)
    }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
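///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline):
/// out-of-range lanes saturate to `i8::MIN`/`i8::MAX` instead of wrapping.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_set1_epi16(1000); // too large for i8
///     let b = _mm_set1_epi16(-1000); // too small for i8
///     let packed = _mm_packs_epi16(a, b);
///     // The first eight bytes come from `a`, all saturated to 127 (0x7f).
///     assert_eq!(_mm_extract_epi16::<0>(packed), 0x7f7f);
/// }
/// ```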
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Returns the `imm8` element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
}

/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
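///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): the lane
/// index is a const generic, so it must be known at compile time.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_set1_epi16(3);
///     // Replace lane 7 with 42, then read it back.
///     let b = _mm_insert_epi16::<7>(a, 42);
///     assert_eq!(_mm_extract_epi16::<7>(b), 42);
///     assert_eq!(_mm_extract_epi16::<0>(b), 3);
/// }
/// ```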
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
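///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): combined
/// with a byte-wise compare, the bitmask locates matching bytes, which is the
/// classic building block of SIMD `memchr`-style searches.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let haystack = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///     let eq = _mm_cmpeq_epi8(haystack, _mm_set1_epi8(5));
///     let mask = _mm_movemask_epi8(eq);
///     // Bit 4 is set: byte 4 equals 5.
///     assert_eq!(mask.trailing_zeros(), 4);
/// }
/// ```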
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_movemask_epi8(a: __m128i) -> i32 {
    unsafe {
        let z = i8x16::ZERO;
        let m: i8x16 = simd_lt(a.as_i8x16(), z);
        simd_bitmask::<_, u16>(m) as u32 as i32
    }
}

/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
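///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): each
/// two-bit field of `IMM8` selects a source lane, lowest field first, so
/// `0b00_01_10_11` reverses the four lanes.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_setr_epi32(10, 20, 30, 40);
///     let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
///     assert_eq!(_mm_cvtsi128_si32(r), 40); // lane 0 now holds old lane 3
/// }
/// ```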
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x4();
        let x: i32x4 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the high 64 bits of the returned vector, with the low
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                (IMM8 as u32 & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the low 64 bits of the returned vector, with the high
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                4,
                5,
                6,
                7,
            ],
        );
        transmute(x)
    }
}

/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
}

/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
}

/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
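///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): the low
/// eight bytes of `a` and `b` are interleaved, which is a common first step
/// when widening bytes to words.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_set1_epi8(1);
///     let b = _mm_set1_epi8(2);
///     let lo = _mm_unpacklo_epi8(a, b);
///     // Bytes alternate 1, 2, 1, 2, ... so the first 16-bit lane is 0x0201.
///     assert_eq!(_mm_extract_epi16::<0>(lo), 0x0201);
/// }
/// ```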
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
}

/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
}

/// Returns a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
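///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): the `sd`
/// form only touches the low lane, unlike the packed `pd` form.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_setr_pd(1.0, 10.0);
///     let b = _mm_setr_pd(2.0, 20.0);
///     let r = _mm_add_sd(a, b);
///     assert_eq!(_mm_cvtsd_f64(r), 3.0); // low lane: 1.0 + 2.0
///     // The high lane is copied from `a` (10.0), not summed.
/// }
/// ```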
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_add(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
}

/// Divides packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_div(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxsd(a, b) }
}

/// Returns a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minsd(a, b) }
}

/// Returns a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the product of
/// the low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_mul(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
}

/// Returns a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    unsafe { simd_fsqrt(a) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// subtracting the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_sub(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_and_si128(a, b))
    }
}

/// Computes the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_andnot_si128(a, b))
    }
}

/// Computes the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_or_si128(a, b))
    }
}

/// Computes the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_xor_si128(a, b))
    }
}

/// Returns a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 0) }
}

/// Returns a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 1) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 2) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither is `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 7) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 3) }
}

/// Returns a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 4) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 5) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 6) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 0) }
}

/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
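///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline): each lane
/// of the result is all-ones or all-zeros, so `_mm_movemask_pd` condenses it
/// into a two-bit mask.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_setr_pd(1.0, 5.0);
///     let b = _mm_setr_pd(2.0, 4.0);
///     let lt = _mm_cmplt_pd(a, b);
///     // Only lane 0 satisfies a < b, so only bit 0 is set.
///     assert_eq!(_mm_movemask_pd(lt), 0b01);
/// }
/// ```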
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 1) }
}

/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 2) }
}

/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmplt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmple_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 7) }
}

/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 3) }
}

/// Compares corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 4) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 5) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 6) }
}

/// Compares corresponding elements in `a` and `b` for not-greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnlt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for
/// not-greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnle_pd(b, a)
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comineqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomineqsd(a, b) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    unsafe {
        let r = simd_cast::<_, f32x2>(a.as_f64x2());
        let zero = f32x2::ZERO;
        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
    }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtps_pd(a: __m128) -> __m128d {
    unsafe {
        let a = a.as_f32x4();
        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvtpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    unsafe { cvtsd2si(a) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper element from
/// `a` to the upper element of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    unsafe { cvtsd2ss(a, b) }
}

/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper element from
/// `a` to the upper element of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    unsafe {
        let elt: f32 = simd_extract!(b, 0);
        simd_insert!(a, 0, elt as f64)
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvttpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
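///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline):
/// `_mm_cvttsd_si32` truncates toward zero, while `_mm_cvtsd_si32` rounds
/// according to the current rounding mode (round-to-nearest-even by default).
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_set_sd(2.7);
///     assert_eq!(_mm_cvttsd_si32(a), 2); // truncated
///     assert_eq!(_mm_cvtsd_si32(a), 3); // rounded to nearest
/// }
/// ```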
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    unsafe { cvttsd2si(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvttps2dq(a)) }
}

/// Copies double-precision (64-bit) floating-point element `a` to the lower
/// element of the return value, and zeroes the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_sd(a: f64) -> __m128d {
    _mm_set_pd(0.0, a)
}

/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd1(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    __m128d([b, a])
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
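///
/// # Examples
///
/// A minimal sketch, assuming SSE2 (part of the `x86_64` baseline):
/// `_mm_set_pd` takes the high lane first, while `_mm_setr_pd` takes its
/// arguments in memory order.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     let a = _mm_set_pd(2.0, 1.0);
///     let b = _mm_setr_pd(1.0, 2.0);
///     // Both vectors hold 1.0 in the low lane and 2.0 in the high lane.
///     assert_eq!(_mm_cvtsd_f64(a), 1.0);
///     assert_eq!(_mm_cvtsd_f64(b), 1.0);
/// }
/// ```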
2619#[inline]
2620#[target_feature(enable = "sse2")]
2621#[stable(feature = "simd_x86", since = "1.27.0")]
2622#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2623pub const fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2624    _mm_set_pd(b, a)
2625}
2626
2627/// Returns packed double-precision (64-bit) floating-point elements with all
2628/// zeros.
2629///
2630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2631#[inline]
2632#[target_feature(enable = "sse2")]
2633#[cfg_attr(test, assert_instr(xorp))]
2634#[stable(feature = "simd_x86", since = "1.27.0")]
2635#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2636pub const fn _mm_setzero_pd() -> __m128d {
2637    const { unsafe { mem::zeroed() } }
2638}
2639
2640/// Returns a mask of the most significant bit of each element in `a`.
2641///
2642/// The mask is stored in the 2 least significant bits of the return value.
2643/// All other bits are set to `0`.
2644///
2645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
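///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): bit `i` of
/// the result is the sign bit of element `i`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let v = _mm_setr_pd(-1.0, 2.0);
///         // Only element 0 has its sign bit set.
///         assert_eq!(_mm_movemask_pd(v), 0b01);
///     }
/// }
/// ```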
2646#[inline]
2647#[target_feature(enable = "sse2")]
2648#[cfg_attr(test, assert_instr(movmskpd))]
2649#[stable(feature = "simd_x86", since = "1.27.0")]
2650#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2651pub const fn _mm_movemask_pd(a: __m128d) -> i32 {
2652    // Propagate the highest bit to the rest, because simd_bitmask
2653    // requires all-1 or all-0.
2654    unsafe {
2655        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2656        simd_bitmask::<i64x2, u8>(mask) as i32
2657    }
2658}
2659
2660/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2661/// floating-point elements) from memory into the returned vector.
2662/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2663/// exception may be generated.
2664///
2665/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
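///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs); the
/// `#[repr(align(16))]` wrapper exists only to satisfy the alignment
/// requirement.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     #[repr(align(16))]
///     struct Aligned([f64; 2]);
///     let data = Aligned([1.0, 2.0]);
///     unsafe {
///         let v = _mm_load_pd(data.0.as_ptr());
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), v);
///         assert_eq!(out, [1.0, 2.0]);
///     }
/// }
/// ```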
2666#[inline]
2667#[target_feature(enable = "sse2")]
2668#[cfg_attr(
2669    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2670    assert_instr(movaps)
2671)]
2672#[stable(feature = "simd_x86", since = "1.27.0")]
2673#[allow(clippy::cast_ptr_alignment)]
2674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2675pub const unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2676    *(mem_addr as *const __m128d)
2677}
2678
2679/// Loads a 64-bit double-precision value to the low element of a
2680/// 128-bit vector of `[2 x double]` and clears the upper element.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2683#[inline]
2684#[target_feature(enable = "sse2")]
2685#[cfg_attr(test, assert_instr(movsd))]
2686#[stable(feature = "simd_x86", since = "1.27.0")]
2687#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2688pub const unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2689    _mm_setr_pd(*mem_addr, 0.)
2690}
2691
2692/// Loads a double-precision value into the high-order bits of a 128-bit
2693/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2694/// bits of the first operand.
2695///
2696/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
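///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): the low
/// lane of `a` is kept and the loaded value replaces the high lane.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let x = 7.0f64;
///         let r = _mm_loadh_pd(a, &x);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 7.0]);
///     }
/// }
/// ```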
2697#[inline]
2698#[target_feature(enable = "sse2")]
2699#[cfg_attr(test, assert_instr(movhps))]
2700#[stable(feature = "simd_x86", since = "1.27.0")]
2701#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2702pub const unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2703    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2704}
2705
2706/// Loads a double-precision value into the low-order bits of a 128-bit
2707/// vector of `[2 x double]`. The high-order bits are copied from the
2708/// high-order bits of the first operand.
2709///
2710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2711#[inline]
2712#[target_feature(enable = "sse2")]
2713#[cfg_attr(test, assert_instr(movlps))]
2714#[stable(feature = "simd_x86", since = "1.27.0")]
2715#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2716pub const unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2717    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2718}
2719
2720/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2721/// aligned memory location.
2722/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2723/// used again soon).
2724///
2725/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2726///
2727/// # Safety of non-temporal stores
2728///
2729/// After using this intrinsic, but before any other access to the memory that this intrinsic
2730/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2731/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2732/// return.
2733///
2734/// See [`_mm_sfence`] for details.
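///
/// # Examples
///
/// A hedged sketch of the required pattern (not part of the upstream docs):
/// perform the non-temporal store, then fence before the memory is accessed
/// again.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     #[repr(align(16))]
///     struct Aligned([f64; 2]);
///     let mut buf = Aligned([0.0; 2]);
///     unsafe {
///         _mm_stream_pd(buf.0.as_mut_ptr(), _mm_setr_pd(1.0, 2.0));
///         _mm_sfence(); // required before `buf` is read again
///     }
///     assert_eq!(buf.0, [1.0, 2.0]);
/// }
/// ```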
2735#[inline]
2736#[target_feature(enable = "sse2")]
2737#[cfg_attr(test, assert_instr(movntpd))]
2738#[stable(feature = "simd_x86", since = "1.27.0")]
2739#[allow(clippy::cast_ptr_alignment)]
2740pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2741    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2742    crate::arch::asm!(
2743        vps!("movntpd", ",{a}"),
2744        p = in(reg) mem_addr,
2745        a = in(xmm_reg) a,
2746        options(nostack, preserves_flags),
2747    );
2748}
2749
2750/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2751/// memory location.
2752///
2753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2754#[inline]
2755#[target_feature(enable = "sse2")]
2756#[cfg_attr(test, assert_instr(movlps))]
2757#[stable(feature = "simd_x86", since = "1.27.0")]
2758#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2759pub const unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2760    *mem_addr = simd_extract!(a, 0)
2761}
2762
2763/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2764/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2765/// on a 16-byte boundary or a general-protection exception may be generated.
2766///
2767/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2768#[inline]
2769#[target_feature(enable = "sse2")]
2770#[cfg_attr(
2771    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2772    assert_instr(movaps)
2773)]
2774#[stable(feature = "simd_x86", since = "1.27.0")]
2775#[allow(clippy::cast_ptr_alignment)]
2776#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2777pub const unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2778    *(mem_addr as *mut __m128d) = a;
2779}
2780
2781/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2782/// floating-point elements) from `a` into memory.
2783/// `mem_addr` does not need to be aligned on any particular boundary.
2784///
2785/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2786#[inline]
2787#[target_feature(enable = "sse2")]
2788#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2789#[stable(feature = "simd_x86", since = "1.27.0")]
2790#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2791pub const unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2792    mem_addr.cast::<__m128d>().write_unaligned(a);
2793}
2794
2795/// Stores a 16-bit integer from the first element of `a` into memory.
2796///
2797/// `mem_addr` does not need to be aligned on any particular boundary.
2798///
2799/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2800#[inline]
2801#[target_feature(enable = "sse2")]
2802#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2803#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2804pub const unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2805    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2806}
2807
2808/// Stores a 32-bit integer from the first element of `a` into memory.
2809///
2810/// `mem_addr` does not need to be aligned on any particular boundary.
2811///
2812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
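///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): the low 32
/// bits of the vector are written out in little-endian byte order.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let mut out = [0u8; 4];
///     unsafe {
///         _mm_storeu_si32(out.as_mut_ptr(), _mm_setr_epi32(0x0403_0201, 0, 0, 0));
///     }
///     assert_eq!(out, [1, 2, 3, 4]);
/// }
/// ```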
2813#[inline]
2814#[target_feature(enable = "sse2")]
2815#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2816#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2817pub const unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2818    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2819}
2820
2821/// Stores a 64-bit integer from the first element of `a` into memory.
2822///
2823/// `mem_addr` does not need to be aligned on any particular boundary.
2824///
2825/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2826#[inline]
2827#[target_feature(enable = "sse2")]
2828#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2829#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2830pub const unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2831    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2832}
2833
2834/// Stores the lower double-precision (64-bit) floating-point element from `a`
2835/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2836/// 16-byte boundary or a general-protection exception may be generated.
2837///
2838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
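///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): both memory
/// slots receive the lower element of `a`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     #[repr(align(16))]
///     struct Aligned([f64; 2]);
///     let mut buf = Aligned([0.0; 2]);
///     unsafe {
///         _mm_store1_pd(buf.0.as_mut_ptr(), _mm_setr_pd(3.0, 4.0));
///     }
///     assert_eq!(buf.0, [3.0, 3.0]);
/// }
/// ```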
2839#[inline]
2840#[target_feature(enable = "sse2")]
2841#[stable(feature = "simd_x86", since = "1.27.0")]
2842#[allow(clippy::cast_ptr_alignment)]
2843#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2844pub const unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2845    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2846    *(mem_addr as *mut __m128d) = b;
2847}
2848
2849/// Stores the lower double-precision (64-bit) floating-point element from `a`
2850/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2851/// 16-byte boundary or a general-protection exception may be generated.
2852///
2853/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2854#[inline]
2855#[target_feature(enable = "sse2")]
2856#[stable(feature = "simd_x86", since = "1.27.0")]
2857#[allow(clippy::cast_ptr_alignment)]
2858#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2859pub const unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2860    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2861    *(mem_addr as *mut __m128d) = b;
2862}
2863
2864/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2865/// memory in reverse order.
2866/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2867/// exception may be generated.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2870#[inline]
2871#[target_feature(enable = "sse2")]
2872#[stable(feature = "simd_x86", since = "1.27.0")]
2873#[allow(clippy::cast_ptr_alignment)]
2874#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2875pub const unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2876    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2877    *(mem_addr as *mut __m128d) = b;
2878}
2879
2880/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2881/// memory location.
2882///
2883/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2884#[inline]
2885#[target_feature(enable = "sse2")]
2886#[cfg_attr(test, assert_instr(movhps))]
2887#[stable(feature = "simd_x86", since = "1.27.0")]
2888#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2889pub const unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2890    *mem_addr = simd_extract!(a, 1);
2891}
2892
2893/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2894/// memory location.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[cfg_attr(test, assert_instr(movlps))]
2900#[stable(feature = "simd_x86", since = "1.27.0")]
2901#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2902pub const unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2903    *mem_addr = simd_extract!(a, 0);
2904}
2905
2906/// Loads a double-precision (64-bit) floating-point element from memory
2907/// into both elements of the returned vector.
2908///
2909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2910#[inline]
2911#[target_feature(enable = "sse2")]
2912// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2913#[stable(feature = "simd_x86", since = "1.27.0")]
2914#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2915pub const unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2916    let d = *mem_addr;
2917    _mm_setr_pd(d, d)
2918}
2919
2920/// Loads a double-precision (64-bit) floating-point element from memory
2921/// into both elements of the returned vector.
2922///
2923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2924#[inline]
2925#[target_feature(enable = "sse2")]
2926// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2927#[stable(feature = "simd_x86", since = "1.27.0")]
2928#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2929pub const unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2930    _mm_load1_pd(mem_addr)
2931}
2932
2933/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2934/// the returned vector in reverse order. `mem_addr` must be aligned on a
2935/// 16-byte boundary or a general-protection exception may be generated.
2936///
2937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
2938#[inline]
2939#[target_feature(enable = "sse2")]
2940#[cfg_attr(
2941    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2942    assert_instr(movaps)
2943)]
2944#[stable(feature = "simd_x86", since = "1.27.0")]
2945#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2946pub const unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2947    let a = _mm_load_pd(mem_addr);
2948    simd_shuffle!(a, a, [1, 0])
2949}
2950
2951/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2952/// floating-point elements) from memory into the returned vector.
2953/// `mem_addr` does not need to be aligned on any particular boundary.
2954///
2955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2956#[inline]
2957#[target_feature(enable = "sse2")]
2958#[cfg_attr(test, assert_instr(movups))]
2959#[stable(feature = "simd_x86", since = "1.27.0")]
2960#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2961pub const unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2962    let mut dst = _mm_undefined_pd();
2963    ptr::copy_nonoverlapping(
2964        mem_addr as *const u8,
2965        ptr::addr_of_mut!(dst) as *mut u8,
2966        mem::size_of::<__m128d>(),
2967    );
2968    dst
2969}
2970
2971/// Loads unaligned 16 bits of integer data from memory into a new vector.
2972///
2973/// `mem_addr` does not need to be aligned on any particular boundary.
2974///
2975/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2976#[inline]
2977#[target_feature(enable = "sse2")]
2978#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2979#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2980pub const unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2981    transmute(i16x8::new(
2982        ptr::read_unaligned(mem_addr as *const i16),
2983        0,
2984        0,
2985        0,
2986        0,
2987        0,
2988        0,
2989        0,
2990    ))
2991}
2992
2993/// Loads unaligned 32 bits of integer data from memory into a new vector.
2994///
2995/// `mem_addr` does not need to be aligned on any particular boundary.
2996///
2997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2998#[inline]
2999#[target_feature(enable = "sse2")]
3000#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3001#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3002pub const unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
3003    transmute(i32x4::new(
3004        ptr::read_unaligned(mem_addr as *const i32),
3005        0,
3006        0,
3007        0,
3008    ))
3009}
3010
3011/// Loads unaligned 64 bits of integer data from memory into a new vector.
3012///
3013/// `mem_addr` does not need to be aligned on any particular boundary.
3014///
3015/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
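///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): 8 bytes are
/// read into the low lane and the high lane is zeroed.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let bytes = 0x0807_0605_0403_0201u64.to_le_bytes();
///     unsafe {
///         let v = _mm_loadu_si64(bytes.as_ptr());
///         let mut out = [0i64; 2];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v);
///         assert_eq!(out, [0x0807_0605_0403_0201, 0]);
///     }
/// }
/// ```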
3016#[inline]
3017#[target_feature(enable = "sse2")]
3018#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
3019#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3020pub const unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
3021    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
3022}
3023
3024/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
3025/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
3026/// parameter as a specifier.
3027///
3028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
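///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): bit 0 of
/// `MASK` selects the lane of `a` for element 0 and bit 1 selects the lane of
/// `b` for element 1.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         let r = _mm_shuffle_pd::<0b01>(a, b); // [a[1], b[0]]
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 3.0]);
///     }
/// }
/// ```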
3029#[inline]
3030#[target_feature(enable = "sse2")]
3031#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
3032#[rustc_legacy_const_generics(2)]
3033#[stable(feature = "simd_x86", since = "1.27.0")]
3034#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3035pub const fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
3036    static_assert_uimm_bits!(MASK, 8);
3037    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
3038}
3039
3040/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
3041/// 64 bits are set to the lower 64 bits of the second parameter. The upper
3042/// 64 bits are set to the upper 64 bits of the first parameter.
3043///
3044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
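///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): the result
/// is `[b[0], a[1]]`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         let r = _mm_move_sd(a, b);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [3.0, 2.0]);
///     }
/// }
/// ```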
3045#[inline]
3046#[target_feature(enable = "sse2")]
3047#[cfg_attr(test, assert_instr(movsd))]
3048#[stable(feature = "simd_x86", since = "1.27.0")]
3049#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3050pub const fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
3051    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
3052}
3053
3054/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3055/// floating-point vector of `[4 x float]`.
3056///
3057/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
3058#[inline]
3059#[target_feature(enable = "sse2")]
3060#[stable(feature = "simd_x86", since = "1.27.0")]
3061#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3062pub const fn _mm_castpd_ps(a: __m128d) -> __m128 {
3063    unsafe { transmute(a) }
3064}
3065
3066/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3067/// integer vector.
3068///
3069/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
3070#[inline]
3071#[target_feature(enable = "sse2")]
3072#[stable(feature = "simd_x86", since = "1.27.0")]
3073#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3074pub const fn _mm_castpd_si128(a: __m128d) -> __m128i {
3075    unsafe { transmute(a) }
3076}
3077
3078/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3079/// floating-point vector of `[2 x double]`.
3080///
3081/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
3082#[inline]
3083#[target_feature(enable = "sse2")]
3084#[stable(feature = "simd_x86", since = "1.27.0")]
3085#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3086pub const fn _mm_castps_pd(a: __m128) -> __m128d {
3087    unsafe { transmute(a) }
3088}
3089
3090/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3091/// integer vector.
3092///
3093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
3094#[inline]
3095#[target_feature(enable = "sse2")]
3096#[stable(feature = "simd_x86", since = "1.27.0")]
3097#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3098pub const fn _mm_castps_si128(a: __m128) -> __m128i {
3099    unsafe { transmute(a) }
3100}
3101
3102/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3103/// of `[2 x double]`.
3104///
3105/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
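///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): casts
/// reinterpret bits rather than converting values; `0x3FF0_0000_0000_0000` is
/// the IEEE 754 bit pattern of `1.0f64`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let v = _mm_castsi128_pd(_mm_set1_epi64x(0x3FF0_0000_0000_0000));
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), v);
///         assert_eq!(out, [1.0, 1.0]);
///     }
/// }
/// ```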
3106#[inline]
3107#[target_feature(enable = "sse2")]
3108#[stable(feature = "simd_x86", since = "1.27.0")]
3109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3110pub const fn _mm_castsi128_pd(a: __m128i) -> __m128d {
3111    unsafe { transmute(a) }
3112}
3113
3114/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3115/// of `[4 x float]`.
3116///
3117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
3118#[inline]
3119#[target_feature(enable = "sse2")]
3120#[stable(feature = "simd_x86", since = "1.27.0")]
3121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3122pub const fn _mm_castsi128_ps(a: __m128i) -> __m128 {
3123    unsafe { transmute(a) }
3124}
3125
3126/// Returns a vector of type `__m128d` with indeterminate elements.
3127/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3128/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3129/// In practice, this is typically equivalent to [`mem::zeroed`].
3130///
3131/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
3132#[inline]
3133#[target_feature(enable = "sse2")]
3134#[stable(feature = "simd_x86", since = "1.27.0")]
3135#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3136pub const fn _mm_undefined_pd() -> __m128d {
3137    const { unsafe { mem::zeroed() } }
3138}
3139
3140/// Returns a vector of type `__m128i` with indeterminate elements.
3141/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3142/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3143/// In practice, this is typically equivalent to [`mem::zeroed`].
3144///
3145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
3146#[inline]
3147#[target_feature(enable = "sse2")]
3148#[stable(feature = "simd_x86", since = "1.27.0")]
3149#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3150pub const fn _mm_undefined_si128() -> __m128i {
3151    const { unsafe { mem::zeroed() } }
3152}
3153
3154/// The resulting `__m128d` element is composed of the high-order values of
3155/// the two `__m128d` interleaved input elements, i.e.:
3156///
3157/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3158/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3159///
3160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
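///
/// # Examples
///
/// A minimal illustrative sketch (not part of the upstream docs): the result
/// is `[a[1], b[1]]`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         let r = _mm_unpackhi_pd(a, b);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 4.0]);
///     }
/// }
/// ```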
3161#[inline]
3162#[target_feature(enable = "sse2")]
3163#[cfg_attr(test, assert_instr(unpckhpd))]
3164#[stable(feature = "simd_x86", since = "1.27.0")]
3165#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3166pub const fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3167    unsafe { simd_shuffle!(a, b, [1, 3]) }
3168}
3169
3170/// The resulting `__m128d` element is composed of the low-order values of
3171/// the two `__m128d` interleaved input elements, i.e.:
3172///
3173/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3174/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3175///
3176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3177#[inline]
3178#[target_feature(enable = "sse2")]
3179#[cfg_attr(test, assert_instr(movlhps))]
3180#[stable(feature = "simd_x86", since = "1.27.0")]
3181#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3182pub const fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3183    unsafe { simd_shuffle!(a, b, [0, 2]) }
3184}
3185
3186#[allow(improper_ctypes)]
3187unsafe extern "C" {
3188    #[link_name = "llvm.x86.sse2.pause"]
3189    fn pause();
3190    #[link_name = "llvm.x86.sse2.clflush"]
3191    fn clflush(p: *const u8);
3192    #[link_name = "llvm.x86.sse2.lfence"]
3193    fn lfence();
3194    #[link_name = "llvm.x86.sse2.mfence"]
3195    fn mfence();
3196    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3197    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3198    #[link_name = "llvm.x86.sse2.psad.bw"]
3199    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3200    #[link_name = "llvm.x86.sse2.psll.w"]
3201    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3202    #[link_name = "llvm.x86.sse2.psll.d"]
3203    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3204    #[link_name = "llvm.x86.sse2.psll.q"]
3205    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3206    #[link_name = "llvm.x86.sse2.psra.w"]
3207    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3208    #[link_name = "llvm.x86.sse2.psra.d"]
3209    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3210    #[link_name = "llvm.x86.sse2.psrl.w"]
3211    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3212    #[link_name = "llvm.x86.sse2.psrl.d"]
3213    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3214    #[link_name = "llvm.x86.sse2.psrl.q"]
3215    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3216    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3217    fn cvtps2dq(a: __m128) -> i32x4;
3218    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3219    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3220    #[link_name = "llvm.x86.sse2.packsswb.128"]
3221    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3222    #[link_name = "llvm.x86.sse2.packssdw.128"]
3223    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3224    #[link_name = "llvm.x86.sse2.packuswb.128"]
3225    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3226    #[link_name = "llvm.x86.sse2.max.sd"]
3227    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3228    #[link_name = "llvm.x86.sse2.max.pd"]
3229    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3230    #[link_name = "llvm.x86.sse2.min.sd"]
3231    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3232    #[link_name = "llvm.x86.sse2.min.pd"]
3233    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3234    #[link_name = "llvm.x86.sse2.cmp.sd"]
3235    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3236    #[link_name = "llvm.x86.sse2.cmp.pd"]
3237    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3238    #[link_name = "llvm.x86.sse2.comieq.sd"]
3239    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3240    #[link_name = "llvm.x86.sse2.comilt.sd"]
3241    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3242    #[link_name = "llvm.x86.sse2.comile.sd"]
3243    fn comilesd(a: __m128d, b: __m128d) -> i32;
3244    #[link_name = "llvm.x86.sse2.comigt.sd"]
3245    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3246    #[link_name = "llvm.x86.sse2.comige.sd"]
3247    fn comigesd(a: __m128d, b: __m128d) -> i32;
3248    #[link_name = "llvm.x86.sse2.comineq.sd"]
3249    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3250    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3251    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3252    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3253    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3254    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3255    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3256    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3257    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3258    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3259    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3260    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3261    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3262    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3263    fn cvtpd2dq(a: __m128d) -> i32x4;
3264    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3265    fn cvtsd2si(a: __m128d) -> i32;
3266    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3267    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3268    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3269    fn cvttpd2dq(a: __m128d) -> i32x4;
3270    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3271    fn cvttsd2si(a: __m128d) -> i32;
3272    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3273    fn cvttps2dq(a: __m128) -> i32x4;
3274}
3275
3276#[cfg(test)]
3277mod tests {
3278    use crate::core_arch::assert_eq_const as assert_eq;
3279    use crate::{
3280        core_arch::{simd::*, x86::*},
3281        hint::black_box,
3282    };
3283    use std::{
3284        boxed, f32, f64,
3285        mem::{self, transmute},
3286        ptr,
3287    };
3288    use stdarch_test::simd_test;
3289
3290    const NAN: f64 = f64::NAN;
3291
3292    #[test]
3293    fn test_mm_pause() {
3294        _mm_pause()
3295    }
3296
3297    #[simd_test(enable = "sse2")]
3298    unsafe fn test_mm_clflush() {
3299        let x = 0_u8;
3300        _mm_clflush(ptr::addr_of!(x));
3301    }
3302
3303    #[simd_test(enable = "sse2")]
3304    // Miri cannot support this until it is clear how it fits in the Rust memory model
3305    #[cfg_attr(miri, ignore)]
3306    fn test_mm_lfence() {
3307        _mm_lfence();
3308    }
3309
3310    #[simd_test(enable = "sse2")]
3311    // Miri cannot support this until it is clear how it fits in the Rust memory model
3312    #[cfg_attr(miri, ignore)]
3313    fn test_mm_mfence() {
3314        _mm_mfence();
3315    }
3316
3317    #[simd_test(enable = "sse2")]
3318    const fn test_mm_add_epi8() {
3319        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3320        #[rustfmt::skip]
3321        let b = _mm_setr_epi8(
3322            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3323        );
3324        let r = _mm_add_epi8(a, b);
3325        #[rustfmt::skip]
3326        let e = _mm_setr_epi8(
3327            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3328        );
3329        assert_eq_m128i(r, e);
3330    }
3331
3332    #[simd_test(enable = "sse2")]
3333    fn test_mm_add_epi8_overflow() {
3334        let a = _mm_set1_epi8(0x7F);
3335        let b = _mm_set1_epi8(1);
3336        let r = _mm_add_epi8(a, b);
3337        assert_eq_m128i(r, _mm_set1_epi8(-128));
3338    }
3339
3340    #[simd_test(enable = "sse2")]
3341    const fn test_mm_add_epi16() {
3342        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3343        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3344        let r = _mm_add_epi16(a, b);
3345        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3346        assert_eq_m128i(r, e);
3347    }
3348
3349    #[simd_test(enable = "sse2")]
3350    const fn test_mm_add_epi32() {
3351        let a = _mm_setr_epi32(0, 1, 2, 3);
3352        let b = _mm_setr_epi32(4, 5, 6, 7);
3353        let r = _mm_add_epi32(a, b);
3354        let e = _mm_setr_epi32(4, 6, 8, 10);
3355        assert_eq_m128i(r, e);
3356    }
3357
3358    #[simd_test(enable = "sse2")]
3359    const fn test_mm_add_epi64() {
3360        let a = _mm_setr_epi64x(0, 1);
3361        let b = _mm_setr_epi64x(2, 3);
3362        let r = _mm_add_epi64(a, b);
3363        let e = _mm_setr_epi64x(2, 4);
3364        assert_eq_m128i(r, e);
3365    }
3366
3367    #[simd_test(enable = "sse2")]
3368    const fn test_mm_adds_epi8() {
3369        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3370        #[rustfmt::skip]
3371        let b = _mm_setr_epi8(
3372            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3373        );
3374        let r = _mm_adds_epi8(a, b);
3375        #[rustfmt::skip]
3376        let e = _mm_setr_epi8(
3377            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3378        );
3379        assert_eq_m128i(r, e);
3380    }
3381
3382    #[simd_test(enable = "sse2")]
3383    fn test_mm_adds_epi8_saturate_positive() {
3384        let a = _mm_set1_epi8(0x7F);
3385        let b = _mm_set1_epi8(1);
3386        let r = _mm_adds_epi8(a, b);
3387        assert_eq_m128i(r, a);
3388    }
3389
3390    #[simd_test(enable = "sse2")]
3391    fn test_mm_adds_epi8_saturate_negative() {
3392        let a = _mm_set1_epi8(-0x80);
3393        let b = _mm_set1_epi8(-1);
3394        let r = _mm_adds_epi8(a, b);
3395        assert_eq_m128i(r, a);
3396    }
3397
3398    #[simd_test(enable = "sse2")]
3399    const fn test_mm_adds_epi16() {
3400        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3401        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3402        let r = _mm_adds_epi16(a, b);
3403        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3404        assert_eq_m128i(r, e);
3405    }
3406
3407    #[simd_test(enable = "sse2")]
3408    fn test_mm_adds_epi16_saturate_positive() {
3409        let a = _mm_set1_epi16(0x7FFF);
3410        let b = _mm_set1_epi16(1);
3411        let r = _mm_adds_epi16(a, b);
3412        assert_eq_m128i(r, a);
3413    }
3414
3415    #[simd_test(enable = "sse2")]
3416    fn test_mm_adds_epi16_saturate_negative() {
3417        let a = _mm_set1_epi16(-0x8000);
3418        let b = _mm_set1_epi16(-1);
3419        let r = _mm_adds_epi16(a, b);
3420        assert_eq_m128i(r, a);
3421    }
3422
3423    #[simd_test(enable = "sse2")]
3424    const fn test_mm_adds_epu8() {
3425        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3426        #[rustfmt::skip]
3427        let b = _mm_setr_epi8(
3428            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3429        );
3430        let r = _mm_adds_epu8(a, b);
3431        #[rustfmt::skip]
3432        let e = _mm_setr_epi8(
3433            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3434        );
3435        assert_eq_m128i(r, e);
3436    }
3437
3438    #[simd_test(enable = "sse2")]
3439    fn test_mm_adds_epu8_saturate() {
3440        let a = _mm_set1_epi8(!0);
3441        let b = _mm_set1_epi8(1);
3442        let r = _mm_adds_epu8(a, b);
3443        assert_eq_m128i(r, a);
3444    }
3445
3446    #[simd_test(enable = "sse2")]
3447    const fn test_mm_adds_epu16() {
3448        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3449        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3450        let r = _mm_adds_epu16(a, b);
3451        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3452        assert_eq_m128i(r, e);
3453    }
3454
3455    #[simd_test(enable = "sse2")]
3456    fn test_mm_adds_epu16_saturate() {
3457        let a = _mm_set1_epi16(!0);
3458        let b = _mm_set1_epi16(1);
3459        let r = _mm_adds_epu16(a, b);
3460        assert_eq_m128i(r, a);
3461    }
3462
3463    #[simd_test(enable = "sse2")]
3464    const fn test_mm_avg_epu8() {
3465        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3466        let r = _mm_avg_epu8(a, b);
3467        assert_eq_m128i(r, _mm_set1_epi8(6));
3468    }
3469
3470    #[simd_test(enable = "sse2")]
3471    const fn test_mm_avg_epu16() {
3472        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3473        let r = _mm_avg_epu16(a, b);
3474        assert_eq_m128i(r, _mm_set1_epi16(6));
3475    }
3476
3477    #[simd_test(enable = "sse2")]
3478    fn test_mm_madd_epi16() {
3479        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3480        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3481        let r = _mm_madd_epi16(a, b);
3482        let e = _mm_setr_epi32(29, 81, 149, 233);
3483        assert_eq_m128i(r, e);
3484
3485        // Test large values.
3486        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3487        let a = _mm_setr_epi16(
3488            i16::MAX,
3489            i16::MAX,
3490            i16::MIN,
3491            i16::MIN,
3492            i16::MIN,
3493            i16::MAX,
3494            0,
3495            0,
3496        );
3497        let b = _mm_setr_epi16(
3498            i16::MAX,
3499            i16::MAX,
3500            i16::MIN,
3501            i16::MIN,
3502            i16::MAX,
3503            i16::MIN,
3504            0,
3505            0,
3506        );
3507        let r = _mm_madd_epi16(a, b);
3508        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3509        assert_eq_m128i(r, e);
3510    }
3511
3512    #[simd_test(enable = "sse2")]
3513    const fn test_mm_max_epi16() {
3514        let a = _mm_set1_epi16(1);
3515        let b = _mm_set1_epi16(-1);
3516        let r = _mm_max_epi16(a, b);
3517        assert_eq_m128i(r, a);
3518    }
3519
3520    #[simd_test(enable = "sse2")]
3521    const fn test_mm_max_epu8() {
3522        let a = _mm_set1_epi8(1);
3523        let b = _mm_set1_epi8(!0);
3524        let r = _mm_max_epu8(a, b);
3525        assert_eq_m128i(r, b);
3526    }
3527
3528    #[simd_test(enable = "sse2")]
3529    const fn test_mm_min_epi16() {
3530        let a = _mm_set1_epi16(1);
3531        let b = _mm_set1_epi16(-1);
3532        let r = _mm_min_epi16(a, b);
3533        assert_eq_m128i(r, b);
3534    }
3535
3536    #[simd_test(enable = "sse2")]
3537    const fn test_mm_min_epu8() {
3538        let a = _mm_set1_epi8(1);
3539        let b = _mm_set1_epi8(!0);
3540        let r = _mm_min_epu8(a, b);
3541        assert_eq_m128i(r, a);
3542    }
3543
3544    #[simd_test(enable = "sse2")]
3545    const fn test_mm_mulhi_epi16() {
3546        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3547        let r = _mm_mulhi_epi16(a, b);
3548        assert_eq_m128i(r, _mm_set1_epi16(-16));
3549    }
3550
3551    #[simd_test(enable = "sse2")]
3552    const fn test_mm_mulhi_epu16() {
3553        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3554        let r = _mm_mulhi_epu16(a, b);
3555        assert_eq_m128i(r, _mm_set1_epi16(15));
3556    }
3557
3558    #[simd_test(enable = "sse2")]
3559    const fn test_mm_mullo_epi16() {
3560        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3561        let r = _mm_mullo_epi16(a, b);
3562        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3563    }
3564
3565    #[simd_test(enable = "sse2")]
3566    const fn test_mm_mul_epu32() {
3567        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3568        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3569        let r = _mm_mul_epu32(a, b);
3570        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3571        assert_eq_m128i(r, e);
3572    }
3573
3574    #[simd_test(enable = "sse2")]
3575    fn test_mm_sad_epu8() {
3576        #[rustfmt::skip]
3577        let a = _mm_setr_epi8(
3578            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3579            1, 2, 3, 4,
3580            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3581            1, 2, 3, 4,
3582        );
3583        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3584        let r = _mm_sad_epu8(a, b);
3585        let e = _mm_setr_epi64x(1020, 614);
3586        assert_eq_m128i(r, e);
3587    }
3588
3589    #[simd_test(enable = "sse2")]
3590    const fn test_mm_sub_epi8() {
3591        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3592        let r = _mm_sub_epi8(a, b);
3593        assert_eq_m128i(r, _mm_set1_epi8(-1));
3594    }
3595
3596    #[simd_test(enable = "sse2")]
3597    const fn test_mm_sub_epi16() {
3598        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3599        let r = _mm_sub_epi16(a, b);
3600        assert_eq_m128i(r, _mm_set1_epi16(-1));
3601    }
3602
3603    #[simd_test(enable = "sse2")]
3604    const fn test_mm_sub_epi32() {
3605        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3606        let r = _mm_sub_epi32(a, b);
3607        assert_eq_m128i(r, _mm_set1_epi32(-1));
3608    }
3609
3610    #[simd_test(enable = "sse2")]
3611    const fn test_mm_sub_epi64() {
3612        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3613        let r = _mm_sub_epi64(a, b);
3614        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3615    }
3616
3617    #[simd_test(enable = "sse2")]
3618    const fn test_mm_subs_epi8() {
3619        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3620        let r = _mm_subs_epi8(a, b);
3621        assert_eq_m128i(r, _mm_set1_epi8(3));
3622    }
3623
3624    #[simd_test(enable = "sse2")]
3625    fn test_mm_subs_epi8_saturate_positive() {
3626        let a = _mm_set1_epi8(0x7F);
3627        let b = _mm_set1_epi8(-1);
3628        let r = _mm_subs_epi8(a, b);
3629        assert_eq_m128i(r, a);
3630    }
3631
3632    #[simd_test(enable = "sse2")]
3633    fn test_mm_subs_epi8_saturate_negative() {
3634        let a = _mm_set1_epi8(-0x80);
3635        let b = _mm_set1_epi8(1);
3636        let r = _mm_subs_epi8(a, b);
3637        assert_eq_m128i(r, a);
3638    }
3639
3640    #[simd_test(enable = "sse2")]
3641    const fn test_mm_subs_epi16() {
3642        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3643        let r = _mm_subs_epi16(a, b);
3644        assert_eq_m128i(r, _mm_set1_epi16(3));
3645    }
3646
3647    #[simd_test(enable = "sse2")]
3648    fn test_mm_subs_epi16_saturate_positive() {
3649        let a = _mm_set1_epi16(0x7FFF);
3650        let b = _mm_set1_epi16(-1);
3651        let r = _mm_subs_epi16(a, b);
3652        assert_eq_m128i(r, a);
3653    }
3654
3655    #[simd_test(enable = "sse2")]
3656    fn test_mm_subs_epi16_saturate_negative() {
3657        let a = _mm_set1_epi16(-0x8000);
3658        let b = _mm_set1_epi16(1);
3659        let r = _mm_subs_epi16(a, b);
3660        assert_eq_m128i(r, a);
3661    }
3662
3663    #[simd_test(enable = "sse2")]
3664    const fn test_mm_subs_epu8() {
3665        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3666        let r = _mm_subs_epu8(a, b);
3667        assert_eq_m128i(r, _mm_set1_epi8(3));
3668    }
3669
3670    #[simd_test(enable = "sse2")]
3671    fn test_mm_subs_epu8_saturate() {
3672        let a = _mm_set1_epi8(0);
3673        let b = _mm_set1_epi8(1);
3674        let r = _mm_subs_epu8(a, b);
3675        assert_eq_m128i(r, a);
3676    }
3677
3678    #[simd_test(enable = "sse2")]
3679    const fn test_mm_subs_epu16() {
3680        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3681        let r = _mm_subs_epu16(a, b);
3682        assert_eq_m128i(r, _mm_set1_epi16(3));
3683    }
3684
3685    #[simd_test(enable = "sse2")]
3686    fn test_mm_subs_epu16_saturate() {
3687        let a = _mm_set1_epi16(0);
3688        let b = _mm_set1_epi16(1);
3689        let r = _mm_subs_epu16(a, b);
3690        assert_eq_m128i(r, a);
3691    }
3692
3693    #[simd_test(enable = "sse2")]
3694    const fn test_mm_slli_si128() {
3695        #[rustfmt::skip]
3696        let a = _mm_setr_epi8(
3697            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3698        );
3699        let r = _mm_slli_si128::<1>(a);
3700        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3701        assert_eq_m128i(r, e);
3702
3703        #[rustfmt::skip]
3704        let a = _mm_setr_epi8(
3705            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3706        );
3707        let r = _mm_slli_si128::<15>(a);
3708        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3709        assert_eq_m128i(r, e);
3710
3711        #[rustfmt::skip]
3712        let a = _mm_setr_epi8(
3713            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3714        );
3715        let r = _mm_slli_si128::<16>(a);
3716        assert_eq_m128i(r, _mm_set1_epi8(0));
3717    }
3718
3719    #[simd_test(enable = "sse2")]
3720    const fn test_mm_slli_epi16() {
3721        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3722        let r = _mm_slli_epi16::<4>(a);
3723        assert_eq_m128i(
3724            r,
3725            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3726        );
3727        let r = _mm_slli_epi16::<16>(a);
3728        assert_eq_m128i(r, _mm_set1_epi16(0));
3729    }
3730
3731    #[simd_test(enable = "sse2")]
3732    unsafe fn test_mm_sll_epi16() {
3733        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3734        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3735        assert_eq_m128i(
3736            r,
3737            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3738        );
3739        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3740        assert_eq_m128i(r, a);
3741        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3742        assert_eq_m128i(r, _mm_set1_epi16(0));
3743        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3744        assert_eq_m128i(r, _mm_set1_epi16(0));
3745    }
3746
3747    #[simd_test(enable = "sse2")]
3748    const fn test_mm_slli_epi32() {
3749        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3750        let r = _mm_slli_epi32::<4>(a);
3751        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3752        let r = _mm_slli_epi32::<32>(a);
3753        assert_eq_m128i(r, _mm_set1_epi32(0));
3754    }
3755
3756    #[simd_test(enable = "sse2")]
3757    fn test_mm_sll_epi32() {
3758        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3759        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3760        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3761        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3762        assert_eq_m128i(r, a);
3763        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3764        assert_eq_m128i(r, _mm_set1_epi32(0));
3765        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3766        assert_eq_m128i(r, _mm_set1_epi32(0));
3767    }
3768
3769    #[simd_test(enable = "sse2")]
3770    const fn test_mm_slli_epi64() {
3771        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3772        let r = _mm_slli_epi64::<4>(a);
3773        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3774        let r = _mm_slli_epi64::<64>(a);
3775        assert_eq_m128i(r, _mm_set1_epi64x(0));
3776    }
3777
3778    #[simd_test(enable = "sse2")]
3779    fn test_mm_sll_epi64() {
3780        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3781        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3782        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3783        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3784        assert_eq_m128i(r, a);
3785        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3786        assert_eq_m128i(r, _mm_set1_epi64x(0));
3787        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3788        assert_eq_m128i(r, _mm_set1_epi64x(0));
3789    }
3790
3791    #[simd_test(enable = "sse2")]
3792    const fn test_mm_srai_epi16() {
3793        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3794        let r = _mm_srai_epi16::<4>(a);
3795        assert_eq_m128i(
3796            r,
3797            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3798        );
3799        let r = _mm_srai_epi16::<16>(a);
3800        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3801    }
3802
3803    #[simd_test(enable = "sse2")]
3804    fn test_mm_sra_epi16() {
3805        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3806        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3807        assert_eq_m128i(
3808            r,
3809            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3810        );
3811        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3812        assert_eq_m128i(r, a);
3813        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3814        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3815        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3816        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3817    }
3818
3819    #[simd_test(enable = "sse2")]
3820    const fn test_mm_srai_epi32() {
3821        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3822        let r = _mm_srai_epi32::<4>(a);
3823        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3824        let r = _mm_srai_epi32::<32>(a);
3825        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3826    }
3827
3828    #[simd_test(enable = "sse2")]
3829    fn test_mm_sra_epi32() {
3830        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3831        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3832        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3833        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3834        assert_eq_m128i(r, a);
3835        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3836        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3837        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3838        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3839    }
3840
3841    #[simd_test(enable = "sse2")]
3842    const fn test_mm_srli_si128() {
3843        #[rustfmt::skip]
3844        let a = _mm_setr_epi8(
3845            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3846        );
3847        let r = _mm_srli_si128::<1>(a);
3848        #[rustfmt::skip]
3849        let e = _mm_setr_epi8(
3850            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3851        );
3852        assert_eq_m128i(r, e);
3853
3854        #[rustfmt::skip]
3855        let a = _mm_setr_epi8(
3856            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3857        );
3858        let r = _mm_srli_si128::<15>(a);
3859        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3860        assert_eq_m128i(r, e);
3861
3862        #[rustfmt::skip]
3863        let a = _mm_setr_epi8(
3864            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3865        );
3866        let r = _mm_srli_si128::<16>(a);
3867        assert_eq_m128i(r, _mm_set1_epi8(0));
3868    }
3869
3870    #[simd_test(enable = "sse2")]
3871    const fn test_mm_srli_epi16() {
3872        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3873        let r = _mm_srli_epi16::<4>(a);
3874        assert_eq_m128i(
3875            r,
3876            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3877        );
3878        let r = _mm_srli_epi16::<16>(a);
3879        assert_eq_m128i(r, _mm_set1_epi16(0));
3880    }
3881
3882    #[simd_test(enable = "sse2")]
3883    fn test_mm_srl_epi16() {
3884        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3885        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3886        assert_eq_m128i(
3887            r,
3888            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3889        );
3890        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3891        assert_eq_m128i(r, a);
3892        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3893        assert_eq_m128i(r, _mm_set1_epi16(0));
3894        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3895        assert_eq_m128i(r, _mm_set1_epi16(0));
3896    }
3897
3898    #[simd_test(enable = "sse2")]
3899    const fn test_mm_srli_epi32() {
3900        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3901        let r = _mm_srli_epi32::<4>(a);
3902        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3903        let r = _mm_srli_epi32::<32>(a);
3904        assert_eq_m128i(r, _mm_set1_epi32(0));
3905    }
3906
3907    #[simd_test(enable = "sse2")]
3908    fn test_mm_srl_epi32() {
3909        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3910        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3911        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3912        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3913        assert_eq_m128i(r, a);
3914        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3915        assert_eq_m128i(r, _mm_set1_epi32(0));
3916        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3917        assert_eq_m128i(r, _mm_set1_epi32(0));
3918    }
3919
3920    #[simd_test(enable = "sse2")]
3921    const fn test_mm_srli_epi64() {
3922        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3923        let r = _mm_srli_epi64::<4>(a);
3924        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3925        let r = _mm_srli_epi64::<64>(a);
3926        assert_eq_m128i(r, _mm_set1_epi64x(0));
3927    }
3928
    #[simd_test(enable = "sse2")]
    fn test_mm_srl_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_and_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_and_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_andnot_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(2));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_or_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_or_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(7));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_xor_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            )
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi16(a, b);
        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(3, 2, 2, 0);
        let r = _mm_cmpeq_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi8() {
        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi8(0);
        let r = _mm_cmpgt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi16() {
        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi16(0);
        let r = _mm_cmpgt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi32() {
        let a = _mm_set_epi32(5, 0, 0, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_cmpgt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi8() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi32() {
        let a = _mm_set1_epi32(0);
        let b = _mm_set_epi32(5, 0, 0, 0);
        let r = _mm_cmplt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

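    // `_mm_cvtepi32_pd` converts only the two low 32-bit lanes (5 and 15
    // here) to `f64`; the upper two lanes of the source are ignored.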
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_pd() {
        let a = _mm_set_epi32(35, 25, 15, 5);
        let r = _mm_cvtepi32_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_sd() {
        let a = _mm_set1_pd(3.5);
        let r = _mm_cvtsi32_sd(a, 5);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_ps() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ps(a);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_epi32() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_si128() {
        let r = _mm_cvtsi32_si128(5);
        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi128_si32() {
        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
        assert_eq!(r, 5);
    }

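    // The `_mm_set_*` constructors take their arguments highest lane first,
    // while `_mm_setr_*` take them lowest lane first, hence the reversed
    // argument orders in the asserts below.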
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi64x() {
        let r = _mm_set_epi64x(0, 1);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi32() {
        let r = _mm_set_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi16() {
        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi8() {
        #[rustfmt::skip]
        let r = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi64x() {
        let r = _mm_set1_epi64x(1);
        assert_eq_m128i(r, _mm_set1_epi64x(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi32() {
        let r = _mm_set1_epi32(1);
        assert_eq_m128i(r, _mm_set1_epi32(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi16() {
        let r = _mm_set1_epi16(1);
        assert_eq_m128i(r, _mm_set1_epi16(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi8() {
        let r = _mm_set1_epi8(1);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi32() {
        let r = _mm_setr_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi16() {
        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_si128() {
        let r = _mm_setzero_si128();
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadl_epi64() {
        let a = _mm_setr_epi64x(6, 5);
        let r = _mm_loadl_epi64(ptr::addr_of!(a));
        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }

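    // `_mm_maskmoveu_si128` stores only the bytes whose corresponding mask
    // byte has its most significant bit set (the single `0x80` lane below).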
    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_maskmoveu_si128() {
        let a = _mm_set1_epi8(9);
        #[rustfmt::skip]
        let mask = _mm_set_epi8(
            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
        );
        let mut r = _mm_set1_epi8(0);
        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
        _mm_sfence();
        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_store_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        _mm_storeu_si128(&mut r, a);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storel_epi64() {
        let a = _mm_setr_epi64x(2, 9);
        let mut r = _mm_set1_epi8(0);
        _mm_storel_epi64(&mut r, a);
        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si128() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_undefined_si128();
        _mm_stream_si128(ptr::addr_of_mut!(r), a);
        _mm_sfence();
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_si32() {
        let a: i32 = 7;
        let mut mem = boxed::Box::<i32>::new(-1);
        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
        _mm_sfence();
        assert_eq!(a, *mem);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_epi64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_move_epi64(a);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

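    // `_mm_packs_epi16` narrows 16-bit lanes to 8 bits with signed
    // saturation: 0x80 clamps to i8::MAX and -0x81 clamps to i8::MIN.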
    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi16() {
        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
        let r = _mm_packs_epi16(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
            )
        );
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi32() {
        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
        let r = _mm_packs_epi32(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
        );
    }

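    // `_mm_packus_epi16` narrows with unsigned saturation: 0x100 clamps to
    // 255 and -1 clamps to 0.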
    #[simd_test(enable = "sse2")]
    fn test_mm_packus_epi16() {
        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
        let r = _mm_packus_epi16(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
        );
    }

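    // `_mm_extract_epi16` zero-extends the selected 16-bit lane into the
    // `i32` result, so the -1 in lane 0 reads back as 0xFFFF.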
    #[simd_test(enable = "sse2")]
    const fn test_mm_extract_epi16() {
        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm_extract_epi16::<0>(a);
        let r2 = _mm_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_insert_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm_insert_epi16::<0>(a, 9);
        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

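    // `_mm_movemask_epi8` gathers the most significant bit of each of the 16
    // bytes into the corresponding bit of the integer result.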
    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100110_00100101);
    }

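    // The shuffle immediate packs four 2-bit source-lane indices, lowest
    // result lane first: 0b00_01_01_11 selects lanes 3, 1, 1, 0 of `a`.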
    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_epi32() {
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_max_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_max_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_min_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_min_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_and_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_and_pd(a, b);
        let e = transmute(u64x2::splat(1));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_andnot_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_andnot_pd(a, b);
        let e = transmute(u64x2::splat(2));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_or_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_or_pd(a, b);
        let e = transmute(u64x2::splat(7));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_xor_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_xor_pd(a, b);
        let e = transmute(u64x2::splat(6));
        assert_eq_m128d(r, e);
    }

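    // The scalar `_mm_cmp*_sd` comparisons write an all-ones or all-zeros
    // mask to the low lane and copy the high lane of `a` through unchanged,
    // so these tests compare exact bit patterns via `_mm_castpd_si128`.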
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

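    // `_mm_comi*_sd` return the comparison outcome as an integer 0 or 1; a
    // NaN operand makes the equality checks below report 0 (unordered).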
    #[simd_test(enable = "sse2")]
    fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

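    // `_mm_movemask_pd` packs the sign bits of the two `f64` lanes into bits
    // 0 and 1 of the result.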
    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }

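    // `_mm_loadh_pd` replaces only the upper lane with the value loaded from
    // memory; `_mm_loadl_pd` (next test) replaces only the lower lane.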
    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        // so p is *not* aligned to 16 bytes
        let p = vals.as_mut_ptr().offset(1);
        _mm_storeu_pd(p, *black_box(&a));

        assert_eq!(*vals, [0.0, 1.0, 2.0, 0.0]);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;

        // so this will *not* be aligned to 16 bytes
        let d = vals.as_ptr().offset(1);

        let r = _mm_loadu_pd(d);
        let e = _mm_setr_pd(2.0, 3.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

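    // Conversions to i32 that overflow, or that start from NaN, return the
    // "integer indefinite" value i32::MIN (0x8000_0000).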
    #[simd_test(enable = "sse2")]
    fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

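    // `_mm_cvtsd_ss` rounds the low `f64` of `b` to `f32` and writes it to
    // lane 0, copying the three upper `f32` lanes from `a`.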
    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

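    // The `_mm_cvtt*` variants truncate toward zero instead of using the
    // current rounding mode; out-of-range and NaN inputs still yield
    // i32::MIN.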
    #[simd_test(enable = "sse2")]
    fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    const unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

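    // For `_mm_shuffle_pd`, immediate bit 0 selects the low result lane from
    // `a` and bit 1 selects the high result lane from `b`, so `0b00` yields
    // `(a[0], b[0])`.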
    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
}