
core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    unsafe { pause() }
}
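
// Illustrative sketch (a hypothetical helper, not part of the stdarch API):
// shows where `_mm_pause` belongs in a spin-wait loop. The `locked` flag and
// loop structure are assumptions for the example.
#[cfg(test)]
fn _spin_wait_example(locked: &core::sync::atomic::AtomicBool) {
    use core::sync::atomic::Ordering;
    while locked.load(Ordering::Acquire) {
        // Hint to the CPU that we are busy-waiting; this reduces power use
        // and lets a sibling hyper-thread make progress.
        _mm_pause();
    }
}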

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, the
/// load fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_lfence() {
    unsafe { lfence() }
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mfence() {
    unsafe { mfence() }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}
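
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): contrasts saturating and wrapping addition on packed u8 lanes.
// Assumes SSE2, which is always available on x86_64.
#[cfg(test)]
fn _saturating_add_example() {
    unsafe {
        let a = _mm_set1_epi8(-56i8); // bit pattern of 200u8 in every lane
        let b = _mm_set1_epi8(100);
        let sat = _mm_adds_epu8(a, b); // 200 + 100 clamps to 255 in every lane
        let wrap = _mm_add_epi8(a, b); // 200 + 100 wraps to 44 in every lane
        assert_eq!(_mm_cvtsi128_si32(sat) as u32 & 0xff, 255);
        assert_eq!(_mm_cvtsi128_si32(wrap) as u32 & 0xff, 44);
    }
}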

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}
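
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): shows that `_mm_avg_epu8` computes the *rounded* average
// `(a + b + 1) >> 1`, matching the widening arithmetic above.
#[cfg(test)]
fn _avg_rounding_example() {
    unsafe {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(2);
        // (1 + 2 + 1) >> 1 == 2: the average rounds up, not down.
        let avg = _mm_avg_epu8(a, b);
        assert_eq!(_mm_cvtsi128_si32(avg) as u32 & 0xff, 2);
    }
}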

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    // This is the trick used in the Adler-32 algorithm to perform a widening
    // addition:
    //
    // ```rust
    // #[target_feature(enable = "sse2")]
    // unsafe fn widening_add(mad: __m128i) -> __m128i {
    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
    // }
    // ```
    //
    // If we implemented this using generic vector intrinsics, the optimizer
    // would eliminate that pattern, and `pmaddwd` would no longer be emitted.
    // For this reason, we use x86 intrinsics.
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}
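
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): the widening-add trick from the comment above, written out as a
// runnable check.
#[cfg(test)]
fn _madd_widening_add_example() {
    unsafe {
        // Multiplying by 1 leaves each 16-bit lane unchanged; the horizontal
        // add then combines adjacent lanes into 32-bit sums.
        let v = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        let sums = _mm_madd_epi16(v, _mm_set1_epi16(1));
        assert_eq!(_mm_cvtsi128_si32(sums), 1 + 2); // lowest 32-bit lane
    }
}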

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imax(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imax(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imin(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_imin(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
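
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): `_mm_mul_epu32` as a 32 x 32 -> 64 widening multiply of the
// even-indexed 32-bit lanes, with no truncation of the product.
#[cfg(test)]
fn _mul_epu32_example() {
    unsafe {
        let a = _mm_set_epi32(0, 7, 0, 0x8000_0000u32 as i32);
        let b = _mm_set_epi32(0, 9, 0, 4);
        let r = _mm_mul_epu32(a, b); // lanes 0 and 2 multiplied as u32
        let mut out = [0u64; 2];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(out, [0x8000_0000u64 * 4, 63]); // full 64-bit products
    }
}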

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive group of 8 differences to
/// produce two unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers into the low 16 bits of the two 64-bit elements returned.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}
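
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): `psadbw` as the core of a sum-of-absolute-differences kernel, e.g.
// for block matching in video encoders.
#[cfg(test)]
fn _sad_example() {
    unsafe {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(2);
        // |5 - 2| summed over each group of 8 bytes = 24 per 64-bit half.
        let sad = _mm_sad_epu8(a, b);
        let mut out = [0u64; 2];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, sad);
        assert_eq!(out, [24, 24]);
    }
}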

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}
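
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): a whole-register byte shift, useful for sliding a window over byte
// data one element at a time.
#[cfg(test)]
fn _byte_shift_example() {
    unsafe {
        let v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        // Shift left by one byte: lane i now holds the old lane i - 1, and
        // lane 0 is zero-filled.
        let shifted = _mm_slli_si128::<1>(v);
        assert_eq!(_mm_cvtsi128_si32(shifted) as u32, 0x02_01_00_00);
    }
}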

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}
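
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): arithmetic right shift preserves the sign, logical right shift does
// not.
#[cfg(test)]
fn _shift_sign_example() {
    unsafe {
        let v = _mm_set1_epi16(-16);
        let arith = _mm_srai_epi16::<2>(v); // -16 >> 2 == -4 (sign bits shifted in)
        let logic = _mm_srli_epi16::<2>(v); // 0xfff0 >> 2 == 0x3ffc (zeros shifted in)
        assert_eq!(_mm_cvtsi128_si32(arith) as i16, -4);
        assert_eq!(_mm_cvtsi128_si32(logic) as u16, 0x3ffc);
    }
}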

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}
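
// Illustrative sketch (a hypothetical helper, not part of the stdarch API):
// the classic branchless select, `(mask & b) | (!mask & a)`, built from the
// bitwise operations in this module.
#[cfg(test)]
fn _select_example(mask: __m128i, a: __m128i, b: __m128i) -> __m128i {
    // For each bit: take `b` where `mask` is set, `a` where it is clear.
    unsafe { _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) }
}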

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}
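
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): comparisons produce all-ones/all-zeros lanes, which pair naturally
// with `_mm_movemask_epi8` (defined elsewhere in this module) to get a scalar
// bitmask.
#[cfg(test)]
fn _compare_mask_example() {
    unsafe {
        let a = _mm_setr_epi32(5, -1, 7, 0);
        let gt = _mm_cmpgt_epi32(a, _mm_setzero_si128());
        // Lanes 0 and 2 are greater than zero: bytes 0-3 and 8-11 are set.
        assert_eq!(_mm_movemask_epi8(gt), 0b0000_1111_0000_1111);
    }
}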

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}
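
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): moving a scalar into lane 0 and back is lossless, and the upper lanes
// are zeroed.
#[cfg(test)]
fn _scalar_roundtrip_example() {
    unsafe {
        let v = _mm_cvtsi32_si128(-123);
        assert_eq!(_mm_cvtsi128_si32(v), -123);
    }
}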

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi64x(a: i64) -> __m128i {
    i64x2::splat(a).as_m128i()
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi32(a: i32) -> __m128i {
    i32x4::splat(a).as_m128i()
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi16(a: i16) -> __m128i {
    i16x8::splat(a).as_m128i()
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi8(a: i8) -> __m128i {
    i8x16::splat(a).as_m128i()
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}
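
// Illustrative sketch (a hypothetical test helper, not part of the stdarch
// API): `_mm_set_*` takes arguments from the highest lane down, `_mm_setr_*`
// from the lowest lane up, so the two calls below build the same vector.
#[cfg(test)]
fn _set_vs_setr_example() {
    unsafe {
        let a = _mm_set_epi32(3, 2, 1, 0);
        let b = _mm_setr_epi32(0, 1, 2, 3);
        let eq = _mm_cmpeq_epi32(a, b);
        // `_mm_movemask_epi8` (defined elsewhere in this module) reports that
        // every byte matches.
        assert_eq!(_mm_movemask_epi8(eq), 0xffff);
    }
}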
1229
1230/// Sets packed 16-bit integers with the supplied values in reverse order.
1231///
1232/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
1233#[inline]
1234#[target_feature(enable = "sse2")]
1235// no particular instruction to test
1236#[stable(feature = "simd_x86", since = "1.27.0")]
1237#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1238pub const fn _mm_setr_epi16(
1239    e7: i16,
1240    e6: i16,
1241    e5: i16,
1242    e4: i16,
1243    e3: i16,
1244    e2: i16,
1245    e1: i16,
1246    e0: i16,
1247) -> __m128i {
1248    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
1249}
1250
1251/// Sets packed 8-bit integers with the supplied values in reverse order.
1252///
1253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
1254#[inline]
1255#[target_feature(enable = "sse2")]
1256// no particular instruction to test
1257#[stable(feature = "simd_x86", since = "1.27.0")]
1258#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1259pub const fn _mm_setr_epi8(
1260    e15: i8,
1261    e14: i8,
1262    e13: i8,
1263    e12: i8,
1264    e11: i8,
1265    e10: i8,
1266    e9: i8,
1267    e8: i8,
1268    e7: i8,
1269    e6: i8,
1270    e5: i8,
1271    e4: i8,
1272    e3: i8,
1273    e2: i8,
1274    e1: i8,
1275    e0: i8,
1276) -> __m128i {
1277    #[rustfmt::skip]
1278    _mm_set_epi8(
1279        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1280    )
1281}
1282
1283/// Returns a vector with all elements set to zero.
1284///
1285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
1286#[inline]
1287#[target_feature(enable = "sse2")]
1288#[cfg_attr(test, assert_instr(xorps))]
1289#[stable(feature = "simd_x86", since = "1.27.0")]
1290#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1291pub const fn _mm_setzero_si128() -> __m128i {
1292    const { unsafe { mem::zeroed() } }
1293}
1294
1295/// Loads 64-bit integer from memory into first element of returned vector.
1296///
1297/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
1298#[inline]
1299#[target_feature(enable = "sse2")]
1300#[stable(feature = "simd_x86", since = "1.27.0")]
1301#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1302pub const unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
1303    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
1304}
1305
1306/// Loads 128-bits of integer data from memory into a new vector.
1307///
1308/// `mem_addr` must be aligned on a 16-byte boundary.
1309///
1310/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
1311#[inline]
1312#[target_feature(enable = "sse2")]
1313#[cfg_attr(
1314    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1315    assert_instr(movaps)
1316)]
1317#[stable(feature = "simd_x86", since = "1.27.0")]
1318#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1319pub const unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
1320    *mem_addr
1321}
1322
1323/// Loads 128-bits of integer data from memory into a new vector.
1324///
1325/// `mem_addr` does not need to be aligned on any particular boundary.
1326///
1327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
1328#[inline]
1329#[target_feature(enable = "sse2")]
1330#[cfg_attr(test, assert_instr(movups))]
1331#[stable(feature = "simd_x86", since = "1.27.0")]
1332#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1333pub const unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
1334    let mut dst: __m128i = _mm_undefined_si128();
1335    ptr::copy_nonoverlapping(
1336        mem_addr as *const u8,
1337        ptr::addr_of_mut!(dst) as *mut u8,
1338        mem::size_of::<__m128i>(),
1339    );
1340    dst
1341}
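
// Illustrative sketch (hypothetical helper, not part of the original
// source): an unaligned load is the right tool for pulling a vector out of
// an arbitrary byte buffer, where the 16-byte-aligned `_mm_load_si128`
// would be undefined behaviour.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_loadu_from_bytes(buf: &[u8; 16]) -> __m128i {
    // No alignment requirement: any readable 16 bytes work.
    unsafe { _mm_loadu_si128(buf.as_ptr() as *const __m128i) }
}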
1342
1343/// Conditionally stores 8-bit integer elements from `a` into memory using
1344/// `mask`; the store is flagged as non-temporal (unlikely to be reused soon).
1345///
1346/// An element is not stored when the highest bit of the corresponding `mask`
1347/// element is not set.
1348///
1349/// `mem_addr` should correspond to a 128-bit memory location and does not need
1350/// to be aligned on any particular boundary.
1351///
1352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
1353///
1354/// # Safety of non-temporal stores
1355///
1356/// After using this intrinsic, but before any other access to the memory that this intrinsic
1357/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1358/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1359/// return.
1360///
1361/// See [`_mm_sfence`] for details.
1362#[inline]
1363#[target_feature(enable = "sse2")]
1364#[cfg_attr(test, assert_instr(maskmovdqu))]
1365#[stable(feature = "simd_x86", since = "1.27.0")]
1366pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
1367    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
1368}
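
// Illustrative sketch of the masked-store protocol described above
// (hypothetical helper, not part of the original source): only lanes whose
// mask byte has its high bit set are written, and the non-temporal store is
// fenced before the function returns, as the safety section requires.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_masked_store(dst: &mut [i8; 16]) {
    let data = _mm_set1_epi8(7);
    // High bit set in even lanes only, so only even bytes of `dst` change.
    let mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    unsafe { _mm_maskmoveu_si128(data, mask, dst.as_mut_ptr()) };
    // Required by the safety contract of non-temporal stores.
    _mm_sfence();
}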
1369
1370/// Stores 128-bits of integer data from `a` into memory.
1371///
1372/// `mem_addr` must be aligned on a 16-byte boundary.
1373///
1374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
1375#[inline]
1376#[target_feature(enable = "sse2")]
1377#[cfg_attr(
1378    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1379    assert_instr(movaps)
1380)]
1381#[stable(feature = "simd_x86", since = "1.27.0")]
1382#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1383pub const unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
1384    *mem_addr = a;
1385}
1386
1387/// Stores 128-bits of integer data from `a` into memory.
1388///
1389/// `mem_addr` does not need to be aligned on any particular boundary.
1390///
1391/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
1392#[inline]
1393#[target_feature(enable = "sse2")]
1394#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1397pub const unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
1398    mem_addr.write_unaligned(a);
1399}
1400
1401/// Stores the lower 64-bit integer of `a` to a memory location.
1402///
1403/// `mem_addr` does not need to be aligned on any particular boundary.
1404///
1405/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
1406#[inline]
1407#[target_feature(enable = "sse2")]
1408#[stable(feature = "simd_x86", since = "1.27.0")]
1409#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1410pub const unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
1411    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
1412}
1413
1414/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
1415/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1416/// used again soon).
1417///
1418/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
1419///
1420/// # Safety of non-temporal stores
1421///
1422/// After using this intrinsic, but before any other access to the memory that this intrinsic
1423/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1424/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1425/// return.
1426///
1427/// See [`_mm_sfence`] for details.
1428#[inline]
1429#[target_feature(enable = "sse2")]
1430#[cfg_attr(test, assert_instr(movntdq))]
1431#[stable(feature = "simd_x86", since = "1.27.0")]
1432pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1433    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1434    crate::arch::asm!(
1435        vps!("movntdq", ",{a}"),
1436        p = in(reg) mem_addr,
1437        a = in(xmm_reg) a,
1438        options(nostack, preserves_flags),
1439    );
1440}
1441
1442/// Stores a 32-bit integer value in the specified memory location.
1443/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1444/// used again soon).
1445///
1446/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1447///
1448/// # Safety of non-temporal stores
1449///
1450/// After using this intrinsic, but before any other access to the memory that this intrinsic
1451/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1452/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1453/// return.
1454///
1455/// See [`_mm_sfence`] for details.
1456#[inline]
1457#[target_feature(enable = "sse2")]
1458#[cfg_attr(test, assert_instr(movnti))]
1459#[stable(feature = "simd_x86", since = "1.27.0")]
1460pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1461    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1462    crate::arch::asm!(
1463        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1464        p = in(reg) mem_addr,
1465        a = in(reg) a,
1466        options(nostack, preserves_flags),
1467    );
1468}
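
// A short sketch of the usage pattern mandated above (illustrative only,
// not part of the original source): a run of non-temporal stores followed
// by a single `_mm_sfence` before the data is handed to anyone else.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_stream_then_fence(dst: &mut [i32]) {
    for slot in dst.iter_mut() {
        // Bypasses the cache; the write may sit in a write-combining
        // buffer until the fence below drains it.
        unsafe { _mm_stream_si32(slot, 0) };
    }
    // Makes the non-temporal writes globally visible before returning.
    _mm_sfence();
}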
1469
1470/// Returns a vector where the low element is extracted from `a` and its upper
1471/// element is zero.
1472///
1473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1474#[inline]
1475#[target_feature(enable = "sse2")]
1476// FIXME movd on msvc, movd on i686
1477#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1478#[stable(feature = "simd_x86", since = "1.27.0")]
1479#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1480pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
1481    unsafe {
1482        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1483        transmute(r)
1484    }
1485}
1486
1487/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1488/// using signed saturation.
1489///
1490/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
1491#[inline]
1492#[target_feature(enable = "sse2")]
1493#[cfg_attr(test, assert_instr(packsswb))]
1494#[stable(feature = "simd_x86", since = "1.27.0")]
1495pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1496    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
1497}
1498
1499/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1500/// using signed saturation.
1501///
1502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1503#[inline]
1504#[target_feature(enable = "sse2")]
1505#[cfg_attr(test, assert_instr(packssdw))]
1506#[stable(feature = "simd_x86", since = "1.27.0")]
1507pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1508    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1509}
1510
1511/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1512/// using unsigned saturation.
1513///
1514/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1515#[inline]
1516#[target_feature(enable = "sse2")]
1517#[cfg_attr(test, assert_instr(packuswb))]
1518#[stable(feature = "simd_x86", since = "1.27.0")]
1519pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1520    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1521}
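
// Worked example of the saturating behaviour (illustrative, not part of
// the original source): out-of-range values clamp instead of wrapping, and
// `packus` additionally clamps every negative input to zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_pack_saturation() {
    let a = _mm_setr_epi16(300, -300, 127, -128, 0, 1, -1, 42);
    // Signed saturation to i8: 300 -> 127, -300 -> -128.
    let signed = _mm_packs_epi16(a, a);
    // Unsigned saturation to u8: 300 -> 255, negatives -> 0.
    let unsigned = _mm_packus_epi16(a, a);
    let (mut s, mut u) = ([0i8; 16], [0u8; 16]);
    unsafe {
        _mm_storeu_si128(s.as_mut_ptr() as *mut __m128i, signed);
        _mm_storeu_si128(u.as_mut_ptr() as *mut __m128i, unsigned);
    }
    assert_eq!(s[..4], [127, -128, 127, -128]);
    assert_eq!(u[..4], [255, 0, 127, 0]);
}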
1522
1523/// Returns the `imm8` element of `a`.
1524///
1525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
1526#[inline]
1527#[target_feature(enable = "sse2")]
1528#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1529#[rustc_legacy_const_generics(1)]
1530#[stable(feature = "simd_x86", since = "1.27.0")]
1531#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1532pub const fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1533    static_assert_uimm_bits!(IMM8, 3);
1534    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1535}
1536
1537/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
1538///
1539/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1540#[inline]
1541#[target_feature(enable = "sse2")]
1542#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1543#[rustc_legacy_const_generics(2)]
1544#[stable(feature = "simd_x86", since = "1.27.0")]
1545#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1546pub const fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1547    static_assert_uimm_bits!(IMM8, 3);
1548    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1549}
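
// Illustrative round-trip (not part of the original source): the lane
// index is a const generic, so it must be a compile-time constant in
// `0..8`; note that the extracted lane is zero-extended from `u16`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_extract_insert() {
    let v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    // Lane 3 holds 13.
    assert_eq!(_mm_extract_epi16::<3>(v), 13);
    // Replace lane 3 with -1 and read it back: it comes out as 0xFFFF
    // because extraction reinterprets the lane as u16.
    let w = _mm_insert_epi16::<3>(v, -1);
    assert_eq!(_mm_extract_epi16::<3>(w), 0xFFFF);
}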
1550
1551/// Returns a mask of the most significant bit of each element in `a`.
1552///
1553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
1554#[inline]
1555#[target_feature(enable = "sse2")]
1556#[cfg_attr(test, assert_instr(pmovmskb))]
1557#[stable(feature = "simd_x86", since = "1.27.0")]
1558#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1559pub const fn _mm_movemask_epi8(a: __m128i) -> i32 {
1560    unsafe {
1561        let z = i8x16::ZERO;
1562        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1563        simd_bitmask::<_, u16>(m) as u32 as i32
1564    }
1565}
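
// Small sketch (hypothetical helper, not part of the original source):
// pairing a byte-wise compare with `movemask` turns a vector predicate
// into a scalar bit mask, one bit per lane; this is the classic
// memchr-style idiom.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_find_byte(haystack: &[u8; 16], needle: u8) -> Option<u32> {
    let v = unsafe { _mm_loadu_si128(haystack.as_ptr() as *const __m128i) };
    let eq = _mm_cmpeq_epi8(v, _mm_set1_epi8(needle as i8));
    // Bit i of `m` is set iff haystack[i] == needle.
    let m = _mm_movemask_epi8(eq) as u32;
    if m == 0 { None } else { Some(m.trailing_zeros()) }
}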
1566
1567/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1568///
1569/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
1570#[inline]
1571#[target_feature(enable = "sse2")]
1572#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1573#[rustc_legacy_const_generics(1)]
1574#[stable(feature = "simd_x86", since = "1.27.0")]
1575#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1576pub const fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1577    static_assert_uimm_bits!(IMM8, 8);
1578    unsafe {
1579        let a = a.as_i32x4();
1580        let x: i32x4 = simd_shuffle!(
1581            a,
1582            a,
1583            [
1584                IMM8 as u32 & 0b11,
1585                (IMM8 as u32 >> 2) & 0b11,
1586                (IMM8 as u32 >> 4) & 0b11,
1587                (IMM8 as u32 >> 6) & 0b11,
1588            ],
1589        );
1590        transmute(x)
1591    }
1592}
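
// Worked example of the control encoding (illustrative, not part of the
// original source): each 2-bit field of `IMM8` picks the source lane for
// one destination lane, lowest field first, so `0b00_01_10_11` reverses
// the four lanes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_shuffle_reverse() {
    let v = _mm_setr_epi32(1, 2, 3, 4);
    // dst[0] = src[3], dst[1] = src[2], dst[2] = src[1], dst[3] = src[0].
    let r = _mm_shuffle_epi32::<0b00_01_10_11>(v);
    let mut out = [0i32; 4];
    unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r) };
    assert_eq!(out, [4, 3, 2, 1]);
}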
1593
1594/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1595/// `IMM8`.
1596///
1597/// Puts the results in the high 64 bits of the returned vector, with the low 64
1598/// bits being copied from `a`.
1599///
1600/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1601#[inline]
1602#[target_feature(enable = "sse2")]
1603#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1604#[rustc_legacy_const_generics(1)]
1605#[stable(feature = "simd_x86", since = "1.27.0")]
1606#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1607pub const fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1608    static_assert_uimm_bits!(IMM8, 8);
1609    unsafe {
1610        let a = a.as_i16x8();
1611        let x: i16x8 = simd_shuffle!(
1612            a,
1613            a,
1614            [
1615                0,
1616                1,
1617                2,
1618                3,
1619                (IMM8 as u32 & 0b11) + 4,
1620                ((IMM8 as u32 >> 2) & 0b11) + 4,
1621                ((IMM8 as u32 >> 4) & 0b11) + 4,
1622                ((IMM8 as u32 >> 6) & 0b11) + 4,
1623            ],
1624        );
1625        transmute(x)
1626    }
1627}
1628
1629/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1630/// `IMM8`.
1631///
1632/// Puts the results in the low 64 bits of the returned vector, with the high 64
1633/// bits being copied from `a`.
1634///
1635/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1636#[inline]
1637#[target_feature(enable = "sse2")]
1638#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1639#[rustc_legacy_const_generics(1)]
1640#[stable(feature = "simd_x86", since = "1.27.0")]
1641#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1642pub const fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1643    static_assert_uimm_bits!(IMM8, 8);
1644    unsafe {
1645        let a = a.as_i16x8();
1646        let x: i16x8 = simd_shuffle!(
1647            a,
1648            a,
1649            [
1650                IMM8 as u32 & 0b11,
1651                (IMM8 as u32 >> 2) & 0b11,
1652                (IMM8 as u32 >> 4) & 0b11,
1653                (IMM8 as u32 >> 6) & 0b11,
1654                4,
1655                5,
1656                6,
1657                7,
1658            ],
1659        );
1660        transmute(x)
1661    }
1662}
1663
1664/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
1665///
1666/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1667#[inline]
1668#[target_feature(enable = "sse2")]
1669#[cfg_attr(test, assert_instr(punpckhbw))]
1670#[stable(feature = "simd_x86", since = "1.27.0")]
1671#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1672pub const fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1673    unsafe {
1674        transmute::<i8x16, _>(simd_shuffle!(
1675            a.as_i8x16(),
1676            b.as_i8x16(),
1677            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1678        ))
1679    }
1680}
1681
1682/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
1683///
1684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1685#[inline]
1686#[target_feature(enable = "sse2")]
1687#[cfg_attr(test, assert_instr(punpckhwd))]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1690pub const fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1691    unsafe {
1692        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1693        transmute::<i16x8, _>(x)
1694    }
1695}
1696
1697/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
1698///
1699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1700#[inline]
1701#[target_feature(enable = "sse2")]
1702#[cfg_attr(test, assert_instr(unpckhps))]
1703#[stable(feature = "simd_x86", since = "1.27.0")]
1704#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1705pub const fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1706    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1707}
1708
1709/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
1710///
1711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1712#[inline]
1713#[target_feature(enable = "sse2")]
1714#[cfg_attr(test, assert_instr(unpckhpd))]
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1717pub const fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1718    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1719}
1720
1721/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
1722///
1723/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
1724#[inline]
1725#[target_feature(enable = "sse2")]
1726#[cfg_attr(test, assert_instr(punpcklbw))]
1727#[stable(feature = "simd_x86", since = "1.27.0")]
1728#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1729pub const fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1730    unsafe {
1731        transmute::<i8x16, _>(simd_shuffle!(
1732            a.as_i8x16(),
1733            b.as_i8x16(),
1734            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1735        ))
1736    }
1737}
1738
1739/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
1740///
1741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1742#[inline]
1743#[target_feature(enable = "sse2")]
1744#[cfg_attr(test, assert_instr(punpcklwd))]
1745#[stable(feature = "simd_x86", since = "1.27.0")]
1746#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1747pub const fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1748    unsafe {
1749        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1750        transmute::<i16x8, _>(x)
1751    }
1752}
1753
1754/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
1755///
1756/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1757#[inline]
1758#[target_feature(enable = "sse2")]
1759#[cfg_attr(test, assert_instr(unpcklps))]
1760#[stable(feature = "simd_x86", since = "1.27.0")]
1761#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1762pub const fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1763    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1764}
1765
1766/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
1767///
1768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1769#[inline]
1770#[target_feature(enable = "sse2")]
1771#[cfg_attr(test, assert_instr(movlhps))]
1772#[stable(feature = "simd_x86", since = "1.27.0")]
1773#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1774pub const fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1775    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1776}
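
// Illustrative sketch (not part of the original source): the unpack family
// zips lanes from one half of each input, which is the usual building
// block for widening and for transposes. Interleaving with a zero vector,
// for instance, zero-extends u8 lanes to u16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_unpack_widen() {
    let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let zero = _mm_setzero_si128();
    // Low half as bytes: [a0, 0, a1, 0, ...], i.e. u16 lanes 0..8 of `a`.
    let lo = _mm_unpacklo_epi8(a, zero);
    // High half: u16 lanes 8..16.
    let hi = _mm_unpackhi_epi8(a, zero);
    let (mut l, mut h) = ([0u16; 8], [0u16; 8]);
    unsafe {
        _mm_storeu_si128(l.as_mut_ptr() as *mut __m128i, lo);
        _mm_storeu_si128(h.as_mut_ptr() as *mut __m128i, hi);
    }
    assert_eq!(l, [0, 1, 2, 3, 4, 5, 6, 7]);
    assert_eq!(h, [8, 9, 10, 11, 12, 13, 14, 15]);
}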
1777
1778/// Returns a new vector with the low element of `a` replaced by the sum of the
1779/// low elements of `a` and `b`.
1780///
1781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
1782#[inline]
1783#[target_feature(enable = "sse2")]
1784#[cfg_attr(test, assert_instr(addsd))]
1785#[stable(feature = "simd_x86", since = "1.27.0")]
1786#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1787pub const fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1788    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1789}
1790
1791/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1792/// `b`.
1793///
1794/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1795#[inline]
1796#[target_feature(enable = "sse2")]
1797#[cfg_attr(test, assert_instr(addpd))]
1798#[stable(feature = "simd_x86", since = "1.27.0")]
1799#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1800pub const fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1801    unsafe { simd_add(a, b) }
1802}
1803
1804/// Returns a new vector with the low element of `a` replaced by the result of
1805/// dividing the lower element of `a` by the lower element of `b`.
1806///
1807/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1808#[inline]
1809#[target_feature(enable = "sse2")]
1810#[cfg_attr(test, assert_instr(divsd))]
1811#[stable(feature = "simd_x86", since = "1.27.0")]
1812#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1813pub const fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1814    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1815}
1816
1817/// Divides packed double-precision (64-bit) floating-point elements in `a` by
1818/// packed elements in `b`.
1819///
1820/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1821#[inline]
1822#[target_feature(enable = "sse2")]
1823#[cfg_attr(test, assert_instr(divpd))]
1824#[stable(feature = "simd_x86", since = "1.27.0")]
1825#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1826pub const fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1827    unsafe { simd_div(a, b) }
1828}
1829
1830/// Returns a new vector with the low element of `a` replaced by the maximum
1831/// of the lower elements of `a` and `b`.
1832///
1833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1834#[inline]
1835#[target_feature(enable = "sse2")]
1836#[cfg_attr(test, assert_instr(maxsd))]
1837#[stable(feature = "simd_x86", since = "1.27.0")]
1838pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1839    unsafe { maxsd(a, b) }
1840}
1841
1842/// Returns a new vector with the maximum values from corresponding elements in
1843/// `a` and `b`.
1844///
1845/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1846#[inline]
1847#[target_feature(enable = "sse2")]
1848#[cfg_attr(test, assert_instr(maxpd))]
1849#[stable(feature = "simd_x86", since = "1.27.0")]
1850pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1851    unsafe { maxpd(a, b) }
1852}
1853
1854/// Returns a new vector with the low element of `a` replaced by the minimum
1855/// of the lower elements of `a` and `b`.
1856///
1857/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1858#[inline]
1859#[target_feature(enable = "sse2")]
1860#[cfg_attr(test, assert_instr(minsd))]
1861#[stable(feature = "simd_x86", since = "1.27.0")]
1862pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1863    unsafe { minsd(a, b) }
1864}
1865
1866/// Returns a new vector with the minimum values from corresponding elements in
1867/// `a` and `b`.
1868///
1869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1870#[inline]
1871#[target_feature(enable = "sse2")]
1872#[cfg_attr(test, assert_instr(minpd))]
1873#[stable(feature = "simd_x86", since = "1.27.0")]
1874pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1875    unsafe { minpd(a, b) }
1876}
1877
1878/// Returns a new vector with the low element of `a` replaced by the product of
1879/// the low elements of `a` and `b`.
1880///
1881/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1882#[inline]
1883#[target_feature(enable = "sse2")]
1884#[cfg_attr(test, assert_instr(mulsd))]
1885#[stable(feature = "simd_x86", since = "1.27.0")]
1886#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1887pub const fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1888    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1889}
1890
1891/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1892/// and `b`.
1893///
1894/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1895#[inline]
1896#[target_feature(enable = "sse2")]
1897#[cfg_attr(test, assert_instr(mulpd))]
1898#[stable(feature = "simd_x86", since = "1.27.0")]
1899#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1900pub const fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1901    unsafe { simd_mul(a, b) }
1902}
1903
1904/// Returns a new vector with the low element of `a` replaced by the square
1905/// root of the lower element of `b`.
1906///
1907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1908#[inline]
1909#[target_feature(enable = "sse2")]
1910#[cfg_attr(test, assert_instr(sqrtsd))]
1911#[stable(feature = "simd_x86", since = "1.27.0")]
1912pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1913    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1914}
1915
1916/// Returns a new vector with the square root of each of the values in `a`.
1917///
1918/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1919#[inline]
1920#[target_feature(enable = "sse2")]
1921#[cfg_attr(test, assert_instr(sqrtpd))]
1922#[stable(feature = "simd_x86", since = "1.27.0")]
1923pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1924    unsafe { simd_fsqrt(a) }
1925}
1926
1927/// Returns a new vector with the low element of `a` replaced by subtracting the
1928/// low element of `b` from the low element of `a`.
1929///
1930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1931#[inline]
1932#[target_feature(enable = "sse2")]
1933#[cfg_attr(test, assert_instr(subsd))]
1934#[stable(feature = "simd_x86", since = "1.27.0")]
1935#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1936pub const fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1937    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1938}
1939
1940/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
1941/// from `a`.
1942///
1943/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1944#[inline]
1945#[target_feature(enable = "sse2")]
1946#[cfg_attr(test, assert_instr(subpd))]
1947#[stable(feature = "simd_x86", since = "1.27.0")]
1948#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1949pub const fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1950    unsafe { simd_sub(a, b) }
1951}
1952
1953/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1954/// elements in `a` and `b`.
1955///
1956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
1957#[inline]
1958#[target_feature(enable = "sse2")]
1959#[cfg_attr(test, assert_instr(andps))]
1960#[stable(feature = "simd_x86", since = "1.27.0")]
1961#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1962pub const fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1963    unsafe {
1964        let a: __m128i = transmute(a);
1965        let b: __m128i = transmute(b);
1966        transmute(_mm_and_si128(a, b))
1967    }
1968}
1969
1970/// Computes the bitwise NOT of `a` and then AND with `b`.
1971///
1972/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1973#[inline]
1974#[target_feature(enable = "sse2")]
1975#[cfg_attr(test, assert_instr(andnps))]
1976#[stable(feature = "simd_x86", since = "1.27.0")]
1977#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1978pub const fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1979    unsafe {
1980        let a: __m128i = transmute(a);
1981        let b: __m128i = transmute(b);
1982        transmute(_mm_andnot_si128(a, b))
1983    }
1984}
1985
1986/// Computes the bitwise OR of `a` and `b`.
1987///
1988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1989#[inline]
1990#[target_feature(enable = "sse2")]
1991#[cfg_attr(test, assert_instr(orps))]
1992#[stable(feature = "simd_x86", since = "1.27.0")]
1993#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1994pub const fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1995    unsafe {
1996        let a: __m128i = transmute(a);
1997        let b: __m128i = transmute(b);
1998        transmute(_mm_or_si128(a, b))
1999    }
2000}
2001
2002/// Computes the bitwise XOR of `a` and `b`.
2003///
2004/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
2005#[inline]
2006#[target_feature(enable = "sse2")]
2007#[cfg_attr(test, assert_instr(xorps))]
2008#[stable(feature = "simd_x86", since = "1.27.0")]
2009#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2010pub const fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
2011    unsafe {
2012        let a: __m128i = transmute(a);
2013        let b: __m128i = transmute(b);
2014        transmute(_mm_xor_si128(a, b))
2015    }
2016}
2017
2018/// Returns a new vector with the low element of `a` replaced by the equality
2019/// comparison of the lower elements of `a` and `b`.
2020///
2021/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
2022#[inline]
2023#[target_feature(enable = "sse2")]
2024#[cfg_attr(test, assert_instr(cmpeqsd))]
2025#[stable(feature = "simd_x86", since = "1.27.0")]
2026pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
2027    unsafe { cmpsd(a, b, 0) }
2028}
2029
2030/// Returns a new vector with the low element of `a` replaced by the less-than
2031/// comparison of the lower elements of `a` and `b`.
2032///
2033/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
2034#[inline]
2035#[target_feature(enable = "sse2")]
2036#[cfg_attr(test, assert_instr(cmpltsd))]
2037#[stable(feature = "simd_x86", since = "1.27.0")]
2038pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
2039    unsafe { cmpsd(a, b, 1) }
2040}
2041
2042/// Returns a new vector with the low element of `a` replaced by the
2043/// less-than-or-equal comparison of the lower elements of `a` and `b`.
2044///
2045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
2046#[inline]
2047#[target_feature(enable = "sse2")]
2048#[cfg_attr(test, assert_instr(cmplesd))]
2049#[stable(feature = "simd_x86", since = "1.27.0")]
2050pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
2051    unsafe { cmpsd(a, b, 2) }
2052}
2053
2054/// Returns a new vector with the low element of `a` replaced by the
2055/// greater-than comparison of the lower elements of `a` and `b`.
2056///
2057/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
2058#[inline]
2059#[target_feature(enable = "sse2")]
2060#[cfg_attr(test, assert_instr(cmpltsd))]
2061#[stable(feature = "simd_x86", since = "1.27.0")]
2062pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
2063    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2064}
2065
2066/// Returns a new vector with the low element of `a` replaced by the
2067/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
2068///
2069/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
2070#[inline]
2071#[target_feature(enable = "sse2")]
2072#[cfg_attr(test, assert_instr(cmplesd))]
2073#[stable(feature = "simd_x86", since = "1.27.0")]
2074pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
2075    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2076}
2077
2078/// Returns a new vector with the low element of `a` replaced by the result
2079/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
2080/// neither is equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
2081/// otherwise.
2082///
2083/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
2084#[inline]
2085#[target_feature(enable = "sse2")]
2086#[cfg_attr(test, assert_instr(cmpordsd))]
2087#[stable(feature = "simd_x86", since = "1.27.0")]
2088pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
2089    unsafe { cmpsd(a, b, 7) }
2090}
2091
2092/// Returns a new vector with the low element of `a` replaced by the result of
2093/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
2094/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
2095///
2096/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
2097#[inline]
2098#[target_feature(enable = "sse2")]
2099#[cfg_attr(test, assert_instr(cmpunordsd))]
2100#[stable(feature = "simd_x86", since = "1.27.0")]
2101pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
2102    unsafe { cmpsd(a, b, 3) }
2103}
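
// Worked NaN example (illustrative, not part of the original source): the
// scalar compares fill the low lane with an all-ones or all-zero bit
// pattern, so the result is read as a mask rather than as a normal float.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_ordered_vs_unordered() {
    let one = _mm_set_sd(1.0);
    let nan = _mm_set_sd(f64::NAN);
    // Both low lanes are non-NaN: "ordered" holds, low lane is all ones.
    let ord = _mm_cmpord_sd(one, one);
    assert_eq!(_mm_cvtsd_f64(ord).to_bits(), u64::MAX);
    // One low lane is NaN: "unordered" holds instead.
    let unord = _mm_cmpunord_sd(one, nan);
    assert_eq!(_mm_cvtsd_f64(unord).to_bits(), u64::MAX);
    assert_eq!(_mm_cvtsd_f64(_mm_cmpord_sd(one, nan)).to_bits(), 0);
}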
2104
2105/// Returns a new vector with the low element of `a` replaced by the not-equal
2106/// comparison of the lower elements of `a` and `b`.
2107///
2108/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
2109#[inline]
2110#[target_feature(enable = "sse2")]
2111#[cfg_attr(test, assert_instr(cmpneqsd))]
2112#[stable(feature = "simd_x86", since = "1.27.0")]
2113pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2114    unsafe { cmpsd(a, b, 4) }
2115}
2116
2117/// Returns a new vector with the low element of `a` replaced by the
2118/// not-less-than comparison of the lower elements of `a` and `b`.
2119///
2120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2121#[inline]
2122#[target_feature(enable = "sse2")]
2123#[cfg_attr(test, assert_instr(cmpnltsd))]
2124#[stable(feature = "simd_x86", since = "1.27.0")]
2125pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2126    unsafe { cmpsd(a, b, 5) }
2127}
2128
2129/// Returns a new vector with the low element of `a` replaced by the
2130/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2133#[inline]
2134#[target_feature(enable = "sse2")]
2135#[cfg_attr(test, assert_instr(cmpnlesd))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2138    unsafe { cmpsd(a, b, 6) }
2139}
2140
2141/// Returns a new vector with the low element of `a` replaced by the
2142/// not-greater-than comparison of the lower elements of `a` and `b`.
2143///
2144/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2145#[inline]
2146#[target_feature(enable = "sse2")]
2147#[cfg_attr(test, assert_instr(cmpnltsd))]
2148#[stable(feature = "simd_x86", since = "1.27.0")]
2149pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2150    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2151}
2152
2153/// Returns a new vector with the low element of `a` replaced by the
2154/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2155///
2156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2157#[inline]
2158#[target_feature(enable = "sse2")]
2159#[cfg_attr(test, assert_instr(cmpnlesd))]
2160#[stable(feature = "simd_x86", since = "1.27.0")]
2161pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2162    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2163}
2164
2165/// Compares corresponding elements in `a` and `b` for equality.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
2168#[inline]
2169#[target_feature(enable = "sse2")]
2170#[cfg_attr(test, assert_instr(cmpeqpd))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2173    unsafe { cmppd(a, b, 0) }
2174}
2175
2176/// Compares corresponding elements in `a` and `b` for less-than.
2177///
2178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
2179#[inline]
2180#[target_feature(enable = "sse2")]
2181#[cfg_attr(test, assert_instr(cmpltpd))]
2182#[stable(feature = "simd_x86", since = "1.27.0")]
2183pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2184    unsafe { cmppd(a, b, 1) }
2185}
2186
2187/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
2188///
2189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2190#[inline]
2191#[target_feature(enable = "sse2")]
2192#[cfg_attr(test, assert_instr(cmplepd))]
2193#[stable(feature = "simd_x86", since = "1.27.0")]
2194pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2195    unsafe { cmppd(a, b, 2) }
2196}
2197
2198/// Compares corresponding elements in `a` and `b` for greater-than.
2199///
2200/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2201#[inline]
2202#[target_feature(enable = "sse2")]
2203#[cfg_attr(test, assert_instr(cmpltpd))]
2204#[stable(feature = "simd_x86", since = "1.27.0")]
2205pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2206    _mm_cmplt_pd(b, a)
2207}
2208
2209/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2210///
2211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2212#[inline]
2213#[target_feature(enable = "sse2")]
2214#[cfg_attr(test, assert_instr(cmplepd))]
2215#[stable(feature = "simd_x86", since = "1.27.0")]
2216pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2217    _mm_cmple_pd(b, a)
2218}
2219
2220/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2221///
2222/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2223#[inline]
2224#[target_feature(enable = "sse2")]
2225#[cfg_attr(test, assert_instr(cmpordpd))]
2226#[stable(feature = "simd_x86", since = "1.27.0")]
2227pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2228    unsafe { cmppd(a, b, 7) }
2229}
2230
2231/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2232///
2233/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2234#[inline]
2235#[target_feature(enable = "sse2")]
2236#[cfg_attr(test, assert_instr(cmpunordpd))]
2237#[stable(feature = "simd_x86", since = "1.27.0")]
2238pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2239    unsafe { cmppd(a, b, 3) }
2240}
2241
2242/// Compares corresponding elements in `a` and `b` for not-equal.
2243///
2244/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2245#[inline]
2246#[target_feature(enable = "sse2")]
2247#[cfg_attr(test, assert_instr(cmpneqpd))]
2248#[stable(feature = "simd_x86", since = "1.27.0")]
2249pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2250    unsafe { cmppd(a, b, 4) }
2251}
2252
2253/// Compares corresponding elements in `a` and `b` for not-less-than.
2254///
2255/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2256#[inline]
2257#[target_feature(enable = "sse2")]
2258#[cfg_attr(test, assert_instr(cmpnltpd))]
2259#[stable(feature = "simd_x86", since = "1.27.0")]
2260pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2261    unsafe { cmppd(a, b, 5) }
2262}
2263
2264/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2265///
2266/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2267#[inline]
2268#[target_feature(enable = "sse2")]
2269#[cfg_attr(test, assert_instr(cmpnlepd))]
2270#[stable(feature = "simd_x86", since = "1.27.0")]
2271pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2272    unsafe { cmppd(a, b, 6) }
2273}
2274
2275/// Compares corresponding elements in `a` and `b` for not-greater-than.
2276///
2277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2278#[inline]
2279#[target_feature(enable = "sse2")]
2280#[cfg_attr(test, assert_instr(cmpnltpd))]
2281#[stable(feature = "simd_x86", since = "1.27.0")]
2282pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2283    _mm_cmpnlt_pd(b, a)
2284}
2285
2286/// Compares corresponding elements in `a` and `b` for
2287/// not-greater-than-or-equal.
2288///
2289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2290#[inline]
2291#[target_feature(enable = "sse2")]
2292#[cfg_attr(test, assert_instr(cmpnlepd))]
2293#[stable(feature = "simd_x86", since = "1.27.0")]
2294pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2295    _mm_cmpnle_pd(b, a)
2296}
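
// Illustrative sketch (hypothetical helper, not part of the original
// source): packed compares yield a per-lane mask, which combines naturally
// with `_mm_movemask_pd` (defined further down) to branch on the outcome.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_any_lane_less_than(a: __m128d, b: __m128d) -> bool {
    // Each lane of `m` is all ones where a[i] < b[i], all zeros elsewhere.
    let m = _mm_cmplt_pd(a, b);
    // `movemask` compresses the two lane sign bits into the low two bits.
    _mm_movemask_pd(m) != 0
}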
2297
2298/// Compares the lower element of `a` and `b` for equality.
2299///
2300/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2301#[inline]
2302#[target_feature(enable = "sse2")]
2303#[cfg_attr(test, assert_instr(comisd))]
2304#[stable(feature = "simd_x86", since = "1.27.0")]
2305pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2306    unsafe { comieqsd(a, b) }
2307}
2308
2309/// Compares the lower element of `a` and `b` for less-than.
2310///
2311/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2312#[inline]
2313#[target_feature(enable = "sse2")]
2314#[cfg_attr(test, assert_instr(comisd))]
2315#[stable(feature = "simd_x86", since = "1.27.0")]
2316pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2317    unsafe { comiltsd(a, b) }
2318}
2319
2320/// Compares the lower element of `a` and `b` for less-than-or-equal.
2321///
2322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2323#[inline]
2324#[target_feature(enable = "sse2")]
2325#[cfg_attr(test, assert_instr(comisd))]
2326#[stable(feature = "simd_x86", since = "1.27.0")]
2327pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2328    unsafe { comilesd(a, b) }
2329}
2330
2331/// Compares the lower element of `a` and `b` for greater-than.
2332///
2333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2334#[inline]
2335#[target_feature(enable = "sse2")]
2336#[cfg_attr(test, assert_instr(comisd))]
2337#[stable(feature = "simd_x86", since = "1.27.0")]
2338pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2339    unsafe { comigtsd(a, b) }
2340}
2341
2342/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2343///
2344/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2345#[inline]
2346#[target_feature(enable = "sse2")]
2347#[cfg_attr(test, assert_instr(comisd))]
2348#[stable(feature = "simd_x86", since = "1.27.0")]
2349pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2350    unsafe { comigesd(a, b) }
2351}
2352
2353/// Compares the lower element of `a` and `b` for not-equal.
2354///
2355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2356#[inline]
2357#[target_feature(enable = "sse2")]
2358#[cfg_attr(test, assert_instr(comisd))]
2359#[stable(feature = "simd_x86", since = "1.27.0")]
2360pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2361    unsafe { comineqsd(a, b) }
2362}
2363
2364/// Compares the lower element of `a` and `b` for equality.
2365///
2366/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2367#[inline]
2368#[target_feature(enable = "sse2")]
2369#[cfg_attr(test, assert_instr(ucomisd))]
2370#[stable(feature = "simd_x86", since = "1.27.0")]
2371pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2372    unsafe { ucomieqsd(a, b) }
2373}
2374
2375/// Compares the lower element of `a` and `b` for less-than.
2376///
2377/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2378#[inline]
2379#[target_feature(enable = "sse2")]
2380#[cfg_attr(test, assert_instr(ucomisd))]
2381#[stable(feature = "simd_x86", since = "1.27.0")]
2382pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2383    unsafe { ucomiltsd(a, b) }
2384}
2385
2386/// Compares the lower element of `a` and `b` for less-than-or-equal.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2389#[inline]
2390#[target_feature(enable = "sse2")]
2391#[cfg_attr(test, assert_instr(ucomisd))]
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2394    unsafe { ucomilesd(a, b) }
2395}
2396
2397/// Compares the lower element of `a` and `b` for greater-than.
2398///
2399/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2400#[inline]
2401#[target_feature(enable = "sse2")]
2402#[cfg_attr(test, assert_instr(ucomisd))]
2403#[stable(feature = "simd_x86", since = "1.27.0")]
2404pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2405    unsafe { ucomigtsd(a, b) }
2406}
2407
2408/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2409///
2410/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2411#[inline]
2412#[target_feature(enable = "sse2")]
2413#[cfg_attr(test, assert_instr(ucomisd))]
2414#[stable(feature = "simd_x86", since = "1.27.0")]
2415pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2416    unsafe { ucomigesd(a, b) }
2417}
2418
2419/// Compares the lower element of `a` and `b` for not-equal.
2420///
2421/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2422#[inline]
2423#[target_feature(enable = "sse2")]
2424#[cfg_attr(test, assert_instr(ucomisd))]
2425#[stable(feature = "simd_x86", since = "1.27.0")]
2426pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2427    unsafe { ucomineqsd(a, b) }
2428}
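
// Small sketch (illustrative, not part of the original source): unlike the
// `_mm_cmp*_sd` family these return a plain 0/1 `i32`. The usual
// distinction is that `comisd` signals an invalid-operation exception on
// any NaN input while `ucomisd` signals only on a signaling NaN.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_scalar_compare_to_bool() {
    let one = _mm_set_sd(1.0);
    let two = _mm_set_sd(2.0);
    assert_eq!(_mm_comilt_sd(one, two), 1);
    assert_eq!(_mm_ucomige_sd(one, two), 0);
}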
2429
2430/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2431/// packed single-precision (32-bit) floating-point elements.
2432///
2433/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2434#[inline]
2435#[target_feature(enable = "sse2")]
2436#[cfg_attr(test, assert_instr(cvtpd2ps))]
2437#[stable(feature = "simd_x86", since = "1.27.0")]
2438#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2439pub const fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2440    unsafe {
2441        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2442        let zero = f32x2::ZERO;
2443        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2444    }
2445}
2446
2447/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2448/// packed double-precision (64-bit) floating-point elements.
2450///
2451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2452#[inline]
2453#[target_feature(enable = "sse2")]
2454#[cfg_attr(test, assert_instr(cvtps2pd))]
2455#[stable(feature = "simd_x86", since = "1.27.0")]
2456#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2457pub const fn _mm_cvtps_pd(a: __m128) -> __m128d {
2458    unsafe {
2459        let a = a.as_f32x4();
2460        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2461    }
2462}
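
// Illustrative round-trip (not part of the original source): `cvtpd_ps`
// narrows the two doubles into the low two float lanes and zeroes the
// upper two, and `cvtps_pd` widens the low two float lanes back.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_pd_ps_roundtrip() {
    let d = _mm_set_pd(2.5, 1.5); // lanes in memory order: [1.5, 2.5]
    let s = _mm_cvtpd_ps(d); // [1.5, 2.5, 0.0, 0.0]
    let back = _mm_cvtps_pd(s); // [1.5, 2.5]
    assert_eq!(_mm_cvtsd_f64(back), 1.5);
}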
2463
2464/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2465/// packed 32-bit integers.
2466///
2467/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2468#[inline]
2469#[target_feature(enable = "sse2")]
2470#[cfg_attr(test, assert_instr(cvtpd2dq))]
2471#[stable(feature = "simd_x86", since = "1.27.0")]
2472pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2473    unsafe { transmute(cvtpd2dq(a)) }
2474}
2475
2476/// Converts the lower double-precision (64-bit) floating-point element in `a`
2477/// to a 32-bit integer.
2478///
2479/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
2480#[inline]
2481#[target_feature(enable = "sse2")]
2482#[cfg_attr(test, assert_instr(cvtsd2si))]
2483#[stable(feature = "simd_x86", since = "1.27.0")]
2484pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2485    unsafe { cvtsd2si(a) }
2486}
2487
2488/// Converts the lower double-precision (64-bit) floating-point element in `b`
2489/// to a single-precision (32-bit) floating-point element, stores the result in
2490/// the lower element of the return value, and copies the upper elements from `a`
2491/// to the upper elements of the return value.
2492///
2493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2494#[inline]
2495#[target_feature(enable = "sse2")]
2496#[cfg_attr(test, assert_instr(cvtsd2ss))]
2497#[stable(feature = "simd_x86", since = "1.27.0")]
2498pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2499    unsafe { cvtsd2ss(a, b) }
2500}
2501
2502/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2503///
2504/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2505#[inline]
2506#[target_feature(enable = "sse2")]
2507#[stable(feature = "simd_x86", since = "1.27.0")]
2508#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2509pub const fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2510    unsafe { simd_extract!(a, 0) }
2511}
2512
2513/// Converts the lower single-precision (32-bit) floating-point element in `b`
2514/// to a double-precision (64-bit) floating-point element, stores the result in
2515/// the lower element of the return value, and copies the upper element from `a`
2516/// to the upper element of the return value.
2517///
2518/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2519#[inline]
2520#[target_feature(enable = "sse2")]
2521#[cfg_attr(test, assert_instr(cvtss2sd))]
2522#[stable(feature = "simd_x86", since = "1.27.0")]
2523#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2524pub const fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2525    unsafe {
2526        let elt: f32 = simd_extract!(b, 0);
2527        simd_insert!(a, 0, elt as f64)
2528    }
2529}
2530
2531/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2532/// packed 32-bit integers with truncation.
2533///
2534/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2535#[inline]
2536#[target_feature(enable = "sse2")]
2537#[cfg_attr(test, assert_instr(cvttpd2dq))]
2538#[stable(feature = "simd_x86", since = "1.27.0")]
2539pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2540    unsafe { transmute(cvttpd2dq(a)) }
2541}
2542
2543/// Converts the lower double-precision (64-bit) floating-point element in `a`
2544/// to a 32-bit integer with truncation.
2545///
2546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
2547#[inline]
2548#[target_feature(enable = "sse2")]
2549#[cfg_attr(test, assert_instr(cvttsd2si))]
2550#[stable(feature = "simd_x86", since = "1.27.0")]
2551pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2552    unsafe { cvttsd2si(a) }
2553}
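
// Worked example (illustrative, not part of the original source):
// `cvtsd_si32` honours the current MXCSR rounding mode, which defaults to
// round-to-nearest-even, while the `t` variants always truncate toward
// zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_round_vs_truncate() {
    // Assuming the default rounding mode: 2.5 rounds to the even 2.
    assert_eq!(_mm_cvtsd_si32(_mm_set_sd(2.5)), 2);
    assert_eq!(_mm_cvttsd_si32(_mm_set_sd(2.5)), 2);
    // 2.9 rounds to 3 but truncates to 2.
    assert_eq!(_mm_cvtsd_si32(_mm_set_sd(2.9)), 3);
    assert_eq!(_mm_cvttsd_si32(_mm_set_sd(2.9)), 2);
}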
2554
2555/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2556/// packed 32-bit integers with truncation.
2557///
2558/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2559#[inline]
2560#[target_feature(enable = "sse2")]
2561#[cfg_attr(test, assert_instr(cvttps2dq))]
2562#[stable(feature = "simd_x86", since = "1.27.0")]
2563pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2564    unsafe { transmute(cvttps2dq(a)) }
2565}
2566
2567/// Copies double-precision (64-bit) floating-point element `a` to the lower
2568/// element of the return value, and zeroes the upper element.
2569///
2570/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2571#[inline]
2572#[target_feature(enable = "sse2")]
2573#[stable(feature = "simd_x86", since = "1.27.0")]
2574#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2575pub const fn _mm_set_sd(a: f64) -> __m128d {
2576    _mm_set_pd(0.0, a)
2577}
2578
2579/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2580/// of the return value.
2581///
2582/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2583#[inline]
2584#[target_feature(enable = "sse2")]
2585#[stable(feature = "simd_x86", since = "1.27.0")]
2586#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2587pub const fn _mm_set1_pd(a: f64) -> __m128d {
2588    _mm_set_pd(a, a)
2589}
2590
2591/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2592/// of the return value.
2593///
2594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2595#[inline]
2596#[target_feature(enable = "sse2")]
2597#[stable(feature = "simd_x86", since = "1.27.0")]
2598#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2599pub const fn _mm_set_pd1(a: f64) -> __m128d {
2600    _mm_set_pd(a, a)
2601}
2602
2603/// Sets packed double-precision (64-bit) floating-point elements in the return
2604/// value with the supplied values.
2605///
2606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
2607#[inline]
2608#[target_feature(enable = "sse2")]
2609#[stable(feature = "simd_x86", since = "1.27.0")]
2610#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2611pub const fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2612    __m128d([b, a])
2613}
2614
2615/// Sets packed double-precision (64-bit) floating-point elements in the return
2616/// value with the supplied values in reverse order.
2617///
2618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2619#[inline]
2620#[target_feature(enable = "sse2")]
2621#[stable(feature = "simd_x86", since = "1.27.0")]
2622#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2623pub const fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2624    _mm_set_pd(b, a)
2625}
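
// Minimal sketch (illustrative, not part of the original source):
// `_mm_set_pd` takes the high lane first while `_mm_setr_pd` takes lanes
// in memory order, mirroring the integer `set`/`setr` pairs above.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
fn example_pd_argument_order() {
    // Both calls build the vector whose low lane is 1.0.
    let a = _mm_set_pd(2.0, 1.0);
    let b = _mm_setr_pd(1.0, 2.0);
    assert_eq!(_mm_cvtsd_f64(a), 1.0);
    assert_eq!(_mm_cvtsd_f64(b), 1.0);
}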
2626
2627/// Returns packed double-precision (64-bit) floating-point elements with all
2628/// zeros.
2629///
2630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2631#[inline]
2632#[target_feature(enable = "sse2")]
2633#[cfg_attr(test, assert_instr(xorp))]
2634#[stable(feature = "simd_x86", since = "1.27.0")]
2635#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2636pub const fn _mm_setzero_pd() -> __m128d {
2637    const { unsafe { mem::zeroed() } }
2638}
2639
2640/// Returns a mask of the most significant bit of each element in `a`.
2641///
2642/// The mask is stored in the 2 least significant bits of the return value.
2643/// All other bits are set to `0`.
2644///
2645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
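///
/// # Examples
///
/// A minimal sketch (illustrative, not from the original docs), assuming SSE2:
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// // Only lane 0 has its sign bit set, so only bit 0 of the mask is set.
/// let m = _mm_movemask_pd(_mm_setr_pd(-1.0, 2.0));
/// assert_eq!(m, 0b01);
/// # }
/// ```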
2646#[inline]
2647#[target_feature(enable = "sse2")]
2648#[cfg_attr(test, assert_instr(movmskpd))]
2649#[stable(feature = "simd_x86", since = "1.27.0")]
2650#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2651pub const fn _mm_movemask_pd(a: __m128d) -> i32 {
2652    // Propagate the highest bit to the rest, because simd_bitmask
2653    // requires all-1 or all-0.
2654    unsafe {
2655        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2656        simd_bitmask::<i64x2, u8>(mask) as i32
2657    }
2658}
2659
2660/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2661/// floating-point elements) from memory into the returned vector.
2662/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2663/// exception may be generated.
2664///
2665/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
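///
/// # Examples
///
/// A sketch of satisfying the alignment requirement (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// // `#[repr(align(16))]` guarantees the 16-byte alignment the load needs.
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
/// let data = Aligned([1.0, 2.0]);
/// let v = _mm_load_pd(data.0.as_ptr());
/// assert_eq!(_mm_cvtsd_f64(v), 1.0);
/// # }
/// ```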
2666#[inline]
2667#[target_feature(enable = "sse2")]
2668#[cfg_attr(
2669    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2670    assert_instr(movaps)
2671)]
2672#[stable(feature = "simd_x86", since = "1.27.0")]
2673#[allow(clippy::cast_ptr_alignment)]
2674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2675pub const unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2676    *(mem_addr as *const __m128d)
2677}
2678
2679/// Loads a 64-bit double-precision value to the low element of a
2680/// 128-bit vector of `[2 x double]` and clears the upper element.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2683#[inline]
2684#[target_feature(enable = "sse2")]
2685#[cfg_attr(test, assert_instr(movsd))]
2686#[stable(feature = "simd_x86", since = "1.27.0")]
2687#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2688pub const unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2689    _mm_setr_pd(*mem_addr, 0.)
2690}
2691
2692/// Loads a double-precision value into the high-order bits of a 128-bit
2693/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2694/// bits of the first operand.
2695///
2696/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2697#[inline]
2698#[target_feature(enable = "sse2")]
2699#[cfg_attr(test, assert_instr(movhps))]
2700#[stable(feature = "simd_x86", since = "1.27.0")]
2701#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2702pub const unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2703    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2704}
2705
2706/// Loads a double-precision value into the low-order bits of a 128-bit
2707/// vector of `[2 x double]`. The high-order bits are copied from the
2708/// high-order bits of the first operand.
2709///
2710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2711#[inline]
2712#[target_feature(enable = "sse2")]
2713#[cfg_attr(test, assert_instr(movlps))]
2714#[stable(feature = "simd_x86", since = "1.27.0")]
2715#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2716pub const unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2717    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2718}
2719
2720/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2721/// aligned memory location.
2722/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2723/// used again soon).
2724///
2725/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2726///
2727/// # Safety of non-temporal stores
2728///
2729/// After using this intrinsic, but before any other access to the memory that this intrinsic
2730/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2731/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2732/// return.
2733///
2734/// See [`_mm_sfence`] for details.
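///
/// # Examples
///
/// A sketch of the required store/fence pairing (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
/// let mut buf = Aligned([0.0; 2]);
/// _mm_stream_pd(buf.0.as_mut_ptr(), _mm_setr_pd(1.0, 2.0));
/// // The fence must happen before `buf` is accessed again.
/// _mm_sfence();
/// assert_eq!(buf.0, [1.0, 2.0]);
/// # }
/// ```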
2735#[inline]
2736#[target_feature(enable = "sse2")]
2737#[cfg_attr(test, assert_instr(movntpd))]
2738#[stable(feature = "simd_x86", since = "1.27.0")]
2739#[allow(clippy::cast_ptr_alignment)]
2740pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2741    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2742    crate::arch::asm!(
2743        vps!("movntpd", ",{a}"),
2744        p = in(reg) mem_addr,
2745        a = in(xmm_reg) a,
2746        options(nostack, preserves_flags),
2747    );
2748}
2749
2750/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2751/// memory location.
2752///
2753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2754#[inline]
2755#[target_feature(enable = "sse2")]
2756#[cfg_attr(test, assert_instr(movlps))]
2757#[stable(feature = "simd_x86", since = "1.27.0")]
2758#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2759pub const unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2760    *mem_addr = simd_extract!(a, 0)
2761}
2762
2763/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2764/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2765/// on a 16-byte boundary or a general-protection exception may be generated.
2766///
2767/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2768#[inline]
2769#[target_feature(enable = "sse2")]
2770#[cfg_attr(
2771    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2772    assert_instr(movaps)
2773)]
2774#[stable(feature = "simd_x86", since = "1.27.0")]
2775#[allow(clippy::cast_ptr_alignment)]
2776#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2777pub const unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2778    *(mem_addr as *mut __m128d) = a;
2779}
2780
2781/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2782/// floating-point elements) from `a` into memory.
2783/// `mem_addr` does not need to be aligned on any particular boundary.
2784///
2785/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2786#[inline]
2787#[target_feature(enable = "sse2")]
2788#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2789#[stable(feature = "simd_x86", since = "1.27.0")]
2790#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2791pub const unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2792    mem_addr.cast::<__m128d>().write_unaligned(a);
2793}
2794
2795/// Stores the 16-bit integer from the first element of `a` into memory.
2796///
2797/// `mem_addr` does not need to be aligned on any particular boundary.
2798///
2799/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2800#[inline]
2801#[target_feature(enable = "sse2")]
2802#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2803#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2804pub const unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2805    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2806}
2807
2808/// Stores the 32-bit integer from the first element of `a` into memory.
2809///
2810/// `mem_addr` does not need to be aligned on any particular boundary.
2811///
2812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
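///
/// # Examples
///
/// A minimal sketch (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// let v = _mm_set_epi32(0, 0, 0, 0x0403_0201);
/// let mut out = [0u8; 4];
/// // Writes the little-endian bytes of the lowest 32-bit lane.
/// _mm_storeu_si32(out.as_mut_ptr(), v);
/// assert_eq!(out, [0x01, 0x02, 0x03, 0x04]);
/// # }
/// ```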
2813#[inline]
2814#[target_feature(enable = "sse2")]
2815#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2816#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2817pub const unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2818    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2819}
2820
2821/// Stores the 64-bit integer from the first element of `a` into memory.
2822///
2823/// `mem_addr` does not need to be aligned on any particular boundary.
2824///
2825/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2826#[inline]
2827#[target_feature(enable = "sse2")]
2828#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2829#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2830pub const unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2831    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2832}
2833
2834/// Stores the lower double-precision (64-bit) floating-point element from `a`
2835/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2836/// 16-byte boundary or a general-protection exception may be generated.
2837///
2838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2839#[inline]
2840#[target_feature(enable = "sse2")]
2841#[stable(feature = "simd_x86", since = "1.27.0")]
2842#[allow(clippy::cast_ptr_alignment)]
2843#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2844pub const unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2845    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2846    *(mem_addr as *mut __m128d) = b;
2847}
2848
2849/// Stores the lower double-precision (64-bit) floating-point element from `a`
2850/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2851/// 16-byte boundary or a general-protection exception may be generated.
2852///
2853/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2854#[inline]
2855#[target_feature(enable = "sse2")]
2856#[stable(feature = "simd_x86", since = "1.27.0")]
2857#[allow(clippy::cast_ptr_alignment)]
2858#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2859pub const unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2860    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2861    *(mem_addr as *mut __m128d) = b;
2862}
2863
2864/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2865/// memory in reverse order.
2866/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2867/// exception may be generated.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2870#[inline]
2871#[target_feature(enable = "sse2")]
2872#[stable(feature = "simd_x86", since = "1.27.0")]
2873#[allow(clippy::cast_ptr_alignment)]
2874#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2875pub const unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2876    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2877    *(mem_addr as *mut __m128d) = b;
2878}
2879
2880/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2881/// memory location.
2882///
2883/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2884#[inline]
2885#[target_feature(enable = "sse2")]
2886#[cfg_attr(test, assert_instr(movhps))]
2887#[stable(feature = "simd_x86", since = "1.27.0")]
2888#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2889pub const unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2890    *mem_addr = simd_extract!(a, 1);
2891}
2892
2893/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2894/// memory location.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[cfg_attr(test, assert_instr(movlps))]
2900#[stable(feature = "simd_x86", since = "1.27.0")]
2901#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2902pub const unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2903    *mem_addr = simd_extract!(a, 0);
2904}
2905
2906/// Loads a double-precision (64-bit) floating-point element from memory
2907/// into both elements of the returned vector.
2908///
2909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2910#[inline]
2911#[target_feature(enable = "sse2")]
2912// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2913#[stable(feature = "simd_x86", since = "1.27.0")]
2914#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2915pub const unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2916    let d = *mem_addr;
2917    _mm_setr_pd(d, d)
2918}
2919
2920/// Loads a double-precision (64-bit) floating-point element from memory
2921/// into both elements of the returned vector.
2922///
2923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2924#[inline]
2925#[target_feature(enable = "sse2")]
2926// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2927#[stable(feature = "simd_x86", since = "1.27.0")]
2928#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2929pub const unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2930    _mm_load1_pd(mem_addr)
2931}
2932
2933/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2934/// the returned vector in reverse order. `mem_addr` must be aligned on a
2935/// 16-byte boundary or a general-protection exception may be generated.
2936///
2937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
2938#[inline]
2939#[target_feature(enable = "sse2")]
2940#[cfg_attr(
2941    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2942    assert_instr(movaps)
2943)]
2944#[stable(feature = "simd_x86", since = "1.27.0")]
2945#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2946pub const unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2947    let a = _mm_load_pd(mem_addr);
2948    simd_shuffle!(a, a, [1, 0])
2949}
2950
2951/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2952/// floating-point elements) from memory into the returned vector.
2953/// `mem_addr` does not need to be aligned on any particular boundary.
2954///
2955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2956#[inline]
2957#[target_feature(enable = "sse2")]
2958#[cfg_attr(test, assert_instr(movups))]
2959#[stable(feature = "simd_x86", since = "1.27.0")]
2960#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2961pub const unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2962    let mut dst = _mm_undefined_pd();
2963    ptr::copy_nonoverlapping(
2964        mem_addr as *const u8,
2965        ptr::addr_of_mut!(dst) as *mut u8,
2966        mem::size_of::<__m128d>(),
2967    );
2968    dst
2969}
2970
2971/// Loads unaligned 16 bits of integer data from memory into a new vector.
2972///
2973/// `mem_addr` does not need to be aligned on any particular boundary.
2974///
2975/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2976#[inline]
2977#[target_feature(enable = "sse2")]
2978#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2979#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2980pub const unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2981    transmute(i16x8::new(
2982        ptr::read_unaligned(mem_addr as *const i16),
2983        0,
2984        0,
2985        0,
2986        0,
2987        0,
2988        0,
2989        0,
2990    ))
2991}
2992
2993/// Loads unaligned 32 bits of integer data from memory into a new vector.
2994///
2995/// `mem_addr` does not need to be aligned on any particular boundary.
2996///
2997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2998#[inline]
2999#[target_feature(enable = "sse2")]
3000#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3001#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3002pub const unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
3003    transmute(i32x4::new(
3004        ptr::read_unaligned(mem_addr as *const i32),
3005        0,
3006        0,
3007        0,
3008    ))
3009}
3010
3011/// Loads unaligned 64 bits of integer data from memory into a new vector.
3012///
3013/// `mem_addr` does not need to be aligned on any particular boundary.
3014///
3015/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
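///
/// # Examples
///
/// A minimal sketch (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// let bytes = [1u8, 2, 3, 4, 5, 6, 7, 8];
/// // The eight bytes fill the low 64-bit lane; the high lane is zeroed.
/// let v = _mm_loadu_si64(bytes.as_ptr());
/// assert_eq!(_mm_cvtsi128_si64(v), 0x0807_0605_0403_0201);
/// # }
/// ```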
3016#[inline]
3017#[target_feature(enable = "sse2")]
3018#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
3019#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3020pub const unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
3021    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
3022}
3023
3024/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
3025/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
3026/// parameter as a specifier.
3027///
3028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
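///
/// # Examples
///
/// A sketch of how `MASK` selects lanes (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// // Bit 0 picks the lane taken from `a` (low result lane);
/// // bit 1 picks the lane taken from `b` (high result lane).
/// let r = _mm_shuffle_pd::<0b01>(a, b);
/// let mut out = [0.0f64; 2];
/// _mm_storeu_pd(out.as_mut_ptr(), r);
/// assert_eq!(out, [2.0, 3.0]);
/// # }
/// ```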
3029#[inline]
3030#[target_feature(enable = "sse2")]
3031#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
3032#[rustc_legacy_const_generics(2)]
3033#[stable(feature = "simd_x86", since = "1.27.0")]
3034#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3035pub const fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
3036    static_assert_uimm_bits!(MASK, 8);
3037    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
3038}
3039
3040/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
3041/// 64 bits are set to the lower 64 bits of the second parameter. The upper
3042/// 64 bits are set to the upper 64 bits of the first parameter.
3043///
3044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
3045#[inline]
3046#[target_feature(enable = "sse2")]
3047#[cfg_attr(test, assert_instr(movsd))]
3048#[stable(feature = "simd_x86", since = "1.27.0")]
3049#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3050pub const fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
3051    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
3052}
3053
3054/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3055/// floating-point vector of `[4 x float]`.
3056///
3057/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
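///
/// # Examples
///
/// The `cast` intrinsics only reinterpret bits, so a round trip returns the
/// original value (illustrative sketch, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// let a = _mm_setr_pd(1.0, 2.0);
/// let r = _mm_castps_pd(_mm_castpd_ps(a));
/// let mut out = [0.0f64; 2];
/// _mm_storeu_pd(out.as_mut_ptr(), r);
/// assert_eq!(out, [1.0, 2.0]);
/// # }
/// ```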
3058#[inline]
3059#[target_feature(enable = "sse2")]
3060#[stable(feature = "simd_x86", since = "1.27.0")]
3061#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3062pub const fn _mm_castpd_ps(a: __m128d) -> __m128 {
3063    unsafe { transmute(a) }
3064}
3065
3066/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3067/// integer vector.
3068///
3069/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
3070#[inline]
3071#[target_feature(enable = "sse2")]
3072#[stable(feature = "simd_x86", since = "1.27.0")]
3073#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3074pub const fn _mm_castpd_si128(a: __m128d) -> __m128i {
3075    unsafe { transmute(a) }
3076}
3077
3078/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3079/// floating-point vector of `[2 x double]`.
3080///
3081/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
3082#[inline]
3083#[target_feature(enable = "sse2")]
3084#[stable(feature = "simd_x86", since = "1.27.0")]
3085#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3086pub const fn _mm_castps_pd(a: __m128) -> __m128d {
3087    unsafe { transmute(a) }
3088}
3089
3090/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3091/// integer vector.
3092///
3093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
3094#[inline]
3095#[target_feature(enable = "sse2")]
3096#[stable(feature = "simd_x86", since = "1.27.0")]
3097#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3098pub const fn _mm_castps_si128(a: __m128) -> __m128i {
3099    unsafe { transmute(a) }
3100}
3101
3102/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3103/// of `[2 x double]`.
3104///
3105/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
3106#[inline]
3107#[target_feature(enable = "sse2")]
3108#[stable(feature = "simd_x86", since = "1.27.0")]
3109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3110pub const fn _mm_castsi128_pd(a: __m128i) -> __m128d {
3111    unsafe { transmute(a) }
3112}
3113
3114/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3115/// of `[4 x float]`.
3116///
3117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
3118#[inline]
3119#[target_feature(enable = "sse2")]
3120#[stable(feature = "simd_x86", since = "1.27.0")]
3121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3122pub const fn _mm_castsi128_ps(a: __m128i) -> __m128 {
3123    unsafe { transmute(a) }
3124}
3125
3126/// Returns vector of type `__m128d` with indeterminate elements.
3127/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3128/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3129/// In practice, this is typically equivalent to [`mem::zeroed`].
3130///
3131/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
3132#[inline]
3133#[target_feature(enable = "sse2")]
3134#[stable(feature = "simd_x86", since = "1.27.0")]
3135#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3136pub const fn _mm_undefined_pd() -> __m128d {
3137    const { unsafe { mem::zeroed() } }
3138}
3139
3140/// Returns vector of type `__m128i` with indeterminate elements.
3141/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3142/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3143/// In practice, this is typically equivalent to [`mem::zeroed`].
3144///
3145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
3146#[inline]
3147#[target_feature(enable = "sse2")]
3148#[stable(feature = "simd_x86", since = "1.27.0")]
3149#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3150pub const fn _mm_undefined_si128() -> __m128i {
3151    const { unsafe { mem::zeroed() } }
3152}
3153
3154/// The resulting `__m128d` element is composed of the high-order values of
3155/// the two `__m128d` interleaved input elements, i.e.:
3156///
3157/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3158/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3159///
3160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
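///
/// # Examples
///
/// A minimal sketch (illustrative, assuming SSE2):
///
/// ```
/// # #[cfg(target_arch = "x86_64")]
/// # unsafe {
/// use core::arch::x86_64::*;
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// let mut out = [0.0f64; 2];
/// // The high lanes of `a` and `b`, interleaved.
/// _mm_storeu_pd(out.as_mut_ptr(), _mm_unpackhi_pd(a, b));
/// assert_eq!(out, [2.0, 4.0]);
/// # }
/// ```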
3161#[inline]
3162#[target_feature(enable = "sse2")]
3163#[cfg_attr(test, assert_instr(unpckhpd))]
3164#[stable(feature = "simd_x86", since = "1.27.0")]
3165#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3166pub const fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3167    unsafe { simd_shuffle!(a, b, [1, 3]) }
3168}
3169
3170/// The resulting `__m128d` element is composed of the low-order values of
3171/// the two `__m128d` interleaved input elements, i.e.:
3172///
3173/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3174/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3175///
3176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3177#[inline]
3178#[target_feature(enable = "sse2")]
3179#[cfg_attr(test, assert_instr(movlhps))]
3180#[stable(feature = "simd_x86", since = "1.27.0")]
3181#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3182pub const fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3183    unsafe { simd_shuffle!(a, b, [0, 2]) }
3184}
3185
3186#[allow(improper_ctypes)]
3187unsafe extern "C" {
3188    #[link_name = "llvm.x86.sse2.pause"]
3189    fn pause();
3190    #[link_name = "llvm.x86.sse2.clflush"]
3191    fn clflush(p: *const u8);
3192    #[link_name = "llvm.x86.sse2.lfence"]
3193    fn lfence();
3194    #[link_name = "llvm.x86.sse2.mfence"]
3195    fn mfence();
3196    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3197    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3198    #[link_name = "llvm.x86.sse2.psad.bw"]
3199    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3200    #[link_name = "llvm.x86.sse2.psll.w"]
3201    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3202    #[link_name = "llvm.x86.sse2.psll.d"]
3203    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3204    #[link_name = "llvm.x86.sse2.psll.q"]
3205    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3206    #[link_name = "llvm.x86.sse2.psra.w"]
3207    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3208    #[link_name = "llvm.x86.sse2.psra.d"]
3209    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3210    #[link_name = "llvm.x86.sse2.psrl.w"]
3211    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3212    #[link_name = "llvm.x86.sse2.psrl.d"]
3213    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3214    #[link_name = "llvm.x86.sse2.psrl.q"]
3215    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3216    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3217    fn cvtps2dq(a: __m128) -> i32x4;
3218    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3219    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3220    #[link_name = "llvm.x86.sse2.packsswb.128"]
3221    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3222    #[link_name = "llvm.x86.sse2.packssdw.128"]
3223    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3224    #[link_name = "llvm.x86.sse2.packuswb.128"]
3225    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3226    #[link_name = "llvm.x86.sse2.max.sd"]
3227    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3228    #[link_name = "llvm.x86.sse2.max.pd"]
3229    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3230    #[link_name = "llvm.x86.sse2.min.sd"]
3231    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3232    #[link_name = "llvm.x86.sse2.min.pd"]
3233    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3234    #[link_name = "llvm.x86.sse2.cmp.sd"]
3235    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3236    #[link_name = "llvm.x86.sse2.cmp.pd"]
3237    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3238    #[link_name = "llvm.x86.sse2.comieq.sd"]
3239    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3240    #[link_name = "llvm.x86.sse2.comilt.sd"]
3241    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3242    #[link_name = "llvm.x86.sse2.comile.sd"]
3243    fn comilesd(a: __m128d, b: __m128d) -> i32;
3244    #[link_name = "llvm.x86.sse2.comigt.sd"]
3245    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3246    #[link_name = "llvm.x86.sse2.comige.sd"]
3247    fn comigesd(a: __m128d, b: __m128d) -> i32;
3248    #[link_name = "llvm.x86.sse2.comineq.sd"]
3249    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3250    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3251    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3252    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3253    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3254    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3255    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3256    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3257    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3258    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3259    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3260    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3261    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3262    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3263    fn cvtpd2dq(a: __m128d) -> i32x4;
3264    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3265    fn cvtsd2si(a: __m128d) -> i32;
3266    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3267    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3268    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3269    fn cvttpd2dq(a: __m128d) -> i32x4;
3270    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3271    fn cvttsd2si(a: __m128d) -> i32;
3272    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3273    fn cvttps2dq(a: __m128) -> i32x4;
3274}
3275
3276#[cfg(test)]
3277mod tests {
3278    use crate::core_arch::assert_eq_const as assert_eq;
3279    use crate::{
3280        core_arch::{simd::*, x86::*},
3281        hint::black_box,
3282    };
3283    use std::{boxed, f32, f64, mem, ptr};
3284    use stdarch_test::simd_test;
3285
3286    const NAN: f64 = f64::NAN;
3287
3288    #[test]
3289    fn test_mm_pause() {
3290        _mm_pause()
3291    }
3292
3293    #[simd_test(enable = "sse2")]
3294    fn test_mm_clflush() {
3295        let x = 0_u8;
3296        unsafe {
3297            _mm_clflush(ptr::addr_of!(x));
3298        }
3299    }
3300
3301    #[simd_test(enable = "sse2")]
3302    // Miri cannot support this until it is clear how it fits in the Rust memory model
3303    #[cfg_attr(miri, ignore)]
3304    fn test_mm_lfence() {
3305        _mm_lfence();
3306    }
3307
3308    #[simd_test(enable = "sse2")]
3309    // Miri cannot support this until it is clear how it fits in the Rust memory model
3310    #[cfg_attr(miri, ignore)]
3311    fn test_mm_mfence() {
3312        _mm_mfence();
3313    }
3314
3315    #[simd_test(enable = "sse2")]
3316    const fn test_mm_add_epi8() {
3317        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3318        #[rustfmt::skip]
3319        let b = _mm_setr_epi8(
3320            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3321        );
3322        let r = _mm_add_epi8(a, b);
3323        #[rustfmt::skip]
3324        let e = _mm_setr_epi8(
3325            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3326        );
3327        assert_eq_m128i(r, e);
3328    }
3329
3330    #[simd_test(enable = "sse2")]
3331    fn test_mm_add_epi8_overflow() {
3332        let a = _mm_set1_epi8(0x7F);
3333        let b = _mm_set1_epi8(1);
3334        let r = _mm_add_epi8(a, b);
3335        assert_eq_m128i(r, _mm_set1_epi8(-128));
3336    }
3337
3338    #[simd_test(enable = "sse2")]
3339    const fn test_mm_add_epi16() {
3340        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3341        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3342        let r = _mm_add_epi16(a, b);
3343        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3344        assert_eq_m128i(r, e);
3345    }
3346
3347    #[simd_test(enable = "sse2")]
3348    const fn test_mm_add_epi32() {
3349        let a = _mm_setr_epi32(0, 1, 2, 3);
3350        let b = _mm_setr_epi32(4, 5, 6, 7);
3351        let r = _mm_add_epi32(a, b);
3352        let e = _mm_setr_epi32(4, 6, 8, 10);
3353        assert_eq_m128i(r, e);
3354    }
3355
3356    #[simd_test(enable = "sse2")]
3357    const fn test_mm_add_epi64() {
3358        let a = _mm_setr_epi64x(0, 1);
3359        let b = _mm_setr_epi64x(2, 3);
3360        let r = _mm_add_epi64(a, b);
3361        let e = _mm_setr_epi64x(2, 4);
3362        assert_eq_m128i(r, e);
3363    }
3364
3365    #[simd_test(enable = "sse2")]
3366    const fn test_mm_adds_epi8() {
3367        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3368        #[rustfmt::skip]
3369        let b = _mm_setr_epi8(
3370            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3371        );
3372        let r = _mm_adds_epi8(a, b);
3373        #[rustfmt::skip]
3374        let e = _mm_setr_epi8(
3375            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3376        );
3377        assert_eq_m128i(r, e);
3378    }
3379
3380    #[simd_test(enable = "sse2")]
3381    fn test_mm_adds_epi8_saturate_positive() {
3382        let a = _mm_set1_epi8(0x7F);
3383        let b = _mm_set1_epi8(1);
3384        let r = _mm_adds_epi8(a, b);
3385        assert_eq_m128i(r, a);
3386    }
3387
3388    #[simd_test(enable = "sse2")]
3389    fn test_mm_adds_epi8_saturate_negative() {
3390        let a = _mm_set1_epi8(-0x80);
3391        let b = _mm_set1_epi8(-1);
3392        let r = _mm_adds_epi8(a, b);
3393        assert_eq_m128i(r, a);
3394    }
3395
3396    #[simd_test(enable = "sse2")]
3397    const fn test_mm_adds_epi16() {
3398        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3399        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3400        let r = _mm_adds_epi16(a, b);
3401        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3402        assert_eq_m128i(r, e);
3403    }
3404
3405    #[simd_test(enable = "sse2")]
3406    fn test_mm_adds_epi16_saturate_positive() {
3407        let a = _mm_set1_epi16(0x7FFF);
3408        let b = _mm_set1_epi16(1);
3409        let r = _mm_adds_epi16(a, b);
3410        assert_eq_m128i(r, a);
3411    }
3412
3413    #[simd_test(enable = "sse2")]
3414    fn test_mm_adds_epi16_saturate_negative() {
3415        let a = _mm_set1_epi16(-0x8000);
3416        let b = _mm_set1_epi16(-1);
3417        let r = _mm_adds_epi16(a, b);
3418        assert_eq_m128i(r, a);
3419    }
3420
3421    #[simd_test(enable = "sse2")]
3422    const fn test_mm_adds_epu8() {
3423        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3424        #[rustfmt::skip]
3425        let b = _mm_setr_epi8(
3426            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3427        );
3428        let r = _mm_adds_epu8(a, b);
3429        #[rustfmt::skip]
3430        let e = _mm_setr_epi8(
3431            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3432        );
3433        assert_eq_m128i(r, e);
3434    }
3435
3436    #[simd_test(enable = "sse2")]
3437    fn test_mm_adds_epu8_saturate() {
3438        let a = _mm_set1_epi8(!0);
3439        let b = _mm_set1_epi8(1);
3440        let r = _mm_adds_epu8(a, b);
3441        assert_eq_m128i(r, a);
3442    }
3443
3444    #[simd_test(enable = "sse2")]
3445    const fn test_mm_adds_epu16() {
3446        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3447        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3448        let r = _mm_adds_epu16(a, b);
3449        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3450        assert_eq_m128i(r, e);
3451    }
3452
3453    #[simd_test(enable = "sse2")]
3454    fn test_mm_adds_epu16_saturate() {
3455        let a = _mm_set1_epi16(!0);
3456        let b = _mm_set1_epi16(1);
3457        let r = _mm_adds_epu16(a, b);
3458        assert_eq_m128i(r, a);
3459    }
3460
3461    #[simd_test(enable = "sse2")]
3462    const fn test_mm_avg_epu8() {
3463        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3464        let r = _mm_avg_epu8(a, b);
3465        assert_eq_m128i(r, _mm_set1_epi8(6));
3466    }
3467
3468    #[simd_test(enable = "sse2")]
3469    const fn test_mm_avg_epu16() {
3470        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3471        let r = _mm_avg_epu16(a, b);
3472        assert_eq_m128i(r, _mm_set1_epi16(6));
3473    }
3474
3475    #[simd_test(enable = "sse2")]
3476    fn test_mm_madd_epi16() {
3477        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3478        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3479        let r = _mm_madd_epi16(a, b);
3480        let e = _mm_setr_epi32(29, 81, 149, 233);
3481        assert_eq_m128i(r, e);
3482
3483        // Test large values.
3484        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3485        let a = _mm_setr_epi16(
3486            i16::MAX,
3487            i16::MAX,
3488            i16::MIN,
3489            i16::MIN,
3490            i16::MIN,
3491            i16::MAX,
3492            0,
3493            0,
3494        );
3495        let b = _mm_setr_epi16(
3496            i16::MAX,
3497            i16::MAX,
3498            i16::MIN,
3499            i16::MIN,
3500            i16::MAX,
3501            i16::MIN,
3502            0,
3503            0,
3504        );
3505        let r = _mm_madd_epi16(a, b);
3506        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3507        assert_eq_m128i(r, e);
3508    }
3509
3510    #[simd_test(enable = "sse2")]
3511    const fn test_mm_max_epi16() {
3512        let a = _mm_set1_epi16(1);
3513        let b = _mm_set1_epi16(-1);
3514        let r = _mm_max_epi16(a, b);
3515        assert_eq_m128i(r, a);
3516    }
3517
3518    #[simd_test(enable = "sse2")]
3519    const fn test_mm_max_epu8() {
3520        let a = _mm_set1_epi8(1);
3521        let b = _mm_set1_epi8(!0);
3522        let r = _mm_max_epu8(a, b);
3523        assert_eq_m128i(r, b);
3524    }
3525
3526    #[simd_test(enable = "sse2")]
3527    const fn test_mm_min_epi16() {
3528        let a = _mm_set1_epi16(1);
3529        let b = _mm_set1_epi16(-1);
3530        let r = _mm_min_epi16(a, b);
3531        assert_eq_m128i(r, b);
3532    }
3533
3534    #[simd_test(enable = "sse2")]
3535    const fn test_mm_min_epu8() {
3536        let a = _mm_set1_epi8(1);
3537        let b = _mm_set1_epi8(!0);
3538        let r = _mm_min_epu8(a, b);
3539        assert_eq_m128i(r, a);
3540    }
3541
3542    #[simd_test(enable = "sse2")]
3543    const fn test_mm_mulhi_epi16() {
3544        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3545        let r = _mm_mulhi_epi16(a, b);
3546        assert_eq_m128i(r, _mm_set1_epi16(-16));
3547    }
3548
3549    #[simd_test(enable = "sse2")]
3550    const fn test_mm_mulhi_epu16() {
3551        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3552        let r = _mm_mulhi_epu16(a, b);
3553        assert_eq_m128i(r, _mm_set1_epi16(15));
3554    }
3555
3556    #[simd_test(enable = "sse2")]
3557    const fn test_mm_mullo_epi16() {
3558        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3559        let r = _mm_mullo_epi16(a, b);
3560        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3561    }
3562
3563    #[simd_test(enable = "sse2")]
3564    const fn test_mm_mul_epu32() {
3565        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3566        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3567        let r = _mm_mul_epu32(a, b);
3568        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3569        assert_eq_m128i(r, e);
3570    }
3571
3572    #[simd_test(enable = "sse2")]
3573    fn test_mm_sad_epu8() {
3574        #[rustfmt::skip]
3575        let a = _mm_setr_epi8(
3576            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3577            1, 2, 3, 4,
3578            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3579            1, 2, 3, 4,
3580        );
3581        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3582        let r = _mm_sad_epu8(a, b);
3583        let e = _mm_setr_epi64x(1020, 614);
3584        assert_eq_m128i(r, e);
3585    }
3586
3587    #[simd_test(enable = "sse2")]
3588    const fn test_mm_sub_epi8() {
3589        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3590        let r = _mm_sub_epi8(a, b);
3591        assert_eq_m128i(r, _mm_set1_epi8(-1));
3592    }
3593
3594    #[simd_test(enable = "sse2")]
3595    const fn test_mm_sub_epi16() {
3596        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3597        let r = _mm_sub_epi16(a, b);
3598        assert_eq_m128i(r, _mm_set1_epi16(-1));
3599    }
3600
3601    #[simd_test(enable = "sse2")]
3602    const fn test_mm_sub_epi32() {
3603        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3604        let r = _mm_sub_epi32(a, b);
3605        assert_eq_m128i(r, _mm_set1_epi32(-1));
3606    }
3607
3608    #[simd_test(enable = "sse2")]
3609    const fn test_mm_sub_epi64() {
3610        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3611        let r = _mm_sub_epi64(a, b);
3612        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3613    }
3614
3615    #[simd_test(enable = "sse2")]
3616    const fn test_mm_subs_epi8() {
3617        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3618        let r = _mm_subs_epi8(a, b);
3619        assert_eq_m128i(r, _mm_set1_epi8(3));
3620    }
3621
3622    #[simd_test(enable = "sse2")]
3623    fn test_mm_subs_epi8_saturate_positive() {
3624        let a = _mm_set1_epi8(0x7F);
3625        let b = _mm_set1_epi8(-1);
3626        let r = _mm_subs_epi8(a, b);
3627        assert_eq_m128i(r, a);
3628    }
3629
3630    #[simd_test(enable = "sse2")]
3631    fn test_mm_subs_epi8_saturate_negative() {
3632        let a = _mm_set1_epi8(-0x80);
3633        let b = _mm_set1_epi8(1);
3634        let r = _mm_subs_epi8(a, b);
3635        assert_eq_m128i(r, a);
3636    }
3637
3638    #[simd_test(enable = "sse2")]
3639    const fn test_mm_subs_epi16() {
3640        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3641        let r = _mm_subs_epi16(a, b);
3642        assert_eq_m128i(r, _mm_set1_epi16(3));
3643    }
3644
3645    #[simd_test(enable = "sse2")]
3646    fn test_mm_subs_epi16_saturate_positive() {
3647        let a = _mm_set1_epi16(0x7FFF);
3648        let b = _mm_set1_epi16(-1);
3649        let r = _mm_subs_epi16(a, b);
3650        assert_eq_m128i(r, a);
3651    }
3652
3653    #[simd_test(enable = "sse2")]
3654    fn test_mm_subs_epi16_saturate_negative() {
3655        let a = _mm_set1_epi16(-0x8000);
3656        let b = _mm_set1_epi16(1);
3657        let r = _mm_subs_epi16(a, b);
3658        assert_eq_m128i(r, a);
3659    }
3660
3661    #[simd_test(enable = "sse2")]
3662    const fn test_mm_subs_epu8() {
3663        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3664        let r = _mm_subs_epu8(a, b);
3665        assert_eq_m128i(r, _mm_set1_epi8(3));
3666    }
3667
3668    #[simd_test(enable = "sse2")]
3669    fn test_mm_subs_epu8_saturate() {
3670        let a = _mm_set1_epi8(0);
3671        let b = _mm_set1_epi8(1);
3672        let r = _mm_subs_epu8(a, b);
3673        assert_eq_m128i(r, a);
3674    }
3675
3676    #[simd_test(enable = "sse2")]
3677    const fn test_mm_subs_epu16() {
3678        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3679        let r = _mm_subs_epu16(a, b);
3680        assert_eq_m128i(r, _mm_set1_epi16(3));
3681    }
3682
3683    #[simd_test(enable = "sse2")]
3684    fn test_mm_subs_epu16_saturate() {
3685        let a = _mm_set1_epi16(0);
3686        let b = _mm_set1_epi16(1);
3687        let r = _mm_subs_epu16(a, b);
3688        assert_eq_m128i(r, a);
3689    }
3690
3691    #[simd_test(enable = "sse2")]
3692    const fn test_mm_slli_si128() {
3693        #[rustfmt::skip]
3694        let a = _mm_setr_epi8(
3695            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3696        );
3697        let r = _mm_slli_si128::<1>(a);
3698        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3699        assert_eq_m128i(r, e);
3700
3701        #[rustfmt::skip]
3702        let a = _mm_setr_epi8(
3703            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3704        );
3705        let r = _mm_slli_si128::<15>(a);
3706        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3707        assert_eq_m128i(r, e);
3708
3709        #[rustfmt::skip]
3710        let a = _mm_setr_epi8(
3711            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3712        );
3713        let r = _mm_slli_si128::<16>(a);
3714        assert_eq_m128i(r, _mm_set1_epi8(0));
3715    }
3716
3717    #[simd_test(enable = "sse2")]
3718    const fn test_mm_slli_epi16() {
3719        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3720        let r = _mm_slli_epi16::<4>(a);
3721        assert_eq_m128i(
3722            r,
3723            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3724        );
3725        let r = _mm_slli_epi16::<16>(a);
3726        assert_eq_m128i(r, _mm_set1_epi16(0));
3727    }
3728
3729    #[simd_test(enable = "sse2")]
3730    fn test_mm_sll_epi16() {
3731        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3732        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3733        assert_eq_m128i(
3734            r,
3735            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3736        );
3737        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3738        assert_eq_m128i(r, a);
3739        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3740        assert_eq_m128i(r, _mm_set1_epi16(0));
3741        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3742        assert_eq_m128i(r, _mm_set1_epi16(0));
3743    }
3744
3745    #[simd_test(enable = "sse2")]
3746    const fn test_mm_slli_epi32() {
3747        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3748        let r = _mm_slli_epi32::<4>(a);
3749        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3750        let r = _mm_slli_epi32::<32>(a);
3751        assert_eq_m128i(r, _mm_set1_epi32(0));
3752    }
3753
3754    #[simd_test(enable = "sse2")]
3755    fn test_mm_sll_epi32() {
3756        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3757        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3758        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3759        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3760        assert_eq_m128i(r, a);
3761        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3762        assert_eq_m128i(r, _mm_set1_epi32(0));
3763        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3764        assert_eq_m128i(r, _mm_set1_epi32(0));
3765    }
3766
3767    #[simd_test(enable = "sse2")]
3768    const fn test_mm_slli_epi64() {
3769        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3770        let r = _mm_slli_epi64::<4>(a);
3771        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3772        let r = _mm_slli_epi64::<64>(a);
3773        assert_eq_m128i(r, _mm_set1_epi64x(0));
3774    }
3775
3776    #[simd_test(enable = "sse2")]
3777    fn test_mm_sll_epi64() {
3778        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3779        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3780        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3781        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3782        assert_eq_m128i(r, a);
3783        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3784        assert_eq_m128i(r, _mm_set1_epi64x(0));
3785        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3786        assert_eq_m128i(r, _mm_set1_epi64x(0));
3787    }
3788
3789    #[simd_test(enable = "sse2")]
3790    const fn test_mm_srai_epi16() {
3791        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3792        let r = _mm_srai_epi16::<4>(a);
3793        assert_eq_m128i(
3794            r,
3795            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3796        );
3797        let r = _mm_srai_epi16::<16>(a);
3798        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3799    }
3800
3801    #[simd_test(enable = "sse2")]
3802    fn test_mm_sra_epi16() {
3803        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3804        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3805        assert_eq_m128i(
3806            r,
3807            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3808        );
3809        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3810        assert_eq_m128i(r, a);
3811        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3812        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3813        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3814        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3815    }
3816
3817    #[simd_test(enable = "sse2")]
3818    const fn test_mm_srai_epi32() {
3819        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3820        let r = _mm_srai_epi32::<4>(a);
3821        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3822        let r = _mm_srai_epi32::<32>(a);
3823        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3824    }
3825
3826    #[simd_test(enable = "sse2")]
3827    fn test_mm_sra_epi32() {
3828        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3829        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3830        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3831        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3832        assert_eq_m128i(r, a);
3833        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3834        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3835        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3836        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3837    }
3838
3839    #[simd_test(enable = "sse2")]
3840    const fn test_mm_srli_si128() {
3841        #[rustfmt::skip]
3842        let a = _mm_setr_epi8(
3843            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3844        );
3845        let r = _mm_srli_si128::<1>(a);
3846        #[rustfmt::skip]
3847        let e = _mm_setr_epi8(
3848            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3849        );
3850        assert_eq_m128i(r, e);
3851
3852        #[rustfmt::skip]
3853        let a = _mm_setr_epi8(
3854            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3855        );
3856        let r = _mm_srli_si128::<15>(a);
3857        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3858        assert_eq_m128i(r, e);
3859
3860        #[rustfmt::skip]
3861        let a = _mm_setr_epi8(
3862            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3863        );
3864        let r = _mm_srli_si128::<16>(a);
3865        assert_eq_m128i(r, _mm_set1_epi8(0));
3866    }
3867
3868    #[simd_test(enable = "sse2")]
3869    const fn test_mm_srli_epi16() {
3870        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3871        let r = _mm_srli_epi16::<4>(a);
3872        assert_eq_m128i(
3873            r,
3874            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3875        );
3876        let r = _mm_srli_epi16::<16>(a);
3877        assert_eq_m128i(r, _mm_set1_epi16(0));
3878    }
3879
3880    #[simd_test(enable = "sse2")]
3881    fn test_mm_srl_epi16() {
3882        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3883        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3884        assert_eq_m128i(
3885            r,
3886            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3887        );
3888        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3889        assert_eq_m128i(r, a);
3890        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3891        assert_eq_m128i(r, _mm_set1_epi16(0));
3892        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3893        assert_eq_m128i(r, _mm_set1_epi16(0));
3894    }
3895
3896    #[simd_test(enable = "sse2")]
3897    const fn test_mm_srli_epi32() {
3898        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3899        let r = _mm_srli_epi32::<4>(a);
3900        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3901        let r = _mm_srli_epi32::<32>(a);
3902        assert_eq_m128i(r, _mm_set1_epi32(0));
3903    }
3904
3905    #[simd_test(enable = "sse2")]
3906    fn test_mm_srl_epi32() {
3907        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3908        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
        assert_eq_m128i(r, _mm_set1_epi32(0));
        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi32(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_srli_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srli_epi64::<4>(a);
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srli_epi64::<64>(a);
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_srl_epi64() {
        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
        assert_eq_m128i(r, a);
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_and_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_and_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_andnot_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(2));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_or_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_or_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(7));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_xor_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            )
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi16(a, b);
        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpeq_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(3, 2, 2, 0);
        let r = _mm_cmpeq_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi8() {
        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi8(0);
        let r = _mm_cmpgt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi16() {
        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi16(0);
        let r = _mm_cmpgt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmpgt_epi32() {
        let a = _mm_set_epi32(5, 0, 0, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_cmpgt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi8() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cmplt_epi32() {
        let a = _mm_set1_epi32(0);
        let b = _mm_set_epi32(5, 0, 0, 0);
        let r = _mm_cmplt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_pd() {
        let a = _mm_set_epi32(35, 25, 15, 5);
        let r = _mm_cvtepi32_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_sd() {
        let a = _mm_set1_pd(3.5);
        let r = _mm_cvtsi32_sd(a, 5);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtepi32_ps() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ps(a);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtps_epi32() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi32_si128() {
        let r = _mm_cvtsi32_si128(5);
        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsi128_si32() {
        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
        assert_eq!(r, 5);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi64x() {
        let r = _mm_set_epi64x(0, 1);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi32() {
        let r = _mm_set_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi16() {
        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_epi8() {
        #[rustfmt::skip]
        let r = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi64x() {
        let r = _mm_set1_epi64x(1);
        assert_eq_m128i(r, _mm_set1_epi64x(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi32() {
        let r = _mm_set1_epi32(1);
        assert_eq_m128i(r, _mm_set1_epi32(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi16() {
        let r = _mm_set1_epi16(1);
        assert_eq_m128i(r, _mm_set1_epi16(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_epi8() {
        let r = _mm_set1_epi8(1);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi32() {
        let r = _mm_setr_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi16() {
        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_si128() {
        let r = _mm_setzero_si128();
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadl_epi64() {
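        // `movq` load: only the low 64 bits are read; the high lane is zeroed.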
        let a = _mm_setr_epi64x(6, 5);
        let r = unsafe { _mm_loadl_epi64(ptr::addr_of!(a)) };
        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = unsafe { _mm_load_si128(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si128() {
        let a = _mm_set_epi64x(5, 6);
        let r = unsafe { _mm_loadu_si128(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_maskmoveu_si128() {
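        // `maskmovdqu` writes only the bytes of `a` whose corresponding mask
        // byte has its most significant bit set; here that is lane 13 (0x80),
        // so exactly one byte of `r` is overwritten.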
        let a = _mm_set1_epi8(9);
        #[rustfmt::skip]
        let mask = _mm_set_epi8(
            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
        );
        let mut r = _mm_set1_epi8(0);
        unsafe {
            _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
        }
        _mm_sfence();
        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        unsafe {
            _mm_store_si128(&mut r, a);
        }
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si128() {
        let a = _mm_set1_epi8(9);
        let mut r = _mm_set1_epi8(0);
        unsafe {
            _mm_storeu_si128(&mut r, a);
        }
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storel_epi64() {
        let a = _mm_setr_epi64x(2, 9);
        let mut r = _mm_set1_epi8(0);
        unsafe {
            _mm_storel_epi64(&mut r, a);
        }
        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_si128() {
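        // `movntdq` performs a weakly-ordered, non-temporal store;
        // `_mm_sfence` makes such stores globally visible before later stores.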
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_undefined_si128();
        unsafe {
            _mm_stream_si128(ptr::addr_of_mut!(r), a);
        }
        _mm_sfence();
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_si32() {
        let a: i32 = 7;
        let mut mem = boxed::Box::<i32>::new(-1);
        unsafe {
            _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
        }
        _mm_sfence();
        assert_eq!(a, *mem);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_epi64() {
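        // `movq` copies the low 64-bit lane and zeroes the upper lane.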
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_move_epi64(a);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi16() {
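        // `packsswb` narrows each i16 lane to i8 with signed saturation:
        // 0x80 clamps to i8::MAX (0x7F) and -0x81 clamps to i8::MIN (-0x80).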
        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
        let r = _mm_packs_epi16(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
            )
        );
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packs_epi32() {
        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
        let r = _mm_packs_epi32(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
        );
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_packus_epi16() {
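        // `packuswb` narrows with unsigned saturation: negative inputs clamp
        // to 0 and 0x100 clamps to u8::MAX (!0).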
        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
        let r = _mm_packus_epi16(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_extract_epi16() {
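        // `pextrw` zero-extends the selected 16-bit lane, so the -1 in lane 0
        // reads back as 0xFFFF.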
        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm_extract_epi16::<0>(a);
        let r2 = _mm_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_insert_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm_insert_epi16::<0>(a, 9);
        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_epi8() {
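        // `pmovmskb` gathers the sign (top) bit of each of the 16 bytes into
        // the low 16 bits of the result.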
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100110_00100101);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_epi32() {
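        // Each 2-bit field of the immediate selects a source lane, low field
        // first: 0b00_01_01_11 picks a[3], a[1], a[1], a[0].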
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_max_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_max_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_min_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_min_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_and_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_and_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(1)).as_m128d();
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_andnot_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(2)).as_m128d();
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_or_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_or_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(7)).as_m128d();
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_xor_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(6)).as_m128d();
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_pd() {
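        // `movmskpd` packs the sign bits of the two f64 lanes into the low
        // two bits of the result (bit 0 = low lane).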
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

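    // 16-byte aligned scratch storage for the load/store tests below;
    // `_mm_load_pd`/`_mm_store_pd` and friends require 16-byte alignment,
    // which `repr(align(16))` guarantees.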
    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = unsafe { _mm_load_pd(d) };
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = unsafe { _mm_load_sd(&a) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = unsafe { _mm_loadh_pd(a, &b) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = unsafe { _mm_loadl_pd(a, &b) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        unsafe {
            _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        }
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_store_sd(&mut dest, a);
        }
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store_pd(d, *black_box(&a));
        }
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        // so p is *not* aligned to 16 bytes
        unsafe {
            let p = vals.as_mut_ptr().offset(1);
            _mm_storeu_pd(p, *black_box(&a));
        }

        assert_eq!(*vals, [0.0, 1.0, 2.0, 0.0]);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        unsafe {
            _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        unsafe {
            _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        unsafe {
            _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store1_pd(d, *black_box(&a));
        }
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store_pd1(d, *black_box(&a));
        }
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_storer_pd(d, *black_box(&a));
        }
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_storeh_pd(&mut dest, a);
        }
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_storel_pd(&mut dest, a);
        }
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = unsafe { _mm_loadr_pd(d) };
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;

        // so this will *not* be aligned to 16 bytes
        let d = unsafe { vals.as_ptr().offset(1) };

        let r = unsafe { _mm_loadu_pd(d) };
        let e = _mm_setr_pd(2.0, 3.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = unsafe { _mm_loadu_si16(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = unsafe { _mm_loadu_si32(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = unsafe { _mm_loadu_si64(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtpd_epi32() {
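        // `cvtpd2dq` converts using the current rounding mode (round to
        // nearest even by default); values that do not fit in an i32,
        // including NaN and the infinities, become i32::MIN, the x86
        // "integer indefinite" value.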
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load1_pd() {
        let d = -5.0;
        let r = unsafe { _mm_load1_pd(&d) };
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_pd1() {
        let d = -5.0;
        let r = unsafe { _mm_load_pd1(&d) };
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
}