// Source file: core/stdarch/crates/core_arch/src/x86/sse2.rs
1//! Streaming SIMD Extensions 2 (SSE2)
2
3#[cfg(test)]
4use stdarch_test::assert_instr;
5
6use crate::{
7    core_arch::{simd::*, x86::*},
8    intrinsics::simd::*,
9    intrinsics::sqrtf64,
10    mem, ptr,
11};
12
/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
// Only assert the `pause` instruction when the test binary itself is built
// with SSE2; on older CPUs the encoding decodes as a plain `nop`.
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    unsafe { pause() }
}
27
/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    // Thin wrapper over the `clflush` LLVM intrinsic; `p` need not be aligned
    // to the cache-line boundary.
    clflush(p)
}
39
/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, is
/// globally visible before any load instruction which follows the fence in
/// program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_lfence() {
    // Safe wrapper: the fence has no memory-safety preconditions.
    unsafe { lfence() }
}
55
/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mfence() {
    // Safe wrapper: the fence has no memory-safety preconditions.
    unsafe { mfence() }
}
71
/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Reinterpret both operands as 16 i8 lanes, add lane-wise, and
    // reinterpret the result back into the opaque `__m128i` type.
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}
83
/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise add on the 8 x i16 view of each operand.
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}
95
/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise add on the 4 x i32 view of each operand.
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}
107
/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise add on the 2 x i64 view of each operand.
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}
119
/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Saturating add on signed i8 lanes: results clamp to [i8::MIN, i8::MAX]
    // instead of wrapping.
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}
131
/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Saturating add on signed i16 lanes.
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}
143
/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    // Saturating add on unsigned u8 lanes: results clamp to u8::MAX.
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}
155
/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    // Saturating add on unsigned u16 lanes: results clamp to u16::MAX.
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}
167
/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Widen both operands to u16 lanes so the intermediate sum cannot
        // overflow, compute the rounded average (a + b + 1) >> 1, then
        // narrow back to u8 lanes.
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}
184
/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Widen to u32 lanes to avoid overflow, compute the rounded average
        // (a + b + 1) >> 1, then narrow back to u16 lanes.
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}
201
/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
    //
    // ```rust
    // #[target_feature(enable = "sse2")]
    // unsafe fn widening_add(mad: __m128i) -> __m128i {
    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
    // }
    // ```
    //
    // If we implement this using generic vector intrinsics, the optimizer
    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
    // For this reason, we use x86 intrinsics.
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}
228
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise signed maximum on the i16 view.
    unsafe { simd_imax(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}
241
/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise maximum on the unsigned u8 view; `simd_imax` follows the
    // signedness of the lane type, so this compares unsigned.
    unsafe { simd_imax(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}
254
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise signed minimum on the i16 view.
    unsafe { simd_imin(a.as_i16x8(), b.as_i16x8()).as_m128i() }
}
267
/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise minimum on the unsigned u8 view; `simd_imin` follows the
    // signedness of the lane type, so this compares unsigned.
    unsafe { simd_imin(a.as_u8x16(), b.as_u8x16()).as_m128i() }
}
280
/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Sign-extend each i16 lane to i32, multiply without overflow, then
        // shift right 16 (arithmetic, on i32 lanes) to keep the high half.
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}
300
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Zero-extend each u16 lane to u32, multiply without overflow, then
        // shift right 16 (logical, on u32 lanes) to keep the high half.
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}
320
/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    // The low 16 bits of the product are the same as a plain lane-wise
    // i16 multiply, so no widening is needed here.
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}
335
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // View as 2 x u64 lanes, mask off the upper 32 bits of each lane,
        // then multiply: a 32x32 -> 64 bit widening multiply per lane.
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
355
/// Sum the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to produce
/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
/// the low 16 bits of 64-bit elements returned.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    // Delegates to the dedicated `psadbw` intrinsic; there is no generic
    // vector form of this horizontal reduction.
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}
371
/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise subtract (a - b) on the 16 x i8 view.
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}
383
/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise subtract (a - b) on the 8 x i16 view.
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}
395
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise subtract (a - b) on the 4 x i32 view.
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}
407
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    // Lane-wise subtract (a - b) on the 2 x i64 view.
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}
419
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    // Saturating subtract on signed i8 lanes: results clamp instead of wrap.
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}
432
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Saturating subtract on signed i16 lanes.
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}
445
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    // Saturating subtract on unsigned u8 lanes: results clamp at 0.
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}
458
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    // Saturating subtract on unsigned u16 lanes: results clamp at 0.
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}
471
/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    // IMM8 must fit in 8 bits; the impl helper handles shifts > 15 by
    // producing all zeros.
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}
485
/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    // The shuffle concatenates [zero vector (indices 0..16), a (indices
    // 16..32)]. For result byte `i`:
    //   - shift > 15: select index `i`, i.e. a zero byte (whole vector zeroed);
    //   - otherwise: select `16 - shift + i`, i.e. byte `i - shift` of `a`,
    //     with the low `shift` bytes falling into the zero vector.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}
519
/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    // Alias of `_mm_slli_si128`: same byte-wise left shift, different
    // Intel-mandated name.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}
535
/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    // Alias of `_mm_srli_si128`: same byte-wise right shift, different
    // Intel-mandated name.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}
551
/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shifting by the lane width or more yields zero (matching the
        // hardware), and must be special-cased because `simd_shl` by >= the
        // lane width is undefined behavior.
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}
571
/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    // Runtime shift count: delegate to the `psllw` intrinsic, which defines
    // behavior for all count values.
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}
583
/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift counts >= 32 zero the vector; see `_mm_slli_epi16` for why
        // this is special-cased.
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}
603
/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    // Runtime shift count: delegate to the `pslld` intrinsic.
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}
615
/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift counts >= 64 zero the vector; see `_mm_slli_epi16` for why
        // this is special-cased.
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}
635
/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    // Runtime shift count: delegate to the `psllq` intrinsic.
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}
647
/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    // Arithmetic shift on i16 lanes; the count is clamped to 15 because
    // larger counts behave like a full sign-fill (every bit becomes the
    // sign bit), and `simd_shr` by >= the lane width would be UB.
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}
662
/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    // Runtime shift count: delegate to the `psraw` intrinsic.
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}
674
/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    // Arithmetic shift on i32 lanes; count clamped to 31 — see
    // `_mm_srai_epi16` for rationale.
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}
689
/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    // Runtime shift count: delegate to the `psrad` intrinsic.
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}
701
/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    // IMM8 must fit in 8 bits; the impl helper handles shifts > 15 by
    // producing all zeros.
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}
715
/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
const unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    // The shuffle concatenates [a (indices 0..16), zero vector (indices
    // 16..32)]. For result byte `i`:
    //   - shift > 15: select `i + 16`, i.e. a zero byte (whole vector zeroed);
    //   - otherwise: select `i + shift`, i.e. byte `i + shift` of `a`, with
    //     the high `shift` bytes spilling into the zero vector.
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}
753
/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `simd_shr` by a count >= the lane width is undefined, so counts of
        // 16 or more are handled explicitly by returning zero, which matches
        // the hardware `psrlw` semantics.
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}
774
/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    // Delegate to `psrlw`: per the ISA, the shift amount comes from the low
    // 64 bits of `count`, and counts >= 16 zero the result.
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}
786
/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `simd_shr` by a count >= the lane width is undefined, so counts of
        // 32 or more return zero explicitly, matching `psrld`.
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}
807
/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    // Delegate to `psrld`: per the ISA, the shift amount comes from the low
    // 64 bits of `count`, and counts >= 32 zero the result.
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}
819
/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `simd_shr` by a count >= the lane width is undefined, so counts of
        // 64 or more return zero explicitly, matching `psrlq`.
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}
840
/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    // Delegate to `psrlq`: per the ISA, the shift amount comes from the low
    // 64 bits of `count`, and counts >= 64 zero the result.
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}
852
/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
// LLVM commonly selects the float-domain encoding (andps) for a plain
// vector AND, hence the assertion below rather than `pand`.
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}
865
/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    // Computes `(!a) & b`, with the NOT expressed as XOR against all-ones
    // (a vector of -1 bytes) since there is no vector NOT primitive.
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}
878
/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
// LLVM commonly selects the float-domain encoding (orps) for a plain
// vector OR, hence the assertion below rather than `por`.
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}
891
/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
// LLVM commonly selects the float-domain encoding (xorps) for a plain
// vector XOR, hence the assertion below rather than `pxor`.
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}
904
/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) where equal, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}
916
/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) where equal, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}
928
/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// Each result lane is all-ones (`-1`) where equal, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}
940
/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed; each result lane is all-ones (`-1`) where
/// `a > b`, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}
952
/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed; each result lane is all-ones (`-1`) where
/// `a > b`, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}
964
/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// The comparison is signed; each result lane is all-ones (`-1`) where
/// `a > b`, all-zeros otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}
976
/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; codegen emits `pcmpgtb`
/// with the operands swapped, hence the assertion below.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}
988
/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; codegen emits `pcmpgtw`
/// with the operands swapped, hence the assertion below.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}
1000
/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// There is no dedicated less-than instruction; codegen emits `pcmpgtd`
/// with the operands swapped, hence the assertion below.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}
1012
/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        // Select the two low lanes ([0, 1]) and widen each i32 to f64;
        // every i32 is exactly representable as f64, so this is lossless.
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}
1028
/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    // Overwrite lane 0 with the converted value; the upper lane of `a` is
    // preserved. The i32 -> f64 conversion is always exact.
    unsafe { simd_insert!(a, 0, b as f64) }
}
1041
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    // Lane-wise i32 -> f32 conversion; values with more than 24 significant
    // bits are rounded to the nearest representable f32.
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}
1054
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    // Delegates to `cvtps2dq`, which rounds using the current MXCSR rounding
    // mode (round-to-nearest-even by default) and produces the integer
    // indefinite value (i32::MIN) for NaN or out-of-range inputs, per the ISA.
    unsafe { transmute(cvtps2dq(a)) }
}
1066
1067/// Returns a vector whose lowest element is `a` and all higher elements are
1068/// `0`.
1069///
1070/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
1071#[inline]
1072#[target_feature(enable = "sse2")]
1073#[stable(feature = "simd_x86", since = "1.27.0")]
1074#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1075pub const fn _mm_cvtsi32_si128(a: i32) -> __m128i {
1076    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
1077}
1078
/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    // Extract lane 0 when the vector is viewed as four 32-bit integers.
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}
1089
1090/// Sets packed 64-bit integers with the supplied values, from highest to
1091/// lowest.
1092///
1093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
1094#[inline]
1095#[target_feature(enable = "sse2")]
1096// no particular instruction to test
1097#[stable(feature = "simd_x86", since = "1.27.0")]
1098#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1099pub const fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
1100    unsafe { transmute(i64x2::new(e0, e1)) }
1101}
1102
1103/// Sets packed 32-bit integers with the supplied values.
1104///
1105/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
1106#[inline]
1107#[target_feature(enable = "sse2")]
1108// no particular instruction to test
1109#[stable(feature = "simd_x86", since = "1.27.0")]
1110#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1111pub const fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1112    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
1113}
1114
1115/// Sets packed 16-bit integers with the supplied values.
1116///
1117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
1118#[inline]
1119#[target_feature(enable = "sse2")]
1120// no particular instruction to test
1121#[stable(feature = "simd_x86", since = "1.27.0")]
1122#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1123pub const fn _mm_set_epi16(
1124    e7: i16,
1125    e6: i16,
1126    e5: i16,
1127    e4: i16,
1128    e3: i16,
1129    e2: i16,
1130    e1: i16,
1131    e0: i16,
1132) -> __m128i {
1133    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
1134}
1135
1136/// Sets packed 8-bit integers with the supplied values.
1137///
1138/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
1139#[inline]
1140#[target_feature(enable = "sse2")]
1141// no particular instruction to test
1142#[stable(feature = "simd_x86", since = "1.27.0")]
1143#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1144pub const fn _mm_set_epi8(
1145    e15: i8,
1146    e14: i8,
1147    e13: i8,
1148    e12: i8,
1149    e11: i8,
1150    e10: i8,
1151    e9: i8,
1152    e8: i8,
1153    e7: i8,
1154    e6: i8,
1155    e5: i8,
1156    e4: i8,
1157    e3: i8,
1158    e2: i8,
1159    e1: i8,
1160    e0: i8,
1161) -> __m128i {
1162    unsafe {
1163        #[rustfmt::skip]
1164        transmute(i8x16::new(
1165            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1166        ))
1167    }
1168}
1169
/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi64x(a: i64) -> __m128i {
    // `splat` repeats `a` in both lanes; `as_m128i` reinterprets the bits.
    i64x2::splat(a).as_m128i()
}
1181
/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi32(a: i32) -> __m128i {
    // `splat` repeats `a` in all four lanes; `as_m128i` reinterprets the bits.
    i32x4::splat(a).as_m128i()
}
1193
/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi16(a: i16) -> __m128i {
    // `splat` repeats `a` in all eight lanes; `as_m128i` reinterprets the bits.
    i16x8::splat(a).as_m128i()
}
1205
/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_epi8(a: i8) -> __m128i {
    // `splat` repeats `a` in all sixteen lanes; `as_m128i` reinterprets the bits.
    i8x16::splat(a).as_m128i()
}
1217
1218/// Sets packed 32-bit integers with the supplied values in reverse order.
1219///
1220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
1221#[inline]
1222#[target_feature(enable = "sse2")]
1223// no particular instruction to test
1224#[stable(feature = "simd_x86", since = "1.27.0")]
1225#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1226pub const fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1227    _mm_set_epi32(e0, e1, e2, e3)
1228}
1229
/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// The first argument lands in the lowest element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    // Forward to `_mm_set_epi16` with the argument list reversed.
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}
1250
/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// The first argument lands in the lowest element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    // Forward to `_mm_set_epi8` with the argument list reversed.
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}
1282
/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_si128() -> __m128i {
    // The all-zero bit pattern is a valid `__m128i`, so `mem::zeroed` is
    // sound here; the `const` block evaluates it at compile time.
    const { unsafe { mem::zeroed() } }
}
1294
/// Loads 64-bit integer from memory into first element of returned vector.
///
/// `mem_addr` does not need to be aligned; only the low 8 bytes it points to
/// are read, and the upper 64 bits of the result are zeroed.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    // Unaligned 8-byte read, zero-extended into the high lane.
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}
1305
/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    // A plain dereference: the caller guarantees validity and 16-byte
    // alignment, so this compiles to a single aligned vector load.
    *mem_addr
}
1322
/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    // Express the unaligned load as a 16-byte memcpy into a scratch vector;
    // the byte copy carries no alignment requirement on `mem_addr` and
    // lowers to a single unaligned vector load.
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}
1342
/// Conditionally store 8-bit integer elements from `a` into memory using
/// `mask` flagged as non-temporal (unlikely to be used again soon).
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    // Delegate to the `maskmovdqu` intrinsic; the high bit of each mask byte
    // selects whether the corresponding byte of `a` is written.
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}
1369
/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    // A plain assignment: the caller guarantees validity and 16-byte
    // alignment, so this compiles to a single aligned vector store.
    *mem_addr = a;
}
1386
/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    // `write_unaligned` drops the alignment requirement, lowering to a
    // single unaligned vector store.
    mem_addr.write_unaligned(a);
}
1400
/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    // Copy only the low 8 bytes of `a`; the byte-wise copy imposes no
    // alignment requirement on `mem_addr`.
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}
1413
/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `movntdq` is actually emitted
    // and that LLVM cannot reorder or fold the store away.
    crate::arch::asm!(
        vps!("movntdq",  ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
1441
/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `movnti` is actually emitted
    // and that LLVM cannot reorder or fold the store away.
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}
1469
/// Returns a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on msvc, movd on i686
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
    unsafe {
        // Shuffle index 0 selects the low lane of `a`; index 2 selects the
        // first lane of the zero vector, so the high lane is cleared.
        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
        transmute(r)
    }
}
1486
/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    // Saturating pack written with portable `simd_*` operations so the
    // function can be `const`; codegen still recognizes it as `packsswb`.
    unsafe {
        // Clamp each 16-bit lane into the representable i8 range before
        // truncation — this is the "signed saturation" part.
        let max = simd_splat(i8::MAX as i16);
        let min = simd_splat(i8::MIN as i16);

        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
            .as_m128i()
            .as_i8x16();
        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
            .as_m128i()
            .as_i8x16();

        // Shuffle the low i8 of each i16 from two concatenated vectors into
        // the low bits of the result register.
        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m128i()
    }
}
1516
1517/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
1518/// using signed saturation.
1519///
1520/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1521#[inline]
1522#[target_feature(enable = "sse2")]
1523#[cfg_attr(test, assert_instr(packssdw))]
1524#[stable(feature = "simd_x86", since = "1.27.0")]
1525#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1526pub const fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1527    unsafe {
1528        let max = simd_splat(i16::MAX as i32);
1529        let min = simd_splat(i16::MIN as i32);
1530
1531        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min);
1532        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min);
1533
1534        let clamped_a: i16x4 = simd_cast(clamped_a);
1535        let clamped_b: i16x4 = simd_cast(clamped_b);
1536
1537        let a: i64 = transmute(clamped_a);
1538        let b: i64 = transmute(clamped_b);
1539        i64x2::new(a, b).as_m128i()
1540    }
1541}
1542
1543/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
1544/// using unsigned saturation.
1545///
1546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1547#[inline]
1548#[target_feature(enable = "sse2")]
1549#[cfg_attr(test, assert_instr(packuswb))]
1550#[stable(feature = "simd_x86", since = "1.27.0")]
1551#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1552pub const fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1553    unsafe {
1554        let max = simd_splat(u8::MAX as i16);
1555        let min = simd_splat(u8::MIN as i16);
1556
1557        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
1558            .as_m128i()
1559            .as_i8x16();
1560        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
1561            .as_m128i()
1562            .as_i8x16();
1563
1564        // Shuffle the low bytes of each i16 from two concatenated vectors into
1565        // the low bits of the result register.
1566        // Without `simd_shuffle`, this intrinsic will cause the AVX-512BW
1567        // `_mm_mask_packus_epi16` and `_mm_maskz_packus_epi16` tests to fail.
1568        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
1569        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
1570
1571        result.as_m128i()
1572    }
1573}
1574
/// Returns the `imm8` element of `a`.
///
/// The selected 16-bit lane is zero-extended (read as `u16`) into the 32-bit
/// return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
    // IMM8 must fit in 3 bits: a lane index in 0..=7.
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
}

/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
///
/// Only the low 16 bits of `i` are used; all other lanes of `a` are copied
/// through unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    // IMM8 must fit in 3 bits: a lane index in 0..=7.
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// Bit `i` of the result is the sign bit of byte `i` of `a`; the upper 16 bits
/// of the return value are zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_movemask_epi8(a: __m128i) -> i32 {
    unsafe {
        // A byte is negative exactly when its most significant bit is set.
        let z = i8x16::ZERO;
        let m: i8x16 = simd_lt(a.as_i8x16(), z);
        // `u16` bitmask widened via `u32` so the high 16 bits stay zero.
        simd_bitmask::<_, u16>(m) as u32 as i32
    }
}
1618
/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
///
/// `IMM8` is read as four 2-bit fields; field `i` (bits `2*i..2*i+2`) selects
/// which source lane of `a` is written to lane `i` of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x4();
        let x: i32x4 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
/// `IMM8`.
///
/// Put the results in the high 64 bits of the returned vector, with the low 64
/// bits being copied from `a`. `IMM8` is read as four 2-bit fields, each
/// selecting one of the four high lanes (indices 4..=7) of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                // Lanes 0..=3 are passed through unchanged.
                0,
                1,
                2,
                3,
                // `+ 4` rebases each 2-bit selector onto the high half.
                (IMM8 as u32 & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
/// `IMM8`.
///
/// Put the results in the low 64 bits of the returned vector, with the high 64
/// bits being copied from `a`. `IMM8` is read as four 2-bit fields, each
/// selecting one of the four low lanes (indices 0..=3) of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                // Lanes 4..=7 are passed through unchanged.
                4,
                5,
                6,
                7,
            ],
        );
        transmute(x)
    }
}
1715
/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
///
/// In the shuffle index arrays below, indices `0..16` select from `a` and
/// indices `16..32` select from `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices >= 8 select from `b`.
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// NOTE(review): per `assert_instr`, codegen is expected to pick the
// floating-point form (`unpckhps`) for this integer unpack.
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
}

/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
}

/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Indices >= 8 select from `b`.
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
}

/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
}
1829
/// Returns a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar op on lane 0 only; lane 1 of `a` passes through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_add(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar op on lane 0 only; lane 1 of `a` passes through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_div(a, b) }
}
1881
/// Returns a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// NOTE: NaN and signed-zero handling follows the hardware `maxsd`
/// instruction; see Intel's documentation for the exact semantics.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxsd(a, b) }
}

/// Returns a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// NOTE: NaN and signed-zero handling follows the hardware `maxpd`
/// instruction; see Intel's documentation for the exact semantics.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// NOTE: NaN and signed-zero handling follows the hardware `minsd`
/// instruction; see Intel's documentation for the exact semantics.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minsd(a, b) }
}

/// Returns a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// NOTE: NaN and signed-zero handling follows the hardware `minpd`
/// instruction; see Intel's documentation for the exact semantics.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minpd(a, b) }
}
1929
/// Returns a new vector with the low element of `a` replaced by multiplying the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar op on lane 0 only; lane 1 of `a` passes through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_mul(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    // Note: the square root is taken from `b`, not `a`; lane 1 comes from `a`.
    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
}

/// Returns a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    unsafe { simd_fsqrt(a) }
}
1978
/// Returns a new vector with the low element of `a` replaced by the result of
/// subtracting the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    // Scalar op on lane 0 only; lane 1 of `a` passes through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_sub(a, b) }
}
2004
/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// Implemented by reinterpreting the operands as integer vectors and
/// delegating to the `si128` bitwise intrinsic.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_and_si128(a, b))
    }
}

/// Computes the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_andnot_si128(a, b))
    }
}

/// Computes the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_or_si128(a, b))
    }
}

/// Computes the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_xor_si128(a, b))
    }
}
2069
/// Returns a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 0 = EQ.
    unsafe { cmpsd(a, b, 0) }
}

/// Returns a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 1 = LT.
    unsafe { cmpsd(a, b, 1) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 2 = LE.
    unsafe { cmpsd(a, b, 2) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
// GT is implemented as LT with swapped operands, hence `cmpltsd`.
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    // The swap puts `b`'s upper lane in the result, so lane 1 is restored
    // from `a` to satisfy this intrinsic's contract.
    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
// GE is implemented as LE with swapped operands, hence `cmplesd`.
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    // The swap puts `b`'s upper lane in the result, so lane 1 is restored
    // from `a` to satisfy this intrinsic's contract.
    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}
2129
/// Returns a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 7 = ORD (ordered).
    unsafe { cmpsd(a, b, 7) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 3 = UNORD (unordered).
    unsafe { cmpsd(a, b, 3) }
}

/// Returns a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 4 = NEQ.
    unsafe { cmpsd(a, b, 4) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 5 = NLT.
    unsafe { cmpsd(a, b, 5) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    // `cmpsd` predicate 6 = NLE.
    unsafe { cmpsd(a, b, 6) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
// NGT is implemented as NLT with swapped operands, hence `cmpnltsd`.
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    // Lane 1 is restored from `a` after the operand swap.
    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
// NGE is implemented as NLE with swapped operands, hence `cmpnlesd`.
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    // Lane 1 is restored from `a` after the operand swap.
    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}
2216
/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    // `cmppd` predicate 0 = EQ.
    unsafe { cmppd(a, b, 0) }
}

/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    // `cmppd` predicate 1 = LT.
    unsafe { cmppd(a, b, 1) }
}

/// Compares corresponding elements in `a` and `b` for less-than-or-equal
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    // `cmppd` predicate 2 = LE.
    unsafe { cmppd(a, b, 2) }
}

/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
// GT is implemented as LT with swapped operands, hence `cmpltpd`.
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmplt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
// GE is implemented as LE with swapped operands, hence `cmplepd`.
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmple_pd(b, a)
}
2271
2272/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2273///
2274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2275#[inline]
2276#[target_feature(enable = "sse2")]
2277#[cfg_attr(test, assert_instr(cmpordpd))]
2278#[stable(feature = "simd_x86", since = "1.27.0")]
2279pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2280    unsafe { cmppd(a, b, 7) }
2281}
2282
2283/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2284///
2285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2286#[inline]
2287#[target_feature(enable = "sse2")]
2288#[cfg_attr(test, assert_instr(cmpunordpd))]
2289#[stable(feature = "simd_x86", since = "1.27.0")]
2290pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2291    unsafe { cmppd(a, b, 3) }
2292}
2293
2294/// Compares corresponding elements in `a` and `b` for not-equal.
2295///
2296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2297#[inline]
2298#[target_feature(enable = "sse2")]
2299#[cfg_attr(test, assert_instr(cmpneqpd))]
2300#[stable(feature = "simd_x86", since = "1.27.0")]
2301pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2302    unsafe { cmppd(a, b, 4) }
2303}
2304
2305/// Compares corresponding elements in `a` and `b` for not-less-than.
2306///
2307/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2308#[inline]
2309#[target_feature(enable = "sse2")]
2310#[cfg_attr(test, assert_instr(cmpnltpd))]
2311#[stable(feature = "simd_x86", since = "1.27.0")]
2312pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2313    unsafe { cmppd(a, b, 5) }
2314}
2315
2316/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2317///
2318/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2319#[inline]
2320#[target_feature(enable = "sse2")]
2321#[cfg_attr(test, assert_instr(cmpnlepd))]
2322#[stable(feature = "simd_x86", since = "1.27.0")]
2323pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2324    unsafe { cmppd(a, b, 6) }
2325}
2326
2327/// Compares corresponding elements in `a` and `b` for not-greater-than.
2328///
2329/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2330#[inline]
2331#[target_feature(enable = "sse2")]
2332#[cfg_attr(test, assert_instr(cmpnltpd))]
2333#[stable(feature = "simd_x86", since = "1.27.0")]
2334pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2335    _mm_cmpnlt_pd(b, a)
2336}
2337
2338/// Compares corresponding elements in `a` and `b` for
2339/// not-greater-than-or-equal.
2340///
2341/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2342#[inline]
2343#[target_feature(enable = "sse2")]
2344#[cfg_attr(test, assert_instr(cmpnlepd))]
2345#[stable(feature = "simd_x86", since = "1.27.0")]
2346pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2347    _mm_cmpnle_pd(b, a)
2348}
2349
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    // Thin wrapper around the `comisd`-backed intrinsic declared elsewhere
    // in this module; per Intel's guide the result is 1 if the predicate
    // holds for the low lanes, otherwise 0.
    unsafe { comieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    // `comisd`-backed; see `_mm_comieq_sd` above.
    unsafe { comiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    // `comisd`-backed; see `_mm_comieq_sd` above.
    unsafe { comilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    // `comisd`-backed; see `_mm_comieq_sd` above.
    unsafe { comigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    // `comisd`-backed; see `_mm_comieq_sd` above.
    unsafe { comigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    // `comisd`-backed; see `_mm_comieq_sd` above.
    unsafe { comineqsd(a, b) }
}
2415
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    // Wraps the `ucomisd`-backed intrinsic. Per the Intel SDM, UCOMISD is
    // the quiet variant of COMISD: it raises #IA only on SNaN, not QNaN.
    unsafe { ucomieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    // `ucomisd`-backed; see `_mm_ucomieq_sd` above.
    unsafe { ucomiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    // `ucomisd`-backed; see `_mm_ucomieq_sd` above.
    unsafe { ucomilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    // `ucomisd`-backed; see `_mm_ucomieq_sd` above.
    unsafe { ucomigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    // `ucomisd`-backed; see `_mm_ucomieq_sd` above.
    unsafe { ucomigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    // `ucomisd`-backed; see `_mm_ucomieq_sd` above.
    unsafe { ucomineqsd(a, b) }
}
2481
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    unsafe {
        // Narrow both f64 lanes to f32 ...
        let r = simd_cast::<_, f32x2>(a.as_f64x2());
        let zero = f32x2::ZERO;
        // ... then widen back to 128 bits with the upper two lanes zeroed,
        // matching the `cvtpd2ps` result layout.
        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
    }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtps_pd(a: __m128) -> __m128d {
    unsafe {
        let a = a.as_f32x4();
        // Select the low two f32 lanes and widen each to f64 (widening is
        // exact, so no rounding-mode concerns).
        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    // `cvtpd2dq` rounds according to the current MXCSR rounding mode; the
    // truncating variant is `_mm_cvttpd_epi32` below.
    unsafe { transmute(cvtpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in a to
/// a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    // Rounding (not truncating) scalar conversion; cf. `_mm_cvttsd_si32`.
    unsafe { cvtsd2si(a) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store the result in
/// the lower element of the return value, and copies the upper element from `a`
/// to the upper element the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    unsafe { cvtsd2ss(a, b) }
}

/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    // Pure lane extraction — no instruction asserted since it typically
    // compiles to a no-op register move.
    unsafe { simd_extract!(a, 0) }
}

/// Converts the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of the return value, and copies the upper element from `a`
/// to the upper element the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    unsafe {
        // Widen lane 0 of `b` (exact conversion) and splice it into lane 0
        // of `a`; lane 1 of `a` passes through untouched.
        let elt: f32 = simd_extract!(b, 0);
        simd_insert!(a, 0, elt as f64)
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    // Truncating (round-toward-zero) variant of `_mm_cvtpd_epi32`.
    unsafe { transmute(cvttpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    // Truncating variant of `_mm_cvtsd_si32`.
    unsafe { cvttsd2si(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvttps2dq(a)) }
}
2618
/// Copies double-precision (64-bit) floating-point element `a` to the lower
/// element of the packed 64-bit return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_sd(a: f64) -> __m128d {
    // `_mm_set_pd` takes arguments high-to-low, so this puts `a` in the low
    // lane and zeroes the high lane.
    _mm_set_pd(0.0, a)
}

/// Broadcasts double-precision (64-bit) floating-point value a to all elements
/// of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcasts double-precision (64-bit) floating-point value a to all elements
/// of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd1(a: f64) -> __m128d {
    // Intel defines `_mm_set_pd1` as an alias of `_mm_set1_pd`.
    _mm_set_pd(a, a)
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    // Intel's convention lists arguments high-to-low, while the in-memory
    // lane order is low-to-high — hence the reversed array literal.
    __m128d([b, a])
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    // "r" = reversed argument order: `a` lands in the low lane.
    _mm_set_pd(b, a)
}

/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_pd() -> __m128d {
    // All-zero bits are a valid `__m128d`, so a const-evaluated `zeroed()`
    // is sound; codegen materializes it as a self-xor (`xorp*`).
    const { unsafe { mem::zeroed() } }
}
2691
/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_movemask_pd(a: __m128d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        // Viewing the lanes as i64, `lane < 0` is exactly "sign bit set",
        // so the signed compare against zero yields the all-1/all-0 lanes
        // that simd_bitmask needs — matching `movmskpd` semantics.
        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
        simd_bitmask::<i64x2, u8>(mask) as i32
    }
}
2711
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    // Aligned 16-byte read: the caller guarantees validity and alignment
    // (see safety text above), so a plain dereference is sufficient.
    *(mem_addr as *const __m128d)
}

/// Loads a 64-bit double-precision value to the low element of a
/// 128-bit vector of `[2 x double]` and clears the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
    // Low lane = *mem_addr, high lane = 0.0.
    _mm_setr_pd(*mem_addr, 0.)
}

/// Loads a double-precision value into the high-order bits of a 128-bit
/// vector of `[2 x double]`. The low-order bits are copied from the low-order
/// bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    // Low lane kept from `a`; high lane loaded from memory.
    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
}

/// Loads a double-precision value into the low-order bits of a 128-bit
/// vector of `[2 x double]`. The high-order bits are copied from the
/// high-order bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    // Low lane loaded from memory; high lane kept from `a`.
    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
}
2771
/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE(review): `vps!` is a module-local macro that formats the memory
    // operand around `p` — presumably yielding e.g. "movntpd [{p}], {a}";
    // confirm against its definition elsewhere in this crate.
    crate::arch::asm!(
        vps!("movntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2801
2802/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2803/// memory location.
2804///
2805/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2806#[inline]
2807#[target_feature(enable = "sse2")]
2808#[cfg_attr(test, assert_instr(movlps))]
2809#[stable(feature = "simd_x86", since = "1.27.0")]
2810#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2811pub const unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2812    *mem_addr = simd_extract!(a, 0)
2813}
2814
2815/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2816/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2817/// on a 16-byte boundary or a general-protection exception may be generated.
2818///
2819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2820#[inline]
2821#[target_feature(enable = "sse2")]
2822#[cfg_attr(
2823    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2824    assert_instr(movaps)
2825)]
2826#[stable(feature = "simd_x86", since = "1.27.0")]
2827#[allow(clippy::cast_ptr_alignment)]
2828#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2829pub const unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2830    *(mem_addr as *mut __m128d) = a;
2831}
2832
2833/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2834/// floating-point elements) from `a` into memory.
2835/// `mem_addr` does not need to be aligned on any particular boundary.
2836///
2837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2838#[inline]
2839#[target_feature(enable = "sse2")]
2840#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2841#[stable(feature = "simd_x86", since = "1.27.0")]
2842#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2843pub const unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2844    mem_addr.cast::<__m128d>().write_unaligned(a);
2845}
2846
2847/// Store 16-bit integer from the first element of a into memory.
2848///
2849/// `mem_addr` does not need to be aligned on any particular boundary.
2850///
2851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2852#[inline]
2853#[target_feature(enable = "sse2")]
2854#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2855#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2856pub const unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2857    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2858}
2859
2860/// Store 32-bit integer from the first element of a into memory.
2861///
2862/// `mem_addr` does not need to be aligned on any particular boundary.
2863///
2864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2865#[inline]
2866#[target_feature(enable = "sse2")]
2867#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2868#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2869pub const unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2870    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2871}
2872
2873/// Store 64-bit integer from the first element of a into memory.
2874///
2875/// `mem_addr` does not need to be aligned on any particular boundary.
2876///
2877/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2878#[inline]
2879#[target_feature(enable = "sse2")]
2880#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2881#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2882pub const unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2883    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2884}
2885
2886/// Stores the lower double-precision (64-bit) floating-point element from `a`
2887/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2888/// 16-byte boundary or a general-protection exception may be generated.
2889///
2890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2891#[inline]
2892#[target_feature(enable = "sse2")]
2893#[stable(feature = "simd_x86", since = "1.27.0")]
2894#[allow(clippy::cast_ptr_alignment)]
2895#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2896pub const unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2897    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2898    *(mem_addr as *mut __m128d) = b;
2899}
2900
2901/// Stores the lower double-precision (64-bit) floating-point element from `a`
2902/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2903/// 16-byte boundary or a general-protection exception may be generated.
2904///
2905/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2906#[inline]
2907#[target_feature(enable = "sse2")]
2908#[stable(feature = "simd_x86", since = "1.27.0")]
2909#[allow(clippy::cast_ptr_alignment)]
2910#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2911pub const unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2912    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2913    *(mem_addr as *mut __m128d) = b;
2914}
2915
2916/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2917/// memory in reverse order.
2918/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2919/// exception may be generated.
2920///
2921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2922#[inline]
2923#[target_feature(enable = "sse2")]
2924#[stable(feature = "simd_x86", since = "1.27.0")]
2925#[allow(clippy::cast_ptr_alignment)]
2926#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2927pub const unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2928    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2929    *(mem_addr as *mut __m128d) = b;
2930}
2931
/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
    // Lane 1 is the upper 64 bits; plain `*mem_addr` carries no extra
    // alignment requirement beyond `f64`'s.
    *mem_addr = simd_extract!(a, 1);
}
2944
/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
    // Lane 0 is the lower 64 bits.
    *mem_addr = simd_extract!(a, 0);
}
2957
/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
    // Scalar load followed by a broadcast into both lanes.
    let d = *mem_addr;
    _mm_setr_pd(d, d)
}
2971
/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
    // Intel defines `_mm_load_pd1` as an alias of `_mm_load1_pd`.
    _mm_load1_pd(mem_addr)
}
2984
/// Loads 2 double-precision (64-bit) floating-point elements from memory into
/// the returned vector in reverse order. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
    // Aligned 128-bit load, then swap the two lanes.
    let a = _mm_load_pd(mem_addr);
    simd_shuffle!(a, a, [1, 0])
}
3002
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
    // Unaligned load expressed as a raw byte copy into a local vector; the
    // byte-wise copy places no alignment requirement on `mem_addr`.
    let mut dst = _mm_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128d>(),
    );
    dst
}
3022
/// Loads unaligned 16-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
    // One unaligned 16-bit read into lane 0; the remaining seven lanes are
    // zeroed.
    transmute(i16x8::new(
        ptr::read_unaligned(mem_addr as *const i16),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ))
}
3044
/// Loads unaligned 32-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
    // One unaligned 32-bit read into lane 0; upper three lanes are zeroed.
    transmute(i32x4::new(
        ptr::read_unaligned(mem_addr as *const i32),
        0,
        0,
        0,
    ))
}
3062
/// Loads unaligned 64-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
    // One unaligned 64-bit read into lane 0; the upper lane is zeroed.
    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
}
3075
/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
/// parameter as a specifier.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(MASK, 8);
    // MASK bit 0 selects the lane taken from `a`; bit 1 selects the lane
    // taken from `b` (the `+ 2` indexes into `b` within the concatenated
    // [a, b] shuffle input). Bits 2..7 are ignored, matching the ISA.
    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
}
3091
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
    // Low lane from `b`, high lane from `a` (`_mm_setr_pd` takes lanes in
    // low-to-high order).
    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
}
3105
/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// floating-point vector of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_ps(a: __m128d) -> __m128 {
    // SAFETY: `__m128d` and `__m128` are both 128-bit SIMD vectors; this is a
    // pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3117
/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_si128(a: __m128d) -> __m128i {
    // SAFETY: `__m128d` and `__m128i` are both 128-bit SIMD vectors; this is
    // a pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3129
/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// floating-point vector of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_pd(a: __m128) -> __m128d {
    // SAFETY: `__m128` and `__m128d` are both 128-bit SIMD vectors; this is a
    // pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3141
/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_si128(a: __m128) -> __m128i {
    // SAFETY: `__m128` and `__m128i` are both 128-bit SIMD vectors; this is a
    // pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3153
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[2 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_pd(a: __m128i) -> __m128d {
    // SAFETY: `__m128i` and `__m128d` are both 128-bit SIMD vectors; this is
    // a pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3165
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[4 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_ps(a: __m128i) -> __m128 {
    // SAFETY: `__m128i` and `__m128` are both 128-bit SIMD vectors; this is a
    // pure bit-reinterpretation (no instruction is generated).
    unsafe { transmute(a) }
}
3177
/// Returns vector of type __m128d with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_pd() -> __m128d {
    // Evaluated in a `const` block so the zeroed vector is a compile-time
    // constant rather than a runtime initialization.
    const { unsafe { mem::zeroed() } }
}
3191
/// Returns vector of type __m128i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_si128() -> __m128i {
    // Evaluated in a `const` block so the zeroed vector is a compile-time
    // constant rather than a runtime initialization.
    const { unsafe { mem::zeroed() } }
}
3205
/// The resulting `__m128d` element is composed by the high-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
    // Indices 1 and 3 select the upper lane of `a` and the upper lane of `b`.
    unsafe { simd_shuffle!(a, b, [1, 3]) }
}
3221
/// The resulting `__m128d` element is composed by the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
    // Indices 0 and 2 select the lower lane of `a` and the lower lane of `b`.
    unsafe { simd_shuffle!(a, b, [0, 2]) }
}
3237
// Declarations of the LLVM intrinsics that back the SSE2 functions in this
// module. Each `link_name` is the name of an LLVM x86 intrinsic; the
// declarations must match LLVM's signatures exactly, so nothing here may be
// reordered or retyped.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse2.pause"]
    fn pause();
    #[link_name = "llvm.x86.sse2.clflush"]
    fn clflush(p: *const u8);
    #[link_name = "llvm.x86.sse2.lfence"]
    fn lfence();
    #[link_name = "llvm.x86.sse2.mfence"]
    fn mfence();
    #[link_name = "llvm.x86.sse2.pmadd.wd"]
    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
    #[link_name = "llvm.x86.sse2.psad.bw"]
    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
    #[link_name = "llvm.x86.sse2.psll.w"]
    fn psllw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psll.d"]
    fn pslld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psll.q"]
    fn psllq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.psra.w"]
    fn psraw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psra.d"]
    fn psrad(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.w"]
    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrl.d"]
    fn psrld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.q"]
    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.cvtps2dq"]
    fn cvtps2dq(a: __m128) -> i32x4;
    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
    #[link_name = "llvm.x86.sse2.max.sd"]
    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.max.pd"]
    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.sd"]
    fn minsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.pd"]
    fn minpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.comieq.sd"]
    fn comieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comilt.sd"]
    fn comiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comile.sd"]
    fn comilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comigt.sd"]
    fn comigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comige.sd"]
    fn comigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comineq.sd"]
    fn comineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomile.sd"]
    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomige.sd"]
    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
    fn cvtpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvtsd2si"]
    fn cvtsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
    fn cvttpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvttsd2si"]
    fn cvttsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvttps2dq"]
    fn cvttps2dq(a: __m128) -> i32x4;
}
3321
3322#[cfg(test)]
3323mod tests {
3324    use crate::core_arch::assert_eq_const as assert_eq;
3325    use crate::{
3326        core_arch::{simd::*, x86::*},
3327        hint::black_box,
3328    };
3329    use std::{boxed, f32, f64, mem, ptr};
3330    use stdarch_test::simd_test;
3331
3332    const NAN: f64 = f64::NAN;
3333
3334    #[test]
3335    fn test_mm_pause() {
3336        _mm_pause()
3337    }
3338
3339    #[simd_test(enable = "sse2")]
3340    fn test_mm_clflush() {
3341        let x = 0_u8;
3342        unsafe {
3343            _mm_clflush(ptr::addr_of!(x));
3344        }
3345    }
3346
3347    #[simd_test(enable = "sse2")]
3348    // Miri cannot support this until it is clear how it fits in the Rust memory model
3349    #[cfg_attr(miri, ignore)]
3350    fn test_mm_lfence() {
3351        _mm_lfence();
3352    }
3353
3354    #[simd_test(enable = "sse2")]
3355    // Miri cannot support this until it is clear how it fits in the Rust memory model
3356    #[cfg_attr(miri, ignore)]
3357    fn test_mm_mfence() {
3358        _mm_mfence();
3359    }
3360
3361    #[simd_test(enable = "sse2")]
3362    const fn test_mm_add_epi8() {
3363        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3364        #[rustfmt::skip]
3365        let b = _mm_setr_epi8(
3366            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3367        );
3368        let r = _mm_add_epi8(a, b);
3369        #[rustfmt::skip]
3370        let e = _mm_setr_epi8(
3371            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3372        );
3373        assert_eq_m128i(r, e);
3374    }
3375
3376    #[simd_test(enable = "sse2")]
3377    fn test_mm_add_epi8_overflow() {
3378        let a = _mm_set1_epi8(0x7F);
3379        let b = _mm_set1_epi8(1);
3380        let r = _mm_add_epi8(a, b);
3381        assert_eq_m128i(r, _mm_set1_epi8(-128));
3382    }
3383
3384    #[simd_test(enable = "sse2")]
3385    const fn test_mm_add_epi16() {
3386        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3387        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3388        let r = _mm_add_epi16(a, b);
3389        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3390        assert_eq_m128i(r, e);
3391    }
3392
3393    #[simd_test(enable = "sse2")]
3394    const fn test_mm_add_epi32() {
3395        let a = _mm_setr_epi32(0, 1, 2, 3);
3396        let b = _mm_setr_epi32(4, 5, 6, 7);
3397        let r = _mm_add_epi32(a, b);
3398        let e = _mm_setr_epi32(4, 6, 8, 10);
3399        assert_eq_m128i(r, e);
3400    }
3401
3402    #[simd_test(enable = "sse2")]
3403    const fn test_mm_add_epi64() {
3404        let a = _mm_setr_epi64x(0, 1);
3405        let b = _mm_setr_epi64x(2, 3);
3406        let r = _mm_add_epi64(a, b);
3407        let e = _mm_setr_epi64x(2, 4);
3408        assert_eq_m128i(r, e);
3409    }
3410
3411    #[simd_test(enable = "sse2")]
3412    const fn test_mm_adds_epi8() {
3413        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3414        #[rustfmt::skip]
3415        let b = _mm_setr_epi8(
3416            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3417        );
3418        let r = _mm_adds_epi8(a, b);
3419        #[rustfmt::skip]
3420        let e = _mm_setr_epi8(
3421            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3422        );
3423        assert_eq_m128i(r, e);
3424    }
3425
3426    #[simd_test(enable = "sse2")]
3427    fn test_mm_adds_epi8_saturate_positive() {
3428        let a = _mm_set1_epi8(0x7F);
3429        let b = _mm_set1_epi8(1);
3430        let r = _mm_adds_epi8(a, b);
3431        assert_eq_m128i(r, a);
3432    }
3433
3434    #[simd_test(enable = "sse2")]
3435    fn test_mm_adds_epi8_saturate_negative() {
3436        let a = _mm_set1_epi8(-0x80);
3437        let b = _mm_set1_epi8(-1);
3438        let r = _mm_adds_epi8(a, b);
3439        assert_eq_m128i(r, a);
3440    }
3441
3442    #[simd_test(enable = "sse2")]
3443    const fn test_mm_adds_epi16() {
3444        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3445        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3446        let r = _mm_adds_epi16(a, b);
3447        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3448        assert_eq_m128i(r, e);
3449    }
3450
3451    #[simd_test(enable = "sse2")]
3452    fn test_mm_adds_epi16_saturate_positive() {
3453        let a = _mm_set1_epi16(0x7FFF);
3454        let b = _mm_set1_epi16(1);
3455        let r = _mm_adds_epi16(a, b);
3456        assert_eq_m128i(r, a);
3457    }
3458
3459    #[simd_test(enable = "sse2")]
3460    fn test_mm_adds_epi16_saturate_negative() {
3461        let a = _mm_set1_epi16(-0x8000);
3462        let b = _mm_set1_epi16(-1);
3463        let r = _mm_adds_epi16(a, b);
3464        assert_eq_m128i(r, a);
3465    }
3466
3467    #[simd_test(enable = "sse2")]
3468    const fn test_mm_adds_epu8() {
3469        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3470        #[rustfmt::skip]
3471        let b = _mm_setr_epi8(
3472            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3473        );
3474        let r = _mm_adds_epu8(a, b);
3475        #[rustfmt::skip]
3476        let e = _mm_setr_epi8(
3477            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3478        );
3479        assert_eq_m128i(r, e);
3480    }
3481
3482    #[simd_test(enable = "sse2")]
3483    fn test_mm_adds_epu8_saturate() {
3484        let a = _mm_set1_epi8(!0);
3485        let b = _mm_set1_epi8(1);
3486        let r = _mm_adds_epu8(a, b);
3487        assert_eq_m128i(r, a);
3488    }
3489
3490    #[simd_test(enable = "sse2")]
3491    const fn test_mm_adds_epu16() {
3492        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3493        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3494        let r = _mm_adds_epu16(a, b);
3495        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3496        assert_eq_m128i(r, e);
3497    }
3498
3499    #[simd_test(enable = "sse2")]
3500    fn test_mm_adds_epu16_saturate() {
3501        let a = _mm_set1_epi16(!0);
3502        let b = _mm_set1_epi16(1);
3503        let r = _mm_adds_epu16(a, b);
3504        assert_eq_m128i(r, a);
3505    }
3506
3507    #[simd_test(enable = "sse2")]
3508    const fn test_mm_avg_epu8() {
3509        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3510        let r = _mm_avg_epu8(a, b);
3511        assert_eq_m128i(r, _mm_set1_epi8(6));
3512    }
3513
3514    #[simd_test(enable = "sse2")]
3515    const fn test_mm_avg_epu16() {
3516        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3517        let r = _mm_avg_epu16(a, b);
3518        assert_eq_m128i(r, _mm_set1_epi16(6));
3519    }
3520
3521    #[simd_test(enable = "sse2")]
3522    fn test_mm_madd_epi16() {
3523        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3524        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3525        let r = _mm_madd_epi16(a, b);
3526        let e = _mm_setr_epi32(29, 81, 149, 233);
3527        assert_eq_m128i(r, e);
3528
3529        // Test large values.
3530        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3531        let a = _mm_setr_epi16(
3532            i16::MAX,
3533            i16::MAX,
3534            i16::MIN,
3535            i16::MIN,
3536            i16::MIN,
3537            i16::MAX,
3538            0,
3539            0,
3540        );
3541        let b = _mm_setr_epi16(
3542            i16::MAX,
3543            i16::MAX,
3544            i16::MIN,
3545            i16::MIN,
3546            i16::MAX,
3547            i16::MIN,
3548            0,
3549            0,
3550        );
3551        let r = _mm_madd_epi16(a, b);
3552        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3553        assert_eq_m128i(r, e);
3554    }
3555
3556    #[simd_test(enable = "sse2")]
3557    const fn test_mm_max_epi16() {
3558        let a = _mm_set1_epi16(1);
3559        let b = _mm_set1_epi16(-1);
3560        let r = _mm_max_epi16(a, b);
3561        assert_eq_m128i(r, a);
3562    }
3563
3564    #[simd_test(enable = "sse2")]
3565    const fn test_mm_max_epu8() {
3566        let a = _mm_set1_epi8(1);
3567        let b = _mm_set1_epi8(!0);
3568        let r = _mm_max_epu8(a, b);
3569        assert_eq_m128i(r, b);
3570    }
3571
3572    #[simd_test(enable = "sse2")]
3573    const fn test_mm_min_epi16() {
3574        let a = _mm_set1_epi16(1);
3575        let b = _mm_set1_epi16(-1);
3576        let r = _mm_min_epi16(a, b);
3577        assert_eq_m128i(r, b);
3578    }
3579
3580    #[simd_test(enable = "sse2")]
3581    const fn test_mm_min_epu8() {
3582        let a = _mm_set1_epi8(1);
3583        let b = _mm_set1_epi8(!0);
3584        let r = _mm_min_epu8(a, b);
3585        assert_eq_m128i(r, a);
3586    }
3587
3588    #[simd_test(enable = "sse2")]
3589    const fn test_mm_mulhi_epi16() {
3590        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3591        let r = _mm_mulhi_epi16(a, b);
3592        assert_eq_m128i(r, _mm_set1_epi16(-16));
3593    }
3594
3595    #[simd_test(enable = "sse2")]
3596    const fn test_mm_mulhi_epu16() {
3597        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3598        let r = _mm_mulhi_epu16(a, b);
3599        assert_eq_m128i(r, _mm_set1_epi16(15));
3600    }
3601
3602    #[simd_test(enable = "sse2")]
3603    const fn test_mm_mullo_epi16() {
3604        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3605        let r = _mm_mullo_epi16(a, b);
3606        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3607    }
3608
3609    #[simd_test(enable = "sse2")]
3610    const fn test_mm_mul_epu32() {
3611        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3612        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3613        let r = _mm_mul_epu32(a, b);
3614        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3615        assert_eq_m128i(r, e);
3616    }
3617
3618    #[simd_test(enable = "sse2")]
3619    fn test_mm_sad_epu8() {
3620        #[rustfmt::skip]
3621        let a = _mm_setr_epi8(
3622            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3623            1, 2, 3, 4,
3624            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3625            1, 2, 3, 4,
3626        );
3627        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3628        let r = _mm_sad_epu8(a, b);
3629        let e = _mm_setr_epi64x(1020, 614);
3630        assert_eq_m128i(r, e);
3631    }
3632
3633    #[simd_test(enable = "sse2")]
3634    const fn test_mm_sub_epi8() {
3635        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3636        let r = _mm_sub_epi8(a, b);
3637        assert_eq_m128i(r, _mm_set1_epi8(-1));
3638    }
3639
3640    #[simd_test(enable = "sse2")]
3641    const fn test_mm_sub_epi16() {
3642        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3643        let r = _mm_sub_epi16(a, b);
3644        assert_eq_m128i(r, _mm_set1_epi16(-1));
3645    }
3646
3647    #[simd_test(enable = "sse2")]
3648    const fn test_mm_sub_epi32() {
3649        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3650        let r = _mm_sub_epi32(a, b);
3651        assert_eq_m128i(r, _mm_set1_epi32(-1));
3652    }
3653
3654    #[simd_test(enable = "sse2")]
3655    const fn test_mm_sub_epi64() {
3656        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3657        let r = _mm_sub_epi64(a, b);
3658        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3659    }
3660
3661    #[simd_test(enable = "sse2")]
3662    const fn test_mm_subs_epi8() {
3663        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3664        let r = _mm_subs_epi8(a, b);
3665        assert_eq_m128i(r, _mm_set1_epi8(3));
3666    }
3667
3668    #[simd_test(enable = "sse2")]
3669    fn test_mm_subs_epi8_saturate_positive() {
3670        let a = _mm_set1_epi8(0x7F);
3671        let b = _mm_set1_epi8(-1);
3672        let r = _mm_subs_epi8(a, b);
3673        assert_eq_m128i(r, a);
3674    }
3675
3676    #[simd_test(enable = "sse2")]
3677    fn test_mm_subs_epi8_saturate_negative() {
3678        let a = _mm_set1_epi8(-0x80);
3679        let b = _mm_set1_epi8(1);
3680        let r = _mm_subs_epi8(a, b);
3681        assert_eq_m128i(r, a);
3682    }
3683
3684    #[simd_test(enable = "sse2")]
3685    const fn test_mm_subs_epi16() {
3686        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3687        let r = _mm_subs_epi16(a, b);
3688        assert_eq_m128i(r, _mm_set1_epi16(3));
3689    }
3690
3691    #[simd_test(enable = "sse2")]
3692    fn test_mm_subs_epi16_saturate_positive() {
3693        let a = _mm_set1_epi16(0x7FFF);
3694        let b = _mm_set1_epi16(-1);
3695        let r = _mm_subs_epi16(a, b);
3696        assert_eq_m128i(r, a);
3697    }
3698
3699    #[simd_test(enable = "sse2")]
3700    fn test_mm_subs_epi16_saturate_negative() {
3701        let a = _mm_set1_epi16(-0x8000);
3702        let b = _mm_set1_epi16(1);
3703        let r = _mm_subs_epi16(a, b);
3704        assert_eq_m128i(r, a);
3705    }
3706
3707    #[simd_test(enable = "sse2")]
3708    const fn test_mm_subs_epu8() {
3709        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3710        let r = _mm_subs_epu8(a, b);
3711        assert_eq_m128i(r, _mm_set1_epi8(3));
3712    }
3713
3714    #[simd_test(enable = "sse2")]
3715    fn test_mm_subs_epu8_saturate() {
3716        let a = _mm_set1_epi8(0);
3717        let b = _mm_set1_epi8(1);
3718        let r = _mm_subs_epu8(a, b);
3719        assert_eq_m128i(r, a);
3720    }
3721
3722    #[simd_test(enable = "sse2")]
3723    const fn test_mm_subs_epu16() {
3724        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3725        let r = _mm_subs_epu16(a, b);
3726        assert_eq_m128i(r, _mm_set1_epi16(3));
3727    }
3728
3729    #[simd_test(enable = "sse2")]
3730    fn test_mm_subs_epu16_saturate() {
3731        let a = _mm_set1_epi16(0);
3732        let b = _mm_set1_epi16(1);
3733        let r = _mm_subs_epu16(a, b);
3734        assert_eq_m128i(r, a);
3735    }
3736
        // Left-shift tests: immediate (`slli`) and register-count (`sll`)
        // variants, including counts equal to / beyond the lane width,
        // which must produce all-zero lanes.
3737    #[simd_test(enable = "sse2")]
3738    const fn test_mm_slli_si128() {
            // Whole-register byte shift by 1: bytes move up, a 0 fills slot 0.
3739        #[rustfmt::skip]
3740        let a = _mm_setr_epi8(
3741            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3742        );
3743        let r = _mm_slli_si128::<1>(a);
3744        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3745        assert_eq_m128i(r, e);
3746
            // Shift by 15: only the original byte 0 survives, in the top slot.
3747        #[rustfmt::skip]
3748        let a = _mm_setr_epi8(
3749            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3750        );
3751        let r = _mm_slli_si128::<15>(a);
3752        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3753        assert_eq_m128i(r, e);
3754
            // Shift by the full 16 bytes: everything shifted out, result is 0.
3755        #[rustfmt::skip]
3756        let a = _mm_setr_epi8(
3757            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3758        );
3759        let r = _mm_slli_si128::<16>(a);
3760        assert_eq_m128i(r, _mm_set1_epi8(0));
3761    }
3762
3763    #[simd_test(enable = "sse2")]
3764    const fn test_mm_slli_epi16() {
            // Per-lane i16 shift by 4 (multiply by 16); shift by 16 zeroes lanes.
3765        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3766        let r = _mm_slli_epi16::<4>(a);
3767        assert_eq_m128i(
3768            r,
3769            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3770        );
3771        let r = _mm_slli_epi16::<16>(a);
3772        assert_eq_m128i(r, _mm_set1_epi16(0));
3773    }
3774
3775    #[simd_test(enable = "sse2")]
3776    fn test_mm_sll_epi16() {
            // Count comes from the LOW 64 bits of the second operand only;
            // a count in the high half (4, 0) must leave `a` unchanged.
3777        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3778        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3779        assert_eq_m128i(
3780            r,
3781            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3782        );
3783        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3784        assert_eq_m128i(r, a);
            // Counts >= 16 (including huge ones) zero every lane.
3785        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3786        assert_eq_m128i(r, _mm_set1_epi16(0));
3787        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3788        assert_eq_m128i(r, _mm_set1_epi16(0));
3789    }
3790
3791    #[simd_test(enable = "sse2")]
3792    const fn test_mm_slli_epi32() {
            // Per-lane i32 shift by 4; shift by 32 zeroes lanes.
3793        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3794        let r = _mm_slli_epi32::<4>(a);
3795        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3796        let r = _mm_slli_epi32::<32>(a);
3797        assert_eq_m128i(r, _mm_set1_epi32(0));
3798    }
3799
3800    #[simd_test(enable = "sse2")]
3801    fn test_mm_sll_epi32() {
            // Same low-64-bit count semantics as test_mm_sll_epi16, for i32 lanes.
3802        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3803        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3804        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3805        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3806        assert_eq_m128i(r, a);
3807        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3808        assert_eq_m128i(r, _mm_set1_epi32(0));
3809        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3810        assert_eq_m128i(r, _mm_set1_epi32(0));
3811    }
3812
3813    #[simd_test(enable = "sse2")]
3814    const fn test_mm_slli_epi64() {
            // Per-lane i64 shift by 4; shift by 64 zeroes lanes.
3815        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3816        let r = _mm_slli_epi64::<4>(a);
3817        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3818        let r = _mm_slli_epi64::<64>(a);
3819        assert_eq_m128i(r, _mm_set1_epi64x(0));
3820    }
3821
3822    #[simd_test(enable = "sse2")]
3823    fn test_mm_sll_epi64() {
            // Same low-64-bit count semantics, for i64 lanes.
3824        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3825        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3826        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3827        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3828        assert_eq_m128i(r, a);
3829        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3830        assert_eq_m128i(r, _mm_set1_epi64x(0));
3831        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3832        assert_eq_m128i(r, _mm_set1_epi64x(0));
3833    }
3834
        // Arithmetic right-shift tests: negative lanes are sign-extended,
        // so an over-wide count yields 0 for non-negative lanes and -1 for
        // negative ones (not all-zero as with logical shifts).
3835    #[simd_test(enable = "sse2")]
3836    const fn test_mm_srai_epi16() {
            // >>4 rounds toward negative infinity on the negative lanes.
3837        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3838        let r = _mm_srai_epi16::<4>(a);
3839        assert_eq_m128i(
3840            r,
3841            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3842        );
            // Count == lane width: result is the sign bit replicated.
3843        let r = _mm_srai_epi16::<16>(a);
3844        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3845    }
3846
3847    #[simd_test(enable = "sse2")]
3848    fn test_mm_sra_epi16() {
            // Register-count form: only the low 64 bits of the count are used.
3849        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3850        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3851        assert_eq_m128i(
3852            r,
3853            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3854        );
3855        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3856        assert_eq_m128i(r, a);
            // Counts >= 16 saturate to "shift in sign bits everywhere".
3857        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3858        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3859        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3860        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3861    }
3862
3863    #[simd_test(enable = "sse2")]
3864    const fn test_mm_srai_epi32() {
            // Immediate form for i32 lanes; count 32 gives sign-replicated lanes.
3865        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3866        let r = _mm_srai_epi32::<4>(a);
3867        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3868        let r = _mm_srai_epi32::<32>(a);
3869        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3870    }
3871
3872    #[simd_test(enable = "sse2")]
3873    fn test_mm_sra_epi32() {
            // Register-count form for i32 lanes, same count semantics as above.
3874        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3875        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3876        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3877        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3878        assert_eq_m128i(r, a);
3879        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3880        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3881        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3882        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3883    }
3884
        // Logical right-shift tests: zeros are shifted in regardless of sign,
        // so over-wide counts always produce an all-zero result.
3885    #[simd_test(enable = "sse2")]
3886    const fn test_mm_srli_si128() {
            // Whole-register byte shift by 1: bytes move down, 0 fills the top.
3887        #[rustfmt::skip]
3888        let a = _mm_setr_epi8(
3889            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3890        );
3891        let r = _mm_srli_si128::<1>(a);
3892        #[rustfmt::skip]
3893        let e = _mm_setr_epi8(
3894            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3895        );
3896        assert_eq_m128i(r, e);
3897
            // Shift by 15: only the original top byte survives, in slot 0.
3898        #[rustfmt::skip]
3899        let a = _mm_setr_epi8(
3900            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3901        );
3902        let r = _mm_srli_si128::<15>(a);
3903        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3904        assert_eq_m128i(r, e);
3905
            // Shift by the full 16 bytes: result is all zeros.
3906        #[rustfmt::skip]
3907        let a = _mm_setr_epi8(
3908            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3909        );
3910        let r = _mm_srli_si128::<16>(a);
3911        assert_eq_m128i(r, _mm_set1_epi8(0));
3912    }
3913
3914    #[simd_test(enable = "sse2")]
3915    const fn test_mm_srli_epi16() {
            // Negative lanes are treated as unsigned bit patterns (e.g.
            // -0xCC >> 4 == 0xFF3), and a 16-bit shift zeroes every lane.
3916        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3917        let r = _mm_srli_epi16::<4>(a);
3918        assert_eq_m128i(
3919            r,
3920            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3921        );
3922        let r = _mm_srli_epi16::<16>(a);
3923        assert_eq_m128i(r, _mm_set1_epi16(0));
3924    }
3925
3926    #[simd_test(enable = "sse2")]
3927    fn test_mm_srl_epi16() {
            // Register-count form: only the low 64 bits of the count are used;
            // counts >= 16 (even i64::MAX) zero every lane.
3928        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3929        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3930        assert_eq_m128i(
3931            r,
3932            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3933        );
3934        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3935        assert_eq_m128i(r, a);
3936        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3937        assert_eq_m128i(r, _mm_set1_epi16(0));
3938        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3939        assert_eq_m128i(r, _mm_set1_epi16(0));
3940    }
3941
3942    #[simd_test(enable = "sse2")]
3943    const fn test_mm_srli_epi32() {
            // Immediate form for i32 lanes; 32-bit shift zeroes every lane.
3944        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3945        let r = _mm_srli_epi32::<4>(a);
3946        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3947        let r = _mm_srli_epi32::<32>(a);
3948        assert_eq_m128i(r, _mm_set1_epi32(0));
3949    }
3950
3951    #[simd_test(enable = "sse2")]
3952    fn test_mm_srl_epi32() {
            // Register-count form for i32 lanes, same count semantics as above.
3953        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3954        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3955        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3956        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3957        assert_eq_m128i(r, a);
3958        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3959        assert_eq_m128i(r, _mm_set1_epi32(0));
3960        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3961        assert_eq_m128i(r, _mm_set1_epi32(0));
3962    }
3963
3964    #[simd_test(enable = "sse2")]
3965    const fn test_mm_srli_epi64() {
            // Immediate form for i64 lanes; 64-bit shift zeroes every lane.
3966        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3967        let r = _mm_srli_epi64::<4>(a);
3968        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3969        let r = _mm_srli_epi64::<64>(a);
3970        assert_eq_m128i(r, _mm_set1_epi64x(0));
3971    }
3972
3973    #[simd_test(enable = "sse2")]
3974    fn test_mm_srl_epi64() {
            // Register-count form for i64 lanes, same count semantics as above.
3975        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3976        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3977        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3978        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3979        assert_eq_m128i(r, a);
3980        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3981        assert_eq_m128i(r, _mm_set1_epi64x(0));
3982        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3983        assert_eq_m128i(r, _mm_set1_epi64x(0));
3984    }
3985
        // Bitwise tests on 5 (0b101) and 3 (0b011) broadcast to every byte.
3986    #[simd_test(enable = "sse2")]
3987    const fn test_mm_and_si128() {
            // 0b101 & 0b011 == 0b001.
3988        let a = _mm_set1_epi8(5);
3989        let b = _mm_set1_epi8(3);
3990        let r = _mm_and_si128(a, b);
3991        assert_eq_m128i(r, _mm_set1_epi8(1));
3992    }
3993
3994    #[simd_test(enable = "sse2")]
3995    const fn test_mm_andnot_si128() {
            // andnot computes !a & b: !0b101 & 0b011 == 0b010.
3996        let a = _mm_set1_epi8(5);
3997        let b = _mm_set1_epi8(3);
3998        let r = _mm_andnot_si128(a, b);
3999        assert_eq_m128i(r, _mm_set1_epi8(2));
4000    }
4001
4002    #[simd_test(enable = "sse2")]
4003    const fn test_mm_or_si128() {
            // 0b101 | 0b011 == 0b111.
4004        let a = _mm_set1_epi8(5);
4005        let b = _mm_set1_epi8(3);
4006        let r = _mm_or_si128(a, b);
4007        assert_eq_m128i(r, _mm_set1_epi8(7));
4008    }
4009
4010    #[simd_test(enable = "sse2")]
4011    const fn test_mm_xor_si128() {
            // 0b101 ^ 0b011 == 0b110.
4012        let a = _mm_set1_epi8(5);
4013        let b = _mm_set1_epi8(3);
4014        let r = _mm_xor_si128(a, b);
4015        assert_eq_m128i(r, _mm_set1_epi8(6));
4016    }
4017
        // Comparison tests: SIMD compares produce an all-ones lane (!0) on
        // true and an all-zeros lane on false.
4018    #[simd_test(enable = "sse2")]
4019    const fn test_mm_cmpeq_epi8() {
            // Only lane 2 matches (both are 2), so only that lane is !0.
4020        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4021        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4022        let r = _mm_cmpeq_epi8(a, b);
4023        #[rustfmt::skip]
4024        assert_eq_m128i(
4025            r,
4026            _mm_setr_epi8(
4027                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4028            )
4029        );
4030    }
4031
4032    #[simd_test(enable = "sse2")]
4033    const fn test_mm_cmpeq_epi16() {
            // Only lane 2 matches.
4034        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4035        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
4036        let r = _mm_cmpeq_epi16(a, b);
4037        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
4038    }
4039
4040    #[simd_test(enable = "sse2")]
4041    const fn test_mm_cmpeq_epi32() {
            // Only lane 2 matches.
4042        let a = _mm_setr_epi32(0, 1, 2, 3);
4043        let b = _mm_setr_epi32(3, 2, 2, 0);
4044        let r = _mm_cmpeq_epi32(a, b);
4045        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
4046    }
4047
4048    #[simd_test(enable = "sse2")]
4049    const fn test_mm_cmpgt_epi8() {
            // Signed greater-than: only the lane holding 5 > 0 is true.
4050        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4051        let b = _mm_set1_epi8(0);
4052        let r = _mm_cmpgt_epi8(a, b);
4053        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4054        assert_eq_m128i(r, e);
4055    }
4056
4057    #[simd_test(enable = "sse2")]
4058    const fn test_mm_cmpgt_epi16() {
            // Signed greater-than for i16 lanes.
4059        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
4060        let b = _mm_set1_epi16(0);
4061        let r = _mm_cmpgt_epi16(a, b);
4062        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
4063        assert_eq_m128i(r, e);
4064    }
4065
4066    #[simd_test(enable = "sse2")]
4067    const fn test_mm_cmpgt_epi32() {
            // Signed greater-than for i32 lanes.
4068        let a = _mm_set_epi32(5, 0, 0, 0);
4069        let b = _mm_set1_epi32(0);
4070        let r = _mm_cmpgt_epi32(a, b);
4071        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
4072    }
4073
4074    #[simd_test(enable = "sse2")]
4075    const fn test_mm_cmplt_epi8() {
            // Signed less-than: mirror of the cmpgt test with operands swapped.
4076        let a = _mm_set1_epi8(0);
4077        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4078        let r = _mm_cmplt_epi8(a, b);
4079        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4080        assert_eq_m128i(r, e);
4081    }
4082
4083    #[simd_test(enable = "sse2")]
4084    const fn test_mm_cmplt_epi16() {
            // Signed less-than for i16 lanes.
4085        let a = _mm_set1_epi16(0);
4086        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
4087        let r = _mm_cmplt_epi16(a, b);
4088        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
4089        assert_eq_m128i(r, e);
4090    }
4091
4092    #[simd_test(enable = "sse2")]
4093    const fn test_mm_cmplt_epi32() {
            // Signed less-than for i32 lanes.
4094        let a = _mm_set1_epi32(0);
4095        let b = _mm_set_epi32(5, 0, 0, 0);
4096        let r = _mm_cmplt_epi32(a, b);
4097        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
4098    }
4099
        // Conversion tests between integer and floating-point vectors.
4100    #[simd_test(enable = "sse2")]
4101    const fn test_mm_cvtepi32_pd() {
            // Only the LOW two i32 lanes (5, 15) are converted to f64.
4102        let a = _mm_set_epi32(35, 25, 15, 5);
4103        let r = _mm_cvtepi32_pd(a);
4104        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
4105    }
4106
4107    #[simd_test(enable = "sse2")]
4108    const fn test_mm_cvtsi32_sd() {
            // Replaces the low f64 lane with the converted i32; the high
            // lane of `a` is passed through unchanged.
4109        let a = _mm_set1_pd(3.5);
4110        let r = _mm_cvtsi32_sd(a, 5);
4111        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
4112    }
4113
4114    #[simd_test(enable = "sse2")]
4115    const fn test_mm_cvtepi32_ps() {
            // All four i32 lanes are converted to f32.
4116        let a = _mm_setr_epi32(1, 2, 3, 4);
4117        let r = _mm_cvtepi32_ps(a);
4118        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
4119    }
4120
4121    #[simd_test(enable = "sse2")]
4122    fn test_mm_cvtps_epi32() {
            // Exact-integer f32 inputs convert without rounding ambiguity.
4123        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
4124        let r = _mm_cvtps_epi32(a);
4125        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
4126    }
4127
4128    #[simd_test(enable = "sse2")]
4129    const fn test_mm_cvtsi32_si128() {
            // Scalar goes into lane 0; remaining lanes are zeroed.
4130        let r = _mm_cvtsi32_si128(5);
4131        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
4132    }
4133
4134    #[simd_test(enable = "sse2")]
4135    const fn test_mm_cvtsi128_si32() {
            // Extracts lane 0 as a scalar i32.
4136        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
4137        assert_eq!(r, 5);
4138    }
4139
        // Constructor tests: `_mm_set_*` takes arguments high-lane-first,
        // `_mm_setr_*` low-lane-first; each test cross-checks the two.
4140    #[simd_test(enable = "sse2")]
4141    const fn test_mm_set_epi64x() {
            // set(hi=0, lo=1) must equal setr(lo=1, hi=0).
4142        let r = _mm_set_epi64x(0, 1);
4143        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
4144    }
4145
4146    #[simd_test(enable = "sse2")]
4147    const fn test_mm_set_epi32() {
            // set arguments reversed relative to setr.
4148        let r = _mm_set_epi32(0, 1, 2, 3);
4149        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
4150    }
4151
4152    #[simd_test(enable = "sse2")]
4153    const fn test_mm_set_epi16() {
            // set arguments reversed relative to setr.
4154        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4155        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
4156    }
4157
4158    #[simd_test(enable = "sse2")]
4159    const fn test_mm_set_epi8() {
            // set arguments reversed relative to setr.
4160        #[rustfmt::skip]
4161        let r = _mm_set_epi8(
4162            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4163        );
4164        #[rustfmt::skip]
4165        let e = _mm_setr_epi8(
4166            15, 14, 13, 12, 11, 10, 9, 8,
4167            7, 6, 5, 4, 3, 2, 1, 0,
4168        );
4169        assert_eq_m128i(r, e);
4170    }
4171
4172    #[simd_test(enable = "sse2")]
4173    const fn test_mm_set1_epi64x() {
4174        let r = _mm_set1_epi64x(1);
4175        assert_eq_m128i(r, _mm_set1_epi64x(1));
4176    }
4177
4178    #[simd_test(enable = "sse2")]
4179    const fn test_mm_set1_epi32() {
4180        let r = _mm_set1_epi32(1);
4181        assert_eq_m128i(r, _mm_set1_epi32(1));
4182    }
4183
4184    #[simd_test(enable = "sse2")]
4185    const fn test_mm_set1_epi16() {
4186        let r = _mm_set1_epi16(1);
4187        assert_eq_m128i(r, _mm_set1_epi16(1));
4188    }
4189
4190    #[simd_test(enable = "sse2")]
4191    const fn test_mm_set1_epi8() {
4192        let r = _mm_set1_epi8(1);
4193        assert_eq_m128i(r, _mm_set1_epi8(1));
4194    }
4195
4196    #[simd_test(enable = "sse2")]
4197    const fn test_mm_setr_epi32() {
4198        let r = _mm_setr_epi32(0, 1, 2, 3);
4199        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4200    }
4201
4202    #[simd_test(enable = "sse2")]
4203    const fn test_mm_setr_epi16() {
4204        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4205        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4206    }
4207
4208    #[simd_test(enable = "sse2")]
4209    const fn test_mm_setr_epi8() {
4210        #[rustfmt::skip]
4211        let r = _mm_setr_epi8(
4212            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4213        );
4214        #[rustfmt::skip]
4215        let e = _mm_setr_epi8(
4216            0, 1, 2, 3, 4, 5, 6, 7,
4217            8, 9, 10, 11, 12, 13, 14, 15,
4218        );
4219        assert_eq_m128i(r, e);
4220    }
4221
4222    #[simd_test(enable = "sse2")]
4223    const fn test_mm_setzero_si128() {
            // All 128 bits must be zero.
4224        let r = _mm_setzero_si128();
4225        assert_eq_m128i(r, _mm_set1_epi64x(0));
4226    }
4227
4228    #[simd_test(enable = "sse2")]
4229    const fn test_mm_loadl_epi64() {
            // Loads only the low 64 bits; the high lane is zeroed.
4230        let a = _mm_setr_epi64x(6, 5);
4231        let r = unsafe { _mm_loadl_epi64(ptr::addr_of!(a)) };
4232        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4233    }
4234
4235    #[simd_test(enable = "sse2")]
4236    const fn test_mm_load_si128() {
            // Aligned 128-bit load round-trips the value. `a` is a local
            // `__m128i`, so its address satisfies the 16-byte alignment
            // requirement of the aligned load.
4237        let a = _mm_set_epi64x(5, 6);
4238        let r = unsafe { _mm_load_si128(ptr::addr_of!(a) as *const _) };
4239        assert_eq_m128i(a, r);
4240    }
4241
4242    #[simd_test(enable = "sse2")]
4243    const fn test_mm_loadu_si128() {
            // Unaligned 128-bit load round-trips the value.
4244        let a = _mm_set_epi64x(5, 6);
4245        let r = unsafe { _mm_loadu_si128(ptr::addr_of!(a) as *const _) };
4246        assert_eq_m128i(a, r);
4247    }
4248
4249    #[simd_test(enable = "sse2")]
4250    // Miri cannot support this until it is clear how it fits in the Rust memory model
4251    // (non-temporal store)
4252    #[cfg_attr(miri, ignore)]
4253    fn test_mm_maskmoveu_si128() {
            // Conditional byte store: only bytes whose mask has the high bit
            // (0x80) set are written; here that is a single byte position.
4254        let a = _mm_set1_epi8(9);
4255        #[rustfmt::skip]
4256        let mask = _mm_set_epi8(
4257            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4258            0, 0, 0, 0, 0, 0, 0, 0,
4259        );
4260        let mut r = _mm_set1_epi8(0);
4261        unsafe {
4262            _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4263        }
            // Fence orders the non-temporal store before the read below.
4264        _mm_sfence();
4265        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4266        assert_eq_m128i(r, e);
4267    }
4268
4269    #[simd_test(enable = "sse2")]
4270    const fn test_mm_store_si128() {
            // Aligned 128-bit store round-trips the value; `r` is a local
            // `__m128i` and therefore suitably aligned.
4271        let a = _mm_set1_epi8(9);
4272        let mut r = _mm_set1_epi8(0);
4273        unsafe {
4274            _mm_store_si128(&mut r, a);
4275        }
4276        assert_eq_m128i(r, a);
4277    }
4278
4279    #[simd_test(enable = "sse2")]
4280    const fn test_mm_storeu_si128() {
            // Unaligned 128-bit store round-trips the value.
4281        let a = _mm_set1_epi8(9);
4282        let mut r = _mm_set1_epi8(0);
4283        unsafe {
4284            _mm_storeu_si128(&mut r, a);
4285        }
4286        assert_eq_m128i(r, a);
4287    }
4288
4289    #[simd_test(enable = "sse2")]
4290    const fn test_mm_storel_epi64() {
            // Stores only the low 64 bits; the destination's high lane keeps
            // whatever was there (zero here).
4291        let a = _mm_setr_epi64x(2, 9);
4292        let mut r = _mm_set1_epi8(0);
4293        unsafe {
4294            _mm_storel_epi64(&mut r, a);
4295        }
4296        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4297    }
4298
4299    #[simd_test(enable = "sse2")]
4300    // Miri cannot support this until it is clear how it fits in the Rust memory model
4301    // (non-temporal store)
4302    #[cfg_attr(miri, ignore)]
4303    fn test_mm_stream_si128() {
            // Non-temporal 128-bit store; the sfence makes the write visible
            // before the value is read back.
4304        let a = _mm_setr_epi32(1, 2, 3, 4);
4305        let mut r = _mm_undefined_si128();
4306        unsafe {
4307            _mm_stream_si128(ptr::addr_of_mut!(r), a);
4308        }
4309        _mm_sfence();
4310        assert_eq_m128i(r, a);
4311    }
4312
4313    #[simd_test(enable = "sse2")]
4314    // Miri cannot support this until it is clear how it fits in the Rust memory model
4315    // (non-temporal store)
4316    #[cfg_attr(miri, ignore)]
4317    fn test_mm_stream_si32() {
            // Non-temporal scalar store to a heap location.
4318        let a: i32 = 7;
4319        let mut mem = boxed::Box::<i32>::new(-1);
4320        unsafe {
4321            _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4322        }
4323        _mm_sfence();
4324        assert_eq!(a, *mem);
4325    }
4326
4327    #[simd_test(enable = "sse2")]
4328    const fn test_mm_move_epi64() {
            // Copies the low i64 lane and zeroes the high lane.
4329        let a = _mm_setr_epi64x(5, 6);
4330        let r = _mm_move_epi64(a);
4331        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4332    }
4333
4334    #[simd_test(enable = "sse2")]
4335    const fn test_mm_packs_epi16() {
            // Signed i16 -> i8 pack with saturation: 0x80 clamps to i8::MAX
            // (0x7F) and -0x81 clamps to i8::MIN (-0x80); `a` fills the low
            // eight result bytes, `b` the high eight.
4336        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4337        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4338        let r = _mm_packs_epi16(a, b);
4339        #[rustfmt::skip]
4340        assert_eq_m128i(
4341            r,
4342            _mm_setr_epi8(
4343                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4344            )
4345        );
4346    }
4347
4348    #[simd_test(enable = "sse2")]
4349    const fn test_mm_packs_epi32() {
            // Signed i32 -> i16 pack: 0x8000 clamps to i16::MAX (0x7FFF),
            // -0x8001 clamps to i16::MIN (-0x8000).
4350        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4351        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4352        let r = _mm_packs_epi32(a, b);
4353        assert_eq_m128i(
4354            r,
4355            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4356        );
4357    }
4358
4359    #[simd_test(enable = "sse2")]
4360    const fn test_mm_packus_epi16() {
            // Signed i16 -> UNSIGNED u8 pack: 0x100 clamps to 0xFF (!0 as i8)
            // and -1 clamps to 0.
4361        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4362        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4363        let r = _mm_packus_epi16(a, b);
4364        assert_eq_m128i(
4365            r,
4366            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4367        );
4368    }
4369
4370    #[simd_test(enable = "sse2")]
4371    const fn test_mm_extract_epi16() {
            // Extracted lanes are ZERO-extended into i32, so the -1 in lane 0
            // comes back as 0xFFFF, not -1.
4372        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4373        let r1 = _mm_extract_epi16::<0>(a);
4374        let r2 = _mm_extract_epi16::<3>(a);
4375        assert_eq!(r1, 0xFFFF);
4376        assert_eq!(r2, 3);
4377    }
4378
4379    #[simd_test(enable = "sse2")]
4380    const fn test_mm_insert_epi16() {
            // Replaces lane 0 with 9; all other lanes are untouched.
4381        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4382        let r = _mm_insert_epi16::<0>(a, 9);
4383        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4384        assert_eq_m128i(r, e);
4385    }
4386
4387    #[simd_test(enable = "sse2")]
4388    const fn test_mm_movemask_epi8() {
            // Collects the high (sign) bit of each byte into a 16-bit mask;
            // byte 0's sign bit becomes mask bit 0.
4389        #[rustfmt::skip]
4390        let a = _mm_setr_epi8(
4391            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4392            0b0101, 0b1111_0000u8 as i8, 0, 0,
4393            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4394            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4395        );
4396        let r = _mm_movemask_epi8(a);
4397        assert_eq!(r, 0b10100110_00100101);
4398    }
4399
4400    #[simd_test(enable = "sse2")]
4401    const fn test_mm_shuffle_epi32() {
            // IMM8 selects source lanes per 2-bit field, low field first:
            // 0b00_01_01_11 -> lanes 3, 1, 1, 0 of `a`.
4402        let a = _mm_setr_epi32(5, 10, 15, 20);
4403        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
4404        let e = _mm_setr_epi32(20, 10, 10, 5);
4405        assert_eq_m128i(r, e);
4406    }
4407
4408    #[simd_test(enable = "sse2")]
4409    const fn test_mm_shufflehi_epi16() {
            // Shuffles only the HIGH four i16 lanes; the low four pass through.
4410        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
4411        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
4412        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
4413        assert_eq_m128i(r, e);
4414    }
4415
4416    #[simd_test(enable = "sse2")]
4417    const fn test_mm_shufflelo_epi16() {
            // Shuffles only the LOW four i16 lanes; the high four pass through.
4418        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
4419        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
4420        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
4421        assert_eq_m128i(r, e);
4422    }
4423
        // Interleave (unpack) tests: `unpackhi` interleaves the high halves
        // of the two operands, `unpacklo` the low halves, alternating
        // a-lane, b-lane.
4424    #[simd_test(enable = "sse2")]
4425    const fn test_mm_unpackhi_epi8() {
4426        #[rustfmt::skip]
4427        let a = _mm_setr_epi8(
4428            0, 1, 2, 3, 4, 5, 6, 7,
4429            8, 9, 10, 11, 12, 13, 14, 15,
4430        );
4431        #[rustfmt::skip]
4432        let b = _mm_setr_epi8(
4433            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4434        );
4435        let r = _mm_unpackhi_epi8(a, b);
            // High 8 bytes of a (8..=15) interleaved with high 8 of b (24..=31).
4436        #[rustfmt::skip]
4437        let e = _mm_setr_epi8(
4438            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
4439        );
4440        assert_eq_m128i(r, e);
4441    }
4442
4443    #[simd_test(enable = "sse2")]
4444    const fn test_mm_unpackhi_epi16() {
            // High four i16 lanes interleaved: a4,b4,a5,b5,...
4445        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4446        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4447        let r = _mm_unpackhi_epi16(a, b);
4448        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
4449        assert_eq_m128i(r, e);
4450    }
4451
4452    #[simd_test(enable = "sse2")]
4453    const fn test_mm_unpackhi_epi32() {
            // High two i32 lanes interleaved: a2,b2,a3,b3.
4454        let a = _mm_setr_epi32(0, 1, 2, 3);
4455        let b = _mm_setr_epi32(4, 5, 6, 7);
4456        let r = _mm_unpackhi_epi32(a, b);
4457        let e = _mm_setr_epi32(2, 6, 3, 7);
4458        assert_eq_m128i(r, e);
4459    }
4460
4461    #[simd_test(enable = "sse2")]
4462    const fn test_mm_unpackhi_epi64() {
            // High i64 lane of each operand: a1,b1.
4463        let a = _mm_setr_epi64x(0, 1);
4464        let b = _mm_setr_epi64x(2, 3);
4465        let r = _mm_unpackhi_epi64(a, b);
4466        let e = _mm_setr_epi64x(1, 3);
4467        assert_eq_m128i(r, e);
4468    }
4469
4470    #[simd_test(enable = "sse2")]
4471    const fn test_mm_unpacklo_epi8() {
4472        #[rustfmt::skip]
4473        let a = _mm_setr_epi8(
4474            0, 1, 2, 3, 4, 5, 6, 7,
4475            8, 9, 10, 11, 12, 13, 14, 15,
4476        );
4477        #[rustfmt::skip]
4478        let b = _mm_setr_epi8(
4479            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4480        );
4481        let r = _mm_unpacklo_epi8(a, b);
            // Low 8 bytes of a (0..=7) interleaved with low 8 of b (16..=23).
4482        #[rustfmt::skip]
4483        let e = _mm_setr_epi8(
4484            0, 16, 1, 17, 2, 18, 3, 19,
4485            4, 20, 5, 21, 6, 22, 7, 23,
4486        );
4487        assert_eq_m128i(r, e);
4488    }
4489
4490    #[simd_test(enable = "sse2")]
4491    const fn test_mm_unpacklo_epi16() {
            // Low four i16 lanes interleaved: a0,b0,a1,b1,...
4492        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4493        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4494        let r = _mm_unpacklo_epi16(a, b);
4495        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
4496        assert_eq_m128i(r, e);
4497    }
4498
4499    #[simd_test(enable = "sse2")]
4500    const fn test_mm_unpacklo_epi32() {
            // Low two i32 lanes interleaved: a0,b0,a1,b1.
4501        let a = _mm_setr_epi32(0, 1, 2, 3);
4502        let b = _mm_setr_epi32(4, 5, 6, 7);
4503        let r = _mm_unpacklo_epi32(a, b);
4504        let e = _mm_setr_epi32(0, 4, 1, 5);
4505        assert_eq_m128i(r, e);
4506    }
4507
4508    #[simd_test(enable = "sse2")]
4509    const fn test_mm_unpacklo_epi64() {
            // Low i64 lane of each operand: a0,b0.
4510        let a = _mm_setr_epi64x(0, 1);
4511        let b = _mm_setr_epi64x(2, 3);
4512        let r = _mm_unpacklo_epi64(a, b);
4513        let e = _mm_setr_epi64x(0, 2);
4514        assert_eq_m128i(r, e);
4515    }
4516
4517    #[simd_test(enable = "sse2")]
4518    const fn test_mm_add_sd() {
4519        let a = _mm_setr_pd(1.0, 2.0);
4520        let b = _mm_setr_pd(5.0, 10.0);
4521        let r = _mm_add_sd(a, b);
4522        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4523    }
4524
4525    #[simd_test(enable = "sse2")]
4526    const fn test_mm_add_pd() {
4527        let a = _mm_setr_pd(1.0, 2.0);
4528        let b = _mm_setr_pd(5.0, 10.0);
4529        let r = _mm_add_pd(a, b);
4530        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4531    }
4532
4533    #[simd_test(enable = "sse2")]
4534    const fn test_mm_div_sd() {
4535        let a = _mm_setr_pd(1.0, 2.0);
4536        let b = _mm_setr_pd(5.0, 10.0);
4537        let r = _mm_div_sd(a, b);
4538        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4539    }
4540
4541    #[simd_test(enable = "sse2")]
4542    const fn test_mm_div_pd() {
4543        let a = _mm_setr_pd(1.0, 2.0);
4544        let b = _mm_setr_pd(5.0, 10.0);
4545        let r = _mm_div_pd(a, b);
4546        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4547    }
4548
    // `_mm_max_sd`/`_mm_min_sd` operate on the low lane only and pass the high
    // lane of `a` through unchanged; the `_pd` variants operate on both lanes.
    #[simd_test(enable = "sse2")]
    fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_max_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_max_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        // `maxpd` returns the second operand when the inputs compare equal,
        // so with -0.0 vs 0.0 the result depends on operand order.
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        // Cast to __m128i to compare exact bit patterns
        let r1 = _mm_castpd_si128(_mm_min_pd(a, b));
        let r2 = _mm_castpd_si128(_mm_min_pd(b, a));
        let a = _mm_castpd_si128(a);
        let b = _mm_castpd_si128(b);
        // Like `maxpd`, `minpd` returns the second operand on equal inputs.
        assert_eq_m128i(r1, b);
        assert_eq_m128i(r2, a);
        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
    }
4604
    // Arithmetic tests: `_sd` variants affect the low lane only (high lane of
    // `a` passes through); `_pd` variants are element-wise over both lanes.
    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        // sqrt of `b`'s low lane, `a`'s high lane passed through.
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }
4650
    // Bitwise ops on f64 lanes: build inputs from integer bit patterns
    // (5 = 0b101, 3 = 0b011) so the expected results are exact.
    #[simd_test(enable = "sse2")]
    const fn test_mm_and_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_and_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(1)).as_m128d(); // 5 & 3 == 1
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_andnot_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_andnot_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(2)).as_m128d(); // !5 & 3 == 2
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_or_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_or_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(7)).as_m128d(); // 5 | 3 == 7
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_xor_pd() {
        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
        let r = _mm_xor_pd(a, b);
        let e = f64x2::from_bits(u64x2::splat(6)).as_m128d(); // 5 ^ 3 == 6
        assert_eq_m128d(r, e);
    }
4686
    // Scalar compares: the low lane becomes an all-ones (!0) or all-zeros
    // mask; the high lane is `a`'s high lane passed through bit-for-bit
    // (hence `2.0f64.to_bits()` in every expected value).
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    // "ord" is true when neither operand is NaN; "unord" when either is.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    // The "n*" predicates are the negations of the ordered compares.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = _mm_castpd_si128(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }
4782
    // Packed compares: each lane independently becomes an all-ones (!0) or
    // all-zeros mask, checked via the integer view of the result.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    // "ord"/"unord" detect NaN in either operand, per lane.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = _mm_castpd_si128(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = _mm_castpd_si128(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    // The "n*" predicates are the negations of the ordered compares.
    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = _mm_castpd_si128(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = _mm_castpd_si128(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }
4878
    // `comi*`/`ucomi*` compare the low lanes and return an integer flag
    // (non-zero = predicate holds). With a NaN operand the predicate is false.
    #[simd_test(enable = "sse2")]
    fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }
4956
    // `_mm_movemask_pd` packs the two sign bits into bits 0 and 1 of the result.
    #[simd_test(enable = "sse2")]
    const fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

    // 16-byte-aligned backing storage shared by the aligned load/store tests
    // below; the extra elements allow deliberately misaligned offsets.
    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }
4970
    // Aligned and partial loads.
    #[simd_test(enable = "sse2")]
    const fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        // `Memory` is 16-byte aligned, satisfying `_mm_load_pd`'s requirement.
        let r = unsafe { _mm_load_pd(d) };
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_sd() {
        let a = 1.;
        // Low lane loaded, high lane zeroed.
        let expected = _mm_setr_pd(a, 0.);
        let r = unsafe { _mm_load_sd(&a) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        // High lane replaced by the loaded value, low lane kept from `a`.
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = unsafe { _mm_loadh_pd(a, &b) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        // Low lane replaced by the loaded value, high lane kept from `a`.
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = unsafe { _mm_loadl_pd(a, &b) };
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        unsafe {
            _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        }
        // Fence so the non-temporal store is visible before reading back.
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }
5029
    // Aligned, unaligned, and partial stores.
    #[simd_test(enable = "sse2")]
    const fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_store_sd(&mut dest, a);
        }
        // Only the low lane is written.
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store_pd(d, *black_box(&a));
        }
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        // so p is *not* aligned to 16 bytes
        unsafe {
            let p = vals.as_mut_ptr().offset(1);
            _mm_storeu_pd(p, *black_box(&a));
        }

        assert_eq!(*vals, [0.0, 1.0, 2.0, 0.0]);
    }

    // The `storeu_siN` intrinsics write only the low N bits; the rest of the
    // destination must be left untouched.
    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        unsafe {
            _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        unsafe {
            _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        unsafe {
            _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        }
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }
5102
    // Broadcast, reversed, and half-register stores.
    #[simd_test(enable = "sse2")]
    const fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store1_pd(d, *black_box(&a));
        }
        // Low lane is duplicated into both slots.
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    // `_mm_store_pd1` is an alias of `_mm_store1_pd`.
    #[simd_test(enable = "sse2")]
    const fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_store_pd1(d, *black_box(&a));
        }
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        unsafe {
            _mm_storer_pd(d, *black_box(&a));
        }
        // Lanes are stored in reversed order.
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_storeh_pd(&mut dest, a);
        }
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        unsafe {
            _mm_storel_pd(&mut dest, a);
        }
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }
5164
    // Reversed and unaligned loads; the `loadu_siN` intrinsics read only the
    // low N bits and zero the rest of the register.
    #[simd_test(enable = "sse2")]
    const fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = unsafe { _mm_loadr_pd(d) };
        // Lanes come back in reversed order.
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_pd() {
        // guaranteed to be aligned to 16 bytes
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;

        // so this will *not* be aligned to 16 bytes
        let d = unsafe { vals.as_ptr().offset(1) };

        let r = unsafe { _mm_loadu_pd(d) };
        let e = _mm_setr_pd(2.0, 3.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = unsafe { _mm_loadu_si16(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = unsafe { _mm_loadu_si32(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = unsafe { _mm_loadu_si64(ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }
5213
    // Conversions between f64, f32, and i32 lanes, including the overflow and
    // NaN cases where the hardware produces infinity or the integer
    // indefinite value (i32::MIN).
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtpd_ps() {
        // The upper two f32 lanes of the result are zeroed.
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        // Values outside f32 range round to +/- infinity.
        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtps_pd() {
        // Only the low two f32 lanes are converted.
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        // Out-of-range, infinite, and NaN inputs all convert to the
        // "integer indefinite" value, i32::MIN.
        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        // Low lane converted from `b`; the other three lanes come from `a`.
        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }
5297
    // Scalar extraction and truncating ("cvtt") conversions; truncation
    // rounds toward zero and yields i32::MIN for unrepresentable inputs.
    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // Low lane converted from `b`'s low f32; high lane kept from `a`.
        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }
5351
    // Constructors: `set*` build vectors from scalars; note `_mm_set_pd`
    // takes arguments in high-to-low order while `_mm_setr_pd` is low-to-high.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    // `_mm_set_pd1` is an alias of `_mm_set1_pd`.
    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    // Broadcast loads: one f64 duplicated into both lanes.
    #[simd_test(enable = "sse2")]
    const fn test_mm_load1_pd() {
        let d = -5.0;
        let r = unsafe { _mm_load1_pd(&d) };
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_load_pd1() {
        let d = -5.0;
        let r = unsafe { _mm_load_pd1(&d) };
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
5401
    // Lane shuffling: unpack interleaves matching lanes; shuffle selects
    // lanes by immediate; move_sd replaces the low lane.
    #[simd_test(enable = "sse2")]
    const fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        // imm = 0: lane 0 of `a` into the low lane, lane 0 of `b` into the high.
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        // Low lane from `b`, high lane from `a`.
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }
5435
    // Casts are pure reinterpretations (no conversion); all-zero inputs make
    // the expected values trivially comparable across vector types.
    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    const fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
5483}