// core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    unsafe { pause() }
}

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order,
/// the load fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_lfence() {
    unsafe { lfence() }
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mfence() {
    unsafe { mfence() }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
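///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // 100 + 100 = 200 overflows `i8`, so every lane saturates to 127.
///         let r = _mm_adds_epi8(_mm_set1_epi8(100), _mm_set1_epi8(100));
///         let r: [i8; 16] = core::mem::transmute(r);
///         assert!(r.iter().all(|&x| x == i8::MAX));
///     }
/// }
/// ```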
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
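///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; the average rounds up, i.e. it computes
/// `(a + b + 1) >> 1`:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // (0 + 1 + 1) >> 1 == 1 in every lane: the midpoint rounds up.
///         let r = _mm_avg_epu8(_mm_set1_epi8(0), _mm_set1_epi8(1));
///         let r: [u8; 16] = core::mem::transmute(r);
///         assert!(r.iter().all(|&x| x == 1));
///     }
/// }
/// ```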
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
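///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // Each 16-bit product is 2 * 3 = 6; adjacent pairs sum to 12.
///         let r = _mm_madd_epi16(_mm_set1_epi16(2), _mm_set1_epi16(3));
///         let r: [i32; 4] = core::mem::transmute(r);
///         assert_eq!(r, [12; 4]);
///     }
/// }
/// ```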
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
        let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
        let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
        simd_add(even, odd).as_m128i()
    }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
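///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // 16384 * 4 = 65536 = 0x1_0000; the high 16 bits are 1.
///         let r = _mm_mulhi_epi16(_mm_set1_epi16(16384), _mm_set1_epi16(4));
///         let r: [i16; 8] = core::mem::transmute(r);
///         assert_eq!(r, [1; 8]);
///     }
/// }
/// ```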
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
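///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // Only the low 32 bits of each 64-bit lane take part:
///         // 3 * 4 = 12 in the low lane, 2 * 5 = 10 in the high lane.
///         let a = _mm_set_epi32(0, 2, 0, 3);
///         let b = _mm_set_epi32(0, 5, 0, 4);
///         let r: [u64; 2] = core::mem::transmute(_mm_mul_epu32(a, b));
///         assert_eq!(r, [12, 10]);
///     }
/// }
/// ```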
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX.into());
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive group of 8 differences to
/// produce two unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers into the low 16 bits of the two 64-bit elements returned.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
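///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // |5 - 2| = 3 per byte; each group of 8 bytes sums to 24.
///         let r = _mm_sad_epu8(_mm_set1_epi8(5), _mm_set1_epi8(2));
///         let r: [u64; 2] = core::mem::transmute(r);
///         assert_eq!(r, [24, 24]);
///     }
/// }
/// ```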
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
/// 16-bit integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
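///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; "left" means toward the more significant bytes:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///         // Every byte moves up one position; byte 0 becomes zero.
///         let r: [i8; 16] = core::mem::transmute(_mm_slli_si128::<1>(a));
///         assert_eq!(r, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
///     }
/// }
/// ```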
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
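///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; shift counts greater than 15 are clamped to 15:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // Arithmetic shift keeps the sign: -16 >> 2 == -4.
///         let r = _mm_srai_epi16::<2>(_mm_set1_epi16(-16));
///         let r: [i16; 8] = core::mem::transmute(r);
///         assert_eq!(r, [-4; 8]);
///     }
/// }
/// ```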
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
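///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         // (!0b1100) & 0b1010 == 0b0010 in every 32-bit lane.
///         let r = _mm_andnot_si128(_mm_set1_epi32(0b1100), _mm_set1_epi32(0b1010));
///         let r: [i32; 4] = core::mem::transmute(r);
///         assert_eq!(r, [0b0010; 4]);
///     }
/// }
/// ```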
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
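///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; each lane is all ones (`-1`) where the comparison
/// holds and all zeros where it does not:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         let r = _mm_cmpgt_epi8(_mm_set1_epi8(2), _mm_set1_epi8(1));
///         let r: [i8; 16] = core::mem::transmute(r);
///         assert_eq!(r, [-1; 16]);
///     }
/// }
/// ```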
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
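///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; the arguments run from the highest element `e3` down
/// to the lowest element `e0`:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         let v = _mm_set_epi32(4, 3, 2, 1);
///         // The last argument, `e0`, lands in the lowest lane.
///         assert_eq!(_mm_cvtsi128_si32(v), 1);
///     }
/// }
/// ```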
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
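///
/// # Examples
///
/// A minimal usage sketch (not part of the original source), assuming SSE2 is
/// available at runtime; unlike `_mm_load_si128`, the pointer may be
/// unaligned:
///
/// ```
/// # use std::arch::x86_64::*;
/// if is_x86_feature_detected!("sse2") {
///     unsafe {
///         let bytes: [u8; 16] = [7; 16];
///         let v = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
///         let r: [u8; 16] = core::mem::transmute(v);
///         assert_eq!(r, bytes);
///     }
/// }
/// ```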
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}

/// Conditionally stores 8-bit integer elements from `a` into memory using
/// `mask`; the store is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// An element is not stored when the highest bit of the corresponding `mask`
/// element is not set.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}

/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
1374
1375/// Stores a 32-bit integer value in the specified memory location.
1376/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1377/// used again soon).
1378///
1379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1380///
1381/// # Safety of non-temporal stores
1382///
1383/// After using this intrinsic, but before any other access to the memory that this intrinsic
1384/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1385/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1386/// return.
1387///
1388/// See [`_mm_sfence`] for details.
1389#[inline]
1390#[target_feature(enable = "sse2")]
1391#[cfg_attr(test, assert_instr(movnti))]
1392#[stable(feature = "simd_x86", since = "1.27.0")]
1393pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1394    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1395    crate::arch::asm!(
1396        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1397        p = in(reg) mem_addr,
1398        a = in(reg) a,
1399        options(nostack, preserves_flags),
1400    );
1401}
1402
1403/// Returns a vector where the low element is extracted from `a` and its upper
1404/// element is zero.
1405///
1406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1407#[inline]
1408#[target_feature(enable = "sse2")]
1409// FIXME movd on msvc, movd on i686
1410#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1411#[stable(feature = "simd_x86", since = "1.27.0")]
1412pub fn _mm_move_epi64(a: __m128i) -> __m128i {
1413    unsafe {
1414        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1415        transmute(r)
1416    }
1417}
1418
1419/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1420/// using signed saturation.
1421///
1422/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
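///
/// # Examples
///
/// A small sketch of the saturating behavior (illustrative values; assumes an
/// x86_64 target):
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set1_epi16(300);  // too large for i8, saturates to 127
/// let b = _mm_set1_epi16(-300); // too small for i8, saturates to -128
/// let r = _mm_packs_epi16(a, b);
/// // The low 8 bytes come from `a`, the high 8 bytes from `b`.
/// assert_eq!(_mm_extract_epi16::<0>(r), 0x7F7F);
/// assert_eq!(_mm_extract_epi16::<4>(r), 0x8080);
/// # }
/// ```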
1423#[inline]
1424#[target_feature(enable = "sse2")]
1425#[cfg_attr(test, assert_instr(packsswb))]
1426#[stable(feature = "simd_x86", since = "1.27.0")]
1427pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1428    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
1429}
1430
1431/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1432/// using signed saturation.
1433///
1434/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1435#[inline]
1436#[target_feature(enable = "sse2")]
1437#[cfg_attr(test, assert_instr(packssdw))]
1438#[stable(feature = "simd_x86", since = "1.27.0")]
1439pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1440    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1441}
1442
1443/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1444/// using unsigned saturation.
1445///
1446/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1447#[inline]
1448#[target_feature(enable = "sse2")]
1449#[cfg_attr(test, assert_instr(packuswb))]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1452    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1453}
1454
1455/// Returns the `imm8` element of `a`.
1456///
1457/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
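///
/// # Examples
///
/// A short sketch (illustrative values; assumes an x86_64 target). Note that
/// the 16-bit element is zero-extended into the `i32` result:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, -1, 0);
/// assert_eq!(_mm_extract_epi16::<0>(a), 0);
/// assert_eq!(_mm_extract_epi16::<1>(a), 0xFFFF); // -1 zero-extended
/// # }
/// ```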
1458#[inline]
1459#[target_feature(enable = "sse2")]
1460#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1461#[rustc_legacy_const_generics(1)]
1462#[stable(feature = "simd_x86", since = "1.27.0")]
1463pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1464    static_assert_uimm_bits!(IMM8, 3);
1465    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1466}
1467
1468/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
1469///
1470/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1471#[inline]
1472#[target_feature(enable = "sse2")]
1473#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1474#[rustc_legacy_const_generics(2)]
1475#[stable(feature = "simd_x86", since = "1.27.0")]
1476pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1477    static_assert_uimm_bits!(IMM8, 3);
1478    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1479}
1480
1481/// Returns a mask of the most significant bit of each element in `a`.
1482///
1483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
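///
/// # Examples
///
/// A small sketch (illustrative values; assumes an x86_64 target). Bit `i` of
/// the result is the sign bit of byte `i`, with byte 0 in the least
/// significant position:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
/// assert_eq!(_mm_movemask_epi8(a), 0b1000_0000_0000_0001);
/// # }
/// ```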
1484#[inline]
1485#[target_feature(enable = "sse2")]
1486#[cfg_attr(test, assert_instr(pmovmskb))]
1487#[stable(feature = "simd_x86", since = "1.27.0")]
1488pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
1489    unsafe {
1490        let z = i8x16::ZERO;
1491        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1492        simd_bitmask::<_, u16>(m) as u32 as i32
1493    }
1494}
1495
1496/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1497///
1498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
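///
/// # Examples
///
/// Each 2-bit field of `IMM8` selects the source lane for one output lane
/// (bits 1:0 for lane 0, up to bits 7:6 for lane 3). A short sketch
/// (illustrative values; assumes an x86_64 target):
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_epi32(3, 2, 1, 0); // lanes, low to high: 0, 1, 2, 3
/// // 0b00_01_10_11 selects lanes [3, 2, 1, 0], i.e. it reverses the vector.
/// let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
/// assert_eq!(_mm_cvtsi128_si32(r), 3); // new low lane
/// # }
/// ```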
1499#[inline]
1500#[target_feature(enable = "sse2")]
1501#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1502#[rustc_legacy_const_generics(1)]
1503#[stable(feature = "simd_x86", since = "1.27.0")]
1504pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1505    static_assert_uimm_bits!(IMM8, 8);
1506    unsafe {
1507        let a = a.as_i32x4();
1508        let x: i32x4 = simd_shuffle!(
1509            a,
1510            a,
1511            [
1512                IMM8 as u32 & 0b11,
1513                (IMM8 as u32 >> 2) & 0b11,
1514                (IMM8 as u32 >> 4) & 0b11,
1515                (IMM8 as u32 >> 6) & 0b11,
1516            ],
1517        );
1518        transmute(x)
1519    }
1520}
1521
1522/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1523/// `IMM8`.
1524///
1525/// Puts the results in the high 64 bits of the returned vector, with the low 64
1526/// bits being copied from `a`.
1527///
1528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1529#[inline]
1530#[target_feature(enable = "sse2")]
1531#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1532#[rustc_legacy_const_generics(1)]
1533#[stable(feature = "simd_x86", since = "1.27.0")]
1534pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1535    static_assert_uimm_bits!(IMM8, 8);
1536    unsafe {
1537        let a = a.as_i16x8();
1538        let x: i16x8 = simd_shuffle!(
1539            a,
1540            a,
1541            [
1542                0,
1543                1,
1544                2,
1545                3,
1546                (IMM8 as u32 & 0b11) + 4,
1547                ((IMM8 as u32 >> 2) & 0b11) + 4,
1548                ((IMM8 as u32 >> 4) & 0b11) + 4,
1549                ((IMM8 as u32 >> 6) & 0b11) + 4,
1550            ],
1551        );
1552        transmute(x)
1553    }
1554}
1555
1556/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1557/// `IMM8`.
1558///
1559/// Puts the results in the low 64 bits of the returned vector, with the high 64
1560/// bits being copied from `a`.
1561///
1562/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1563#[inline]
1564#[target_feature(enable = "sse2")]
1565#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1566#[rustc_legacy_const_generics(1)]
1567#[stable(feature = "simd_x86", since = "1.27.0")]
1568pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1569    static_assert_uimm_bits!(IMM8, 8);
1570    unsafe {
1571        let a = a.as_i16x8();
1572        let x: i16x8 = simd_shuffle!(
1573            a,
1574            a,
1575            [
1576                IMM8 as u32 & 0b11,
1577                (IMM8 as u32 >> 2) & 0b11,
1578                (IMM8 as u32 >> 4) & 0b11,
1579                (IMM8 as u32 >> 6) & 0b11,
1580                4,
1581                5,
1582                6,
1583                7,
1584            ],
1585        );
1586        transmute(x)
1587    }
1588}
1589
1590/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
1591///
1592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1593#[inline]
1594#[target_feature(enable = "sse2")]
1595#[cfg_attr(test, assert_instr(punpckhbw))]
1596#[stable(feature = "simd_x86", since = "1.27.0")]
1597pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1598    unsafe {
1599        transmute::<i8x16, _>(simd_shuffle!(
1600            a.as_i8x16(),
1601            b.as_i8x16(),
1602            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1603        ))
1604    }
1605}
1606
1607/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
1608///
1609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1610#[inline]
1611#[target_feature(enable = "sse2")]
1612#[cfg_attr(test, assert_instr(punpckhwd))]
1613#[stable(feature = "simd_x86", since = "1.27.0")]
1614pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1615    unsafe {
1616        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1617        transmute::<i16x8, _>(x)
1618    }
1619}
1620
1621/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
1622///
1623/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1624#[inline]
1625#[target_feature(enable = "sse2")]
1626#[cfg_attr(test, assert_instr(unpckhps))]
1627#[stable(feature = "simd_x86", since = "1.27.0")]
1628pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1629    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1630}
1631
1632/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1635#[inline]
1636#[target_feature(enable = "sse2")]
1637#[cfg_attr(test, assert_instr(unpckhpd))]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1640    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1641}
1642
1643/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
1644///
1645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
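///
/// # Examples
///
/// A small sketch of the interleaving order (illustrative values; assumes an
/// x86_64 target):
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set1_epi8(1);
/// let b = _mm_set1_epi8(2);
/// // Bytes alternate a0, b0, a1, b1, ... taken from the low halves.
/// let r = _mm_unpacklo_epi8(a, b);
/// assert_eq!(_mm_extract_epi16::<0>(r), 0x0201); // little-endian pair (1, 2)
/// # }
/// ```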
1646#[inline]
1647#[target_feature(enable = "sse2")]
1648#[cfg_attr(test, assert_instr(punpcklbw))]
1649#[stable(feature = "simd_x86", since = "1.27.0")]
1650pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1651    unsafe {
1652        transmute::<i8x16, _>(simd_shuffle!(
1653            a.as_i8x16(),
1654            b.as_i8x16(),
1655            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1656        ))
1657    }
1658}
1659
1660/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
1661///
1662/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1663#[inline]
1664#[target_feature(enable = "sse2")]
1665#[cfg_attr(test, assert_instr(punpcklwd))]
1666#[stable(feature = "simd_x86", since = "1.27.0")]
1667pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1668    unsafe {
1669        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1670        transmute::<i16x8, _>(x)
1671    }
1672}
1673
1674/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
1675///
1676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1677#[inline]
1678#[target_feature(enable = "sse2")]
1679#[cfg_attr(test, assert_instr(unpcklps))]
1680#[stable(feature = "simd_x86", since = "1.27.0")]
1681pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1682    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1683}
1684
1685/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
1686///
1687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1688#[inline]
1689#[target_feature(enable = "sse2")]
1690#[cfg_attr(test, assert_instr(movlhps))]
1691#[stable(feature = "simd_x86", since = "1.27.0")]
1692pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1693    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1694}
1695
1696/// Returns a new vector with the low element of `a` replaced by the sum of the
1697/// low elements of `a` and `b`.
1698///
1699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
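///
/// # Examples
///
/// A short sketch (illustrative values; assumes an x86_64 target). Only the
/// low lane is summed; the high lane is passed through from `a`:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_pd(10.0, 1.0); // (high, low)
/// let b = _mm_set_pd(20.0, 2.0);
/// let r = _mm_add_sd(a, b);
/// assert_eq!(_mm_cvtsd_f64(r), 3.0); // 1.0 + 2.0
/// # }
/// ```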
1700#[inline]
1701#[target_feature(enable = "sse2")]
1702#[cfg_attr(test, assert_instr(addsd))]
1703#[stable(feature = "simd_x86", since = "1.27.0")]
1704pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1705    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1706}
1707
1708/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1709/// `b`.
1710///
1711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1712#[inline]
1713#[target_feature(enable = "sse2")]
1714#[cfg_attr(test, assert_instr(addpd))]
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1717    unsafe { simd_add(a, b) }
1718}
1719
1720/// Returns a new vector with the low element of `a` replaced by the result of
1721/// dividing the lower element of `a` by the lower element of `b`.
1722///
1723/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1724#[inline]
1725#[target_feature(enable = "sse2")]
1726#[cfg_attr(test, assert_instr(divsd))]
1727#[stable(feature = "simd_x86", since = "1.27.0")]
1728pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1729    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1730}
1731
1732/// Divides packed double-precision (64-bit) floating-point elements in `a` by
1733/// packed elements in `b`.
1734///
1735/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1736#[inline]
1737#[target_feature(enable = "sse2")]
1738#[cfg_attr(test, assert_instr(divpd))]
1739#[stable(feature = "simd_x86", since = "1.27.0")]
1740pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1741    unsafe { simd_div(a, b) }
1742}
1743
1744/// Returns a new vector with the low element of `a` replaced by the maximum
1745/// of the lower elements of `a` and `b`.
1746///
1747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1748#[inline]
1749#[target_feature(enable = "sse2")]
1750#[cfg_attr(test, assert_instr(maxsd))]
1751#[stable(feature = "simd_x86", since = "1.27.0")]
1752pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1753    unsafe { maxsd(a, b) }
1754}
1755
1756/// Returns a new vector with the maximum values from corresponding elements in
1757/// `a` and `b`.
1758///
1759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1760#[inline]
1761#[target_feature(enable = "sse2")]
1762#[cfg_attr(test, assert_instr(maxpd))]
1763#[stable(feature = "simd_x86", since = "1.27.0")]
1764pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1765    unsafe { maxpd(a, b) }
1766}
1767
1768/// Returns a new vector with the low element of `a` replaced by the minimum
1769/// of the lower elements of `a` and `b`.
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1772#[inline]
1773#[target_feature(enable = "sse2")]
1774#[cfg_attr(test, assert_instr(minsd))]
1775#[stable(feature = "simd_x86", since = "1.27.0")]
1776pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1777    unsafe { minsd(a, b) }
1778}
1779
1780/// Returns a new vector with the minimum values from corresponding elements in
1781/// `a` and `b`.
1782///
1783/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1784#[inline]
1785#[target_feature(enable = "sse2")]
1786#[cfg_attr(test, assert_instr(minpd))]
1787#[stable(feature = "simd_x86", since = "1.27.0")]
1788pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1789    unsafe { minpd(a, b) }
1790}
1791
1792/// Returns a new vector with the low element of `a` replaced by multiplying the
1793/// low elements of `a` and `b`.
1794///
1795/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1796#[inline]
1797#[target_feature(enable = "sse2")]
1798#[cfg_attr(test, assert_instr(mulsd))]
1799#[stable(feature = "simd_x86", since = "1.27.0")]
1800pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1801    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1802}
1803
1804/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1805/// and `b`.
1806///
1807/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1808#[inline]
1809#[target_feature(enable = "sse2")]
1810#[cfg_attr(test, assert_instr(mulpd))]
1811#[stable(feature = "simd_x86", since = "1.27.0")]
1812pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1813    unsafe { simd_mul(a, b) }
1814}
1815
1816/// Returns a new vector with the low element of `a` replaced by the square
1817/// root of the lower element of `b`.
1818///
1819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
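///
/// # Examples
///
/// A short sketch of the two-operand form (illustrative values; assumes an
/// x86_64 target): the square root is taken from `b`, the high lane from `a`:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_pd(10.0, 1.0);
/// let b = _mm_set_sd(9.0);
/// let r = _mm_sqrt_sd(a, b);
/// assert_eq!(_mm_cvtsd_f64(r), 3.0); // sqrt(9.0)
/// # }
/// ```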
1820#[inline]
1821#[target_feature(enable = "sse2")]
1822#[cfg_attr(test, assert_instr(sqrtsd))]
1823#[stable(feature = "simd_x86", since = "1.27.0")]
1824pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1825    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1826}
1827
1828/// Returns a new vector with the square root of each of the values in `a`.
1829///
1830/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1831#[inline]
1832#[target_feature(enable = "sse2")]
1833#[cfg_attr(test, assert_instr(sqrtpd))]
1834#[stable(feature = "simd_x86", since = "1.27.0")]
1835pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1836    unsafe { simd_fsqrt(a) }
1837}
1838
1839/// Returns a new vector with the low element of `a` replaced by subtracting the
1840/// low element of `b` from the low element of `a`.
1841///
1842/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1843#[inline]
1844#[target_feature(enable = "sse2")]
1845#[cfg_attr(test, assert_instr(subsd))]
1846#[stable(feature = "simd_x86", since = "1.27.0")]
1847pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1848    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1849}
1850
1851/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
1852/// from `a`.
1853///
1854/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1855#[inline]
1856#[target_feature(enable = "sse2")]
1857#[cfg_attr(test, assert_instr(subpd))]
1858#[stable(feature = "simd_x86", since = "1.27.0")]
1859pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1860    unsafe { simd_sub(a, b) }
1861}
1862
1863/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1864/// elements in `a` and `b`.
1865///
1866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
1867#[inline]
1868#[target_feature(enable = "sse2")]
1869#[cfg_attr(test, assert_instr(andps))]
1870#[stable(feature = "simd_x86", since = "1.27.0")]
1871pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1872    unsafe {
1873        let a: __m128i = transmute(a);
1874        let b: __m128i = transmute(b);
1875        transmute(_mm_and_si128(a, b))
1876    }
1877}
1878
1879/// Computes the bitwise NOT of `a` and then AND with `b`.
1880///
1881/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1882#[inline]
1883#[target_feature(enable = "sse2")]
1884#[cfg_attr(test, assert_instr(andnps))]
1885#[stable(feature = "simd_x86", since = "1.27.0")]
1886pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1887    unsafe {
1888        let a: __m128i = transmute(a);
1889        let b: __m128i = transmute(b);
1890        transmute(_mm_andnot_si128(a, b))
1891    }
1892}
1893
1894/// Computes the bitwise OR of `a` and `b`.
1895///
1896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1897#[inline]
1898#[target_feature(enable = "sse2")]
1899#[cfg_attr(test, assert_instr(orps))]
1900#[stable(feature = "simd_x86", since = "1.27.0")]
1901pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1902    unsafe {
1903        let a: __m128i = transmute(a);
1904        let b: __m128i = transmute(b);
1905        transmute(_mm_or_si128(a, b))
1906    }
1907}
1908
1909/// Computes the bitwise XOR of `a` and `b`.
1910///
1911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
1912#[inline]
1913#[target_feature(enable = "sse2")]
1914#[cfg_attr(test, assert_instr(xorps))]
1915#[stable(feature = "simd_x86", since = "1.27.0")]
1916pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
1917    unsafe {
1918        let a: __m128i = transmute(a);
1919        let b: __m128i = transmute(b);
1920        transmute(_mm_xor_si128(a, b))
1921    }
1922}
1923
1924/// Returns a new vector with the low element of `a` replaced by the equality
1925/// comparison of the lower elements of `a` and `b`.
1926///
1927/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
1928#[inline]
1929#[target_feature(enable = "sse2")]
1930#[cfg_attr(test, assert_instr(cmpeqsd))]
1931#[stable(feature = "simd_x86", since = "1.27.0")]
1932pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
1933    unsafe { cmpsd(a, b, 0) }
1934}
1935
1936/// Returns a new vector with the low element of `a` replaced by the less-than
1937/// comparison of the lower elements of `a` and `b`.
1938///
1939/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
1940#[inline]
1941#[target_feature(enable = "sse2")]
1942#[cfg_attr(test, assert_instr(cmpltsd))]
1943#[stable(feature = "simd_x86", since = "1.27.0")]
1944pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
1945    unsafe { cmpsd(a, b, 1) }
1946}
1947
1948/// Returns a new vector with the low element of `a` replaced by the
1949/// less-than-or-equal comparison of the lower elements of `a` and `b`.
1950///
1951/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
1952#[inline]
1953#[target_feature(enable = "sse2")]
1954#[cfg_attr(test, assert_instr(cmplesd))]
1955#[stable(feature = "simd_x86", since = "1.27.0")]
1956pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
1957    unsafe { cmpsd(a, b, 2) }
1958}
1959
1960/// Returns a new vector with the low element of `a` replaced by the
1961/// greater-than comparison of the lower elements of `a` and `b`.
1962///
1963/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
1964#[inline]
1965#[target_feature(enable = "sse2")]
1966#[cfg_attr(test, assert_instr(cmpltsd))]
1967#[stable(feature = "simd_x86", since = "1.27.0")]
1968pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
1969    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1970}
1971
1972/// Returns a new vector with the low element of `a` replaced by the
1973/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
1974///
1975/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
1976#[inline]
1977#[target_feature(enable = "sse2")]
1978#[cfg_attr(test, assert_instr(cmplesd))]
1979#[stable(feature = "simd_x86", since = "1.27.0")]
1980pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
1981    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1982}
1983
1984/// Returns a new vector with the low element of `a` replaced by the result
1985/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
1986/// neither is equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
1987/// otherwise.
1988///
1989/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
1990#[inline]
1991#[target_feature(enable = "sse2")]
1992#[cfg_attr(test, assert_instr(cmpordsd))]
1993#[stable(feature = "simd_x86", since = "1.27.0")]
1994pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
1995    unsafe { cmpsd(a, b, 7) }
1996}
1997
1998/// Returns a new vector with the low element of `a` replaced by the result of
1999/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
2000/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
2001///
2002/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
2003#[inline]
2004#[target_feature(enable = "sse2")]
2005#[cfg_attr(test, assert_instr(cmpunordsd))]
2006#[stable(feature = "simd_x86", since = "1.27.0")]
2007pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
2008    unsafe { cmpsd(a, b, 3) }
2009}
2010
2011/// Returns a new vector with the low element of `a` replaced by the not-equal
2012/// comparison of the lower elements of `a` and `b`.
2013///
2014/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
2015#[inline]
2016#[target_feature(enable = "sse2")]
2017#[cfg_attr(test, assert_instr(cmpneqsd))]
2018#[stable(feature = "simd_x86", since = "1.27.0")]
2019pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2020    unsafe { cmpsd(a, b, 4) }
2021}
2022
2023/// Returns a new vector with the low element of `a` replaced by the
2024/// not-less-than comparison of the lower elements of `a` and `b`.
2025///
2026/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2027#[inline]
2028#[target_feature(enable = "sse2")]
2029#[cfg_attr(test, assert_instr(cmpnltsd))]
2030#[stable(feature = "simd_x86", since = "1.27.0")]
2031pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2032    unsafe { cmpsd(a, b, 5) }
2033}
2034
2035/// Returns a new vector with the low element of `a` replaced by the
2036/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2037///
2038/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2039#[inline]
2040#[target_feature(enable = "sse2")]
2041#[cfg_attr(test, assert_instr(cmpnlesd))]
2042#[stable(feature = "simd_x86", since = "1.27.0")]
2043pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2044    unsafe { cmpsd(a, b, 6) }
2045}
2046
2047/// Returns a new vector with the low element of `a` replaced by the
2048/// not-greater-than comparison of the lower elements of `a` and `b`.
2049///
2050/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2051#[inline]
2052#[target_feature(enable = "sse2")]
2053#[cfg_attr(test, assert_instr(cmpnltsd))]
2054#[stable(feature = "simd_x86", since = "1.27.0")]
2055pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2056    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2057}
2058
2059/// Returns a new vector with the low element of `a` replaced by the
2060/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2063#[inline]
2064#[target_feature(enable = "sse2")]
2065#[cfg_attr(test, assert_instr(cmpnlesd))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2068    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2069}
2070
2071/// Compares corresponding elements in `a` and `b` for equality.
2072///
2073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
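///
/// # Examples
///
/// A small sketch of the mask semantics (illustrative values; assumes an
/// x86_64 target): equal lanes become all-ones, others all-zeros:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let a = _mm_set_pd(1.0, 2.0);
/// let b = _mm_set_pd(1.0, 3.0);
/// let r = _mm_cmpeq_pd(a, b);
/// // Only the high lane compared equal, so only bit 1 of the mask is set.
/// assert_eq!(_mm_movemask_pd(r), 0b10);
/// # }
/// ```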
2074#[inline]
2075#[target_feature(enable = "sse2")]
2076#[cfg_attr(test, assert_instr(cmpeqpd))]
2077#[stable(feature = "simd_x86", since = "1.27.0")]
2078pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2079    unsafe { cmppd(a, b, 0) }
2080}
2081
2082/// Compares corresponding elements in `a` and `b` for less-than.
2083///
2084/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
2085#[inline]
2086#[target_feature(enable = "sse2")]
2087#[cfg_attr(test, assert_instr(cmpltpd))]
2088#[stable(feature = "simd_x86", since = "1.27.0")]
2089pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2090    unsafe { cmppd(a, b, 1) }
2091}
2092
2093/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
2094///
2095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2096#[inline]
2097#[target_feature(enable = "sse2")]
2098#[cfg_attr(test, assert_instr(cmplepd))]
2099#[stable(feature = "simd_x86", since = "1.27.0")]
2100pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2101    unsafe { cmppd(a, b, 2) }
2102}
2103
2104/// Compares corresponding elements in `a` and `b` for greater-than.
2105///
2106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2107#[inline]
2108#[target_feature(enable = "sse2")]
2109#[cfg_attr(test, assert_instr(cmpltpd))]
2110#[stable(feature = "simd_x86", since = "1.27.0")]
2111pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2112    _mm_cmplt_pd(b, a)
2113}
2114
2115/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2116///
2117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2118#[inline]
2119#[target_feature(enable = "sse2")]
2120#[cfg_attr(test, assert_instr(cmplepd))]
2121#[stable(feature = "simd_x86", since = "1.27.0")]
2122pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2123    _mm_cmple_pd(b, a)
2124}
2125
2126/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2127///
2128/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2129#[inline]
2130#[target_feature(enable = "sse2")]
2131#[cfg_attr(test, assert_instr(cmpordpd))]
2132#[stable(feature = "simd_x86", since = "1.27.0")]
2133pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2134    unsafe { cmppd(a, b, 7) }
2135}
2136
2137/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2138///
2139/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2140#[inline]
2141#[target_feature(enable = "sse2")]
2142#[cfg_attr(test, assert_instr(cmpunordpd))]
2143#[stable(feature = "simd_x86", since = "1.27.0")]
2144pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2145    unsafe { cmppd(a, b, 3) }
2146}
2147
2148/// Compares corresponding elements in `a` and `b` for not-equal.
2149///
2150/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2151#[inline]
2152#[target_feature(enable = "sse2")]
2153#[cfg_attr(test, assert_instr(cmpneqpd))]
2154#[stable(feature = "simd_x86", since = "1.27.0")]
2155pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2156    unsafe { cmppd(a, b, 4) }
2157}
2158
2159/// Compares corresponding elements in `a` and `b` for not-less-than.
2160///
2161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2162#[inline]
2163#[target_feature(enable = "sse2")]
2164#[cfg_attr(test, assert_instr(cmpnltpd))]
2165#[stable(feature = "simd_x86", since = "1.27.0")]
2166pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2167    unsafe { cmppd(a, b, 5) }
2168}
2169
2170/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2171///
2172/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2173#[inline]
2174#[target_feature(enable = "sse2")]
2175#[cfg_attr(test, assert_instr(cmpnlepd))]
2176#[stable(feature = "simd_x86", since = "1.27.0")]
2177pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2178    unsafe { cmppd(a, b, 6) }
2179}
2180
2181/// Compares corresponding elements in `a` and `b` for not-greater-than.
2182///
2183/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2184#[inline]
2185#[target_feature(enable = "sse2")]
2186#[cfg_attr(test, assert_instr(cmpnltpd))]
2187#[stable(feature = "simd_x86", since = "1.27.0")]
2188pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2189    _mm_cmpnlt_pd(b, a)
2190}
2191
2192/// Compares corresponding elements in `a` and `b` for
2193/// not-greater-than-or-equal.
2194///
2195/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2196#[inline]
2197#[target_feature(enable = "sse2")]
2198#[cfg_attr(test, assert_instr(cmpnlepd))]
2199#[stable(feature = "simd_x86", since = "1.27.0")]
2200pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2201    _mm_cmpnle_pd(b, a)
2202}
2203
2204/// Compares the lower element of `a` and `b` for equality.
2205///
2206/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2207#[inline]
2208#[target_feature(enable = "sse2")]
2209#[cfg_attr(test, assert_instr(comisd))]
2210#[stable(feature = "simd_x86", since = "1.27.0")]
2211pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2212    unsafe { comieqsd(a, b) }
2213}
2214
2215/// Compares the lower element of `a` and `b` for less-than.
2216///
2217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2218#[inline]
2219#[target_feature(enable = "sse2")]
2220#[cfg_attr(test, assert_instr(comisd))]
2221#[stable(feature = "simd_x86", since = "1.27.0")]
2222pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2223    unsafe { comiltsd(a, b) }
2224}
2225
2226/// Compares the lower element of `a` and `b` for less-than-or-equal.
2227///
2228/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2229#[inline]
2230#[target_feature(enable = "sse2")]
2231#[cfg_attr(test, assert_instr(comisd))]
2232#[stable(feature = "simd_x86", since = "1.27.0")]
2233pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2234    unsafe { comilesd(a, b) }
2235}
2236
2237/// Compares the lower element of `a` and `b` for greater-than.
2238///
2239/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2240#[inline]
2241#[target_feature(enable = "sse2")]
2242#[cfg_attr(test, assert_instr(comisd))]
2243#[stable(feature = "simd_x86", since = "1.27.0")]
2244pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2245    unsafe { comigtsd(a, b) }
2246}
2247
2248/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2249///
2250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2251#[inline]
2252#[target_feature(enable = "sse2")]
2253#[cfg_attr(test, assert_instr(comisd))]
2254#[stable(feature = "simd_x86", since = "1.27.0")]
2255pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2256    unsafe { comigesd(a, b) }
2257}
2258
2259/// Compares the lower element of `a` and `b` for not-equal.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2262#[inline]
2263#[target_feature(enable = "sse2")]
2264#[cfg_attr(test, assert_instr(comisd))]
2265#[stable(feature = "simd_x86", since = "1.27.0")]
2266pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2267    unsafe { comineqsd(a, b) }
2268}
2269
2270/// Compares the lower element of `a` and `b` for equality.
2271///
2272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2273#[inline]
2274#[target_feature(enable = "sse2")]
2275#[cfg_attr(test, assert_instr(ucomisd))]
2276#[stable(feature = "simd_x86", since = "1.27.0")]
2277pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2278    unsafe { ucomieqsd(a, b) }
2279}
2280
2281/// Compares the lower element of `a` and `b` for less-than.
2282///
2283/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2284#[inline]
2285#[target_feature(enable = "sse2")]
2286#[cfg_attr(test, assert_instr(ucomisd))]
2287#[stable(feature = "simd_x86", since = "1.27.0")]
2288pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2289    unsafe { ucomiltsd(a, b) }
2290}
2291
2292/// Compares the lower element of `a` and `b` for less-than-or-equal.
2293///
2294/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2295#[inline]
2296#[target_feature(enable = "sse2")]
2297#[cfg_attr(test, assert_instr(ucomisd))]
2298#[stable(feature = "simd_x86", since = "1.27.0")]
2299pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2300    unsafe { ucomilesd(a, b) }
2301}
2302
2303/// Compares the lower element of `a` and `b` for greater-than.
2304///
2305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2306#[inline]
2307#[target_feature(enable = "sse2")]
2308#[cfg_attr(test, assert_instr(ucomisd))]
2309#[stable(feature = "simd_x86", since = "1.27.0")]
2310pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2311    unsafe { ucomigtsd(a, b) }
2312}
2313
2314/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2315///
2316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2317#[inline]
2318#[target_feature(enable = "sse2")]
2319#[cfg_attr(test, assert_instr(ucomisd))]
2320#[stable(feature = "simd_x86", since = "1.27.0")]
2321pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2322    unsafe { ucomigesd(a, b) }
2323}
2324
2325/// Compares the lower element of `a` and `b` for not-equal.
2326///
2327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2328#[inline]
2329#[target_feature(enable = "sse2")]
2330#[cfg_attr(test, assert_instr(ucomisd))]
2331#[stable(feature = "simd_x86", since = "1.27.0")]
2332pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2333    unsafe { ucomineqsd(a, b) }
2334}
2335
2336/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2337/// packed single-precision (32-bit) floating-point elements.
2338///
2339/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2340#[inline]
2341#[target_feature(enable = "sse2")]
2342#[cfg_attr(test, assert_instr(cvtpd2ps))]
2343#[stable(feature = "simd_x86", since = "1.27.0")]
2344pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2345    unsafe {
2346        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2347        let zero = f32x2::ZERO;
2348        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2349    }
2350}
2351
2352/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2353/// packed double-precision (64-bit) floating-point elements.
2355///
2356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2357#[inline]
2358#[target_feature(enable = "sse2")]
2359#[cfg_attr(test, assert_instr(cvtps2pd))]
2360#[stable(feature = "simd_x86", since = "1.27.0")]
2361pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
2362    unsafe {
2363        let a = a.as_f32x4();
2364        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2365    }
2366}
2367
2368/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2369/// packed 32-bit integers.
2370///
2371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2372#[inline]
2373#[target_feature(enable = "sse2")]
2374#[cfg_attr(test, assert_instr(cvtpd2dq))]
2375#[stable(feature = "simd_x86", since = "1.27.0")]
2376pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2377    unsafe { transmute(cvtpd2dq(a)) }
2378}
2379
2380/// Converts the lower double-precision (64-bit) floating-point element in `a` to
2381/// a 32-bit integer.
2382///
2383/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
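///
/// # Examples
///
/// A short sketch (illustrative values; assumes an x86_64 target with the
/// default MXCSR rounding mode, round-to-nearest-even); compare
/// [`_mm_cvttsd_si32`], which truncates instead:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// assert_eq!(_mm_cvtsd_si32(_mm_set_sd(2.5)), 2); // ties round to even
/// assert_eq!(_mm_cvtsd_si32(_mm_set_sd(3.5)), 4);
/// assert_eq!(_mm_cvttsd_si32(_mm_set_sd(2.9)), 2); // truncation
/// # }
/// ```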
2384#[inline]
2385#[target_feature(enable = "sse2")]
2386#[cfg_attr(test, assert_instr(cvtsd2si))]
2387#[stable(feature = "simd_x86", since = "1.27.0")]
2388pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2389    unsafe { cvtsd2si(a) }
2390}
2391
2392/// Converts the lower double-precision (64-bit) floating-point element in `b`
2393/// to a single-precision (32-bit) floating-point element, stores the result in
2394/// the lower element of the return value, and copies the upper element from `a`
2395/// to the upper element of the return value.
2396///
2397/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2398#[inline]
2399#[target_feature(enable = "sse2")]
2400#[cfg_attr(test, assert_instr(cvtsd2ss))]
2401#[stable(feature = "simd_x86", since = "1.27.0")]
2402pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2403    unsafe { cvtsd2ss(a, b) }
2404}
2405
2406/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2407///
2408/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2409#[inline]
2410#[target_feature(enable = "sse2")]
2411#[stable(feature = "simd_x86", since = "1.27.0")]
2412pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2413    unsafe { simd_extract!(a, 0) }
2414}
2415
2416/// Converts the lower single-precision (32-bit) floating-point element in `b`
2417/// to a double-precision (64-bit) floating-point element, stores the result in
2418/// the lower element of the return value, and copies the upper element from `a`
2419/// to the upper element of the return value.
2420///
2421/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2422#[inline]
2423#[target_feature(enable = "sse2")]
2424#[cfg_attr(test, assert_instr(cvtss2sd))]
2425#[stable(feature = "simd_x86", since = "1.27.0")]
2426pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2427    unsafe {
2428        let elt: f32 = simd_extract!(b, 0);
2429        simd_insert!(a, 0, elt as f64)
2430    }
2431}
2432
2433/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2434/// packed 32-bit integers with truncation.
2435///
2436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2437#[inline]
2438#[target_feature(enable = "sse2")]
2439#[cfg_attr(test, assert_instr(cvttpd2dq))]
2440#[stable(feature = "simd_x86", since = "1.27.0")]
2441pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2442    unsafe { transmute(cvttpd2dq(a)) }
2443}
2444
2445/// Converts the lower double-precision (64-bit) floating-point element in `a`
2446/// to a 32-bit integer with truncation.
2447///
2448/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
2449#[inline]
2450#[target_feature(enable = "sse2")]
2451#[cfg_attr(test, assert_instr(cvttsd2si))]
2452#[stable(feature = "simd_x86", since = "1.27.0")]
2453pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2454    unsafe { cvttsd2si(a) }
2455}
2456
2457/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2458/// packed 32-bit integers with truncation.
2459///
2460/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2461#[inline]
2462#[target_feature(enable = "sse2")]
2463#[cfg_attr(test, assert_instr(cvttps2dq))]
2464#[stable(feature = "simd_x86", since = "1.27.0")]
2465pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2466    unsafe { transmute(cvttps2dq(a)) }
2467}
2468
2469/// Copies double-precision (64-bit) floating-point element `a` to the lower
2470/// element of the return value, and zeroes the upper element.
2471///
2472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2473#[inline]
2474#[target_feature(enable = "sse2")]
2475#[stable(feature = "simd_x86", since = "1.27.0")]
2476pub fn _mm_set_sd(a: f64) -> __m128d {
2477    _mm_set_pd(0.0, a)
2478}
2479
2480/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2481/// of the return value.
2482///
2483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2484#[inline]
2485#[target_feature(enable = "sse2")]
2486#[stable(feature = "simd_x86", since = "1.27.0")]
2487pub fn _mm_set1_pd(a: f64) -> __m128d {
2488    _mm_set_pd(a, a)
2489}
2490
2491/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2492/// of the return value.
2493///
2494/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2495#[inline]
2496#[target_feature(enable = "sse2")]
2497#[stable(feature = "simd_x86", since = "1.27.0")]
2498pub fn _mm_set_pd1(a: f64) -> __m128d {
2499    _mm_set_pd(a, a)
2500}
2501
2502/// Sets packed double-precision (64-bit) floating-point elements in the return
2503/// value with the supplied values.
2504///
2505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
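///
/// # Examples
///
/// A short sketch of the argument order (assumes an x86_64 target): arguments
/// are given from the high lane down to the low lane:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
/// let v = _mm_set_pd(2.0, 1.0);
/// assert_eq!(_mm_cvtsd_f64(v), 1.0); // low lane holds the second argument
/// # }
/// ```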
2506#[inline]
2507#[target_feature(enable = "sse2")]
2508#[stable(feature = "simd_x86", since = "1.27.0")]
2509pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2510    __m128d([b, a])
2511}
2512
2513/// Sets packed double-precision (64-bit) floating-point elements in the return
2514/// value with the supplied values in reverse order.
2515///
2516/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2517#[inline]
2518#[target_feature(enable = "sse2")]
2519#[stable(feature = "simd_x86", since = "1.27.0")]
2520pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2521    _mm_set_pd(b, a)
2522}
2523
2524/// Returns packed double-precision (64-bit) floating-point elements with all
2525/// zeros.
2526///
2527/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2528#[inline]
2529#[target_feature(enable = "sse2")]
2530#[cfg_attr(test, assert_instr(xorp))]
2531#[stable(feature = "simd_x86", since = "1.27.0")]
2532pub fn _mm_setzero_pd() -> __m128d {
2533    const { unsafe { mem::zeroed() } }
2534}
2535
2536/// Returns a mask of the most significant bit of each element in `a`.
2537///
2538/// The mask is stored in the 2 least significant bits of the return value.
2539/// All other bits are set to `0`.
2540///
2541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2542#[inline]
2543#[target_feature(enable = "sse2")]
2544#[cfg_attr(test, assert_instr(movmskpd))]
2545#[stable(feature = "simd_x86", since = "1.27.0")]
2546pub fn _mm_movemask_pd(a: __m128d) -> i32 {
2547    // Propagate the highest bit to the rest, because simd_bitmask
2548    // requires all-1 or all-0.
2549    unsafe {
2550        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2551        simd_bitmask::<i64x2, u8>(mask).into()
2552    }
2553}
2554
2555/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2556/// floating-point elements) from memory into the returned vector.
2557/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2558/// exception may be generated.
2559///
2560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
2561#[inline]
2562#[target_feature(enable = "sse2")]
2563#[cfg_attr(
2564    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2565    assert_instr(movaps)
2566)]
2567#[stable(feature = "simd_x86", since = "1.27.0")]
2568#[allow(clippy::cast_ptr_alignment)]
2569pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2570    *(mem_addr as *const __m128d)
2571}
2572
2573/// Loads a 64-bit double-precision value to the low element of a
2574/// 128-bit vector of `[2 x double]` and clears the upper element.
2575///
2576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2577#[inline]
2578#[target_feature(enable = "sse2")]
2579#[cfg_attr(test, assert_instr(movsd))]
2580#[stable(feature = "simd_x86", since = "1.27.0")]
2581pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2582    _mm_setr_pd(*mem_addr, 0.)
2583}
2584
2585/// Loads a double-precision value into the high-order bits of a 128-bit
2586/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2587/// bits of the first operand.
2588///
2589/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2590#[inline]
2591#[target_feature(enable = "sse2")]
2592#[cfg_attr(test, assert_instr(movhps))]
2593#[stable(feature = "simd_x86", since = "1.27.0")]
2594pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2595    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2596}
2597
2598/// Loads a double-precision value into the low-order bits of a 128-bit
2599/// vector of `[2 x double]`. The high-order bits are copied from the
2600/// high-order bits of the first operand.
2601///
2602/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2603#[inline]
2604#[target_feature(enable = "sse2")]
2605#[cfg_attr(test, assert_instr(movlps))]
2606#[stable(feature = "simd_x86", since = "1.27.0")]
2607pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2608    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2609}
2610
2611/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2612/// aligned memory location.
2613/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2614/// used again soon).
2615///
2616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2617///
2618/// # Safety of non-temporal stores
2619///
2620/// After using this intrinsic, but before any other access to the memory that this intrinsic
2621/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2622/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2623/// return.
2624///
2625/// See [`_mm_sfence`] for details.
2626#[inline]
2627#[target_feature(enable = "sse2")]
2628#[cfg_attr(test, assert_instr(movntpd))]
2629#[stable(feature = "simd_x86", since = "1.27.0")]
2630#[allow(clippy::cast_ptr_alignment)]
2631pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2632    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2633    crate::arch::asm!(
2634        vps!("movntpd", ",{a}"),
2635        p = in(reg) mem_addr,
2636        a = in(xmm_reg) a,
2637        options(nostack, preserves_flags),
2638    );
2639}
2640
2641/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2642/// memory location.
2643///
2644/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2645#[inline]
2646#[target_feature(enable = "sse2")]
2647#[cfg_attr(test, assert_instr(movlps))]
2648#[stable(feature = "simd_x86", since = "1.27.0")]
2649pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2650    *mem_addr = simd_extract!(a, 0)
2651}
2652
2653/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2654/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2655/// on a 16-byte boundary or a general-protection exception may be generated.
2656///
2657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2658#[inline]
2659#[target_feature(enable = "sse2")]
2660#[cfg_attr(
2661    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2662    assert_instr(movaps)
2663)]
2664#[stable(feature = "simd_x86", since = "1.27.0")]
2665#[allow(clippy::cast_ptr_alignment)]
2666pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2667    *(mem_addr as *mut __m128d) = a;
2668}
2669
2670/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2671/// floating-point elements) from `a` into memory.
2672/// `mem_addr` does not need to be aligned on any particular boundary.
2673///
2674/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2675#[inline]
2676#[target_feature(enable = "sse2")]
2677#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2678#[stable(feature = "simd_x86", since = "1.27.0")]
2679pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2680    mem_addr.cast::<__m128d>().write_unaligned(a);
2681}
2682
2683/// Stores the 16-bit integer from the first element of `a` into memory.
2684///
2685/// `mem_addr` does not need to be aligned on any particular boundary.
2686///
2687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2688#[inline]
2689#[target_feature(enable = "sse2")]
2690#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2691pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2692    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2693}
2694
2695/// Stores the 32-bit integer from the first element of `a` into memory.
2696///
2697/// `mem_addr` does not need to be aligned on any particular boundary.
2698///
2699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2700#[inline]
2701#[target_feature(enable = "sse2")]
2702#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2703pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2704    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2705}
2706
2707/// Stores the 64-bit integer from the first element of `a` into memory.
2708///
2709/// `mem_addr` does not need to be aligned on any particular boundary.
2710///
2711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2712#[inline]
2713#[target_feature(enable = "sse2")]
2714#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2715pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2716    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2717}
2718
2719/// Stores the lower double-precision (64-bit) floating-point element from `a`
2720/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2721/// 16-byte boundary or a general-protection exception may be generated.
2722///
2723/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2724#[inline]
2725#[target_feature(enable = "sse2")]
2726#[stable(feature = "simd_x86", since = "1.27.0")]
2727#[allow(clippy::cast_ptr_alignment)]
2728pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2729    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2730    *(mem_addr as *mut __m128d) = b;
2731}
2732
2733/// Stores the lower double-precision (64-bit) floating-point element from `a`
2734/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2735/// 16-byte boundary or a general-protection exception may be generated.
2736///
2737/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2738#[inline]
2739#[target_feature(enable = "sse2")]
2740#[stable(feature = "simd_x86", since = "1.27.0")]
2741#[allow(clippy::cast_ptr_alignment)]
2742pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2743    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2744    *(mem_addr as *mut __m128d) = b;
2745}
2746
2747/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2748/// memory in reverse order.
2749/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2750/// exception may be generated.
2751///
2752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
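///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
///
/// let mut mem = Aligned([0.0; 2]);
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     // Elements are stored high-to-low, i.e. reversed in memory.
///     _mm_storer_pd(mem.0.as_mut_ptr(), a);
/// }
/// assert_eq!(mem.0, [2.0, 1.0]);
/// ```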
2753#[inline]
2754#[target_feature(enable = "sse2")]
2755#[stable(feature = "simd_x86", since = "1.27.0")]
2756#[allow(clippy::cast_ptr_alignment)]
2757pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2758    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2759    *(mem_addr as *mut __m128d) = b;
2760}
2761
2762/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2763/// memory location.
2764///
2765/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
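///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// let mut hi = 0.0f64;
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     // Writes only the upper lane (2.0).
///     _mm_storeh_pd(&mut hi, a);
/// }
/// assert_eq!(hi, 2.0);
/// ```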
2766#[inline]
2767#[target_feature(enable = "sse2")]
2768#[cfg_attr(test, assert_instr(movhps))]
2769#[stable(feature = "simd_x86", since = "1.27.0")]
2770pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2771    *mem_addr = simd_extract!(a, 1);
2772}
2773
2774/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2775/// memory location.
2776///
2777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2778#[inline]
2779#[target_feature(enable = "sse2")]
2780#[cfg_attr(test, assert_instr(movlps))]
2781#[stable(feature = "simd_x86", since = "1.27.0")]
2782pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2783    *mem_addr = simd_extract!(a, 0);
2784}
2785
2786/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
2788///
2789/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
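///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// let d = 4.0f64;
/// let mut buf = [0.0f64; 2];
/// unsafe {
///     // Both lanes of `r` receive the value read from `&d`.
///     let r = _mm_load1_pd(&d);
///     _mm_storeu_pd(buf.as_mut_ptr(), r);
/// }
/// assert_eq!(buf, [4.0, 4.0]);
/// ```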
2790#[inline]
2791#[target_feature(enable = "sse2")]
2792// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2795    let d = *mem_addr;
2796    _mm_setr_pd(d, d)
2797}
2798
2799/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
2801///
2802/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2803#[inline]
2804#[target_feature(enable = "sse2")]
2805// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2806#[stable(feature = "simd_x86", since = "1.27.0")]
2807pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2808    _mm_load1_pd(mem_addr)
2809}
2810
2811/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2812/// the returned vector in reverse order. `mem_addr` must be aligned on a
2813/// 16-byte boundary or a general-protection exception may be generated.
2814///
2815/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
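///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
///
/// let mem = Aligned([1.0, 2.0]);
/// let mut buf = [0.0f64; 2];
/// unsafe {
///     // The two elements come back in reverse order.
///     let r = _mm_loadr_pd(mem.0.as_ptr());
///     _mm_storeu_pd(buf.as_mut_ptr(), r);
/// }
/// assert_eq!(buf, [2.0, 1.0]);
/// ```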
2816#[inline]
2817#[target_feature(enable = "sse2")]
2818#[cfg_attr(
2819    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2820    assert_instr(movaps)
2821)]
2822#[stable(feature = "simd_x86", since = "1.27.0")]
2823pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2824    let a = _mm_load_pd(mem_addr);
2825    simd_shuffle!(a, a, [1, 0])
2826}
2827
/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2829/// floating-point elements) from memory into the returned vector.
2830/// `mem_addr` does not need to be aligned on any particular boundary.
2831///
2832/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2833#[inline]
2834#[target_feature(enable = "sse2")]
2835#[cfg_attr(test, assert_instr(movups))]
2836#[stable(feature = "simd_x86", since = "1.27.0")]
2837pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2838    let mut dst = _mm_undefined_pd();
2839    ptr::copy_nonoverlapping(
2840        mem_addr as *const u8,
2841        ptr::addr_of_mut!(dst) as *mut u8,
2842        mem::size_of::<__m128d>(),
2843    );
2844    dst
2845}
2846
/// Loads unaligned 16 bits of integer data from memory into a new vector.
2848///
2849/// `mem_addr` does not need to be aligned on any particular boundary.
2850///
2851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2852#[inline]
2853#[target_feature(enable = "sse2")]
2854#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2855pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2856    transmute(i16x8::new(
2857        ptr::read_unaligned(mem_addr as *const i16),
2858        0,
2859        0,
2860        0,
2861        0,
2862        0,
2863        0,
2864        0,
2865    ))
2866}
2867
/// Loads unaligned 32 bits of integer data from memory into a new vector.
2869///
2870/// `mem_addr` does not need to be aligned on any particular boundary.
2871///
2872/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2873#[inline]
2874#[target_feature(enable = "sse2")]
2875#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2876pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2877    transmute(i32x4::new(
2878        ptr::read_unaligned(mem_addr as *const i32),
2879        0,
2880        0,
2881        0,
2882    ))
2883}
2884
/// Loads unaligned 64 bits of integer data from memory into a new vector.
2886///
2887/// `mem_addr` does not need to be aligned on any particular boundary.
2888///
2889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
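///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled `x86_64` build, since it reads the result back with
/// `_mm_cvtsi128_si64`):
///
/// ```ignore
/// let value: i64 = 0x0123_4567_89AB_CDEF;
/// let bytes = value.to_ne_bytes();
/// unsafe {
///     // The low 64 bits hold `value`; the high 64 bits are zeroed.
///     let r = _mm_loadu_si64(bytes.as_ptr());
///     assert_eq!(_mm_cvtsi128_si64(r), value);
/// }
/// ```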
2890#[inline]
2891#[target_feature(enable = "sse2")]
2892#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2893pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2894    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2895}
2896
2897/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2898/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2899/// parameter as a specifier.
2900///
2901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
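///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build). Bit 0 of `MASK` selects the lane taken from `a`
/// for the low half of the result, and bit 1 the lane taken from `b` for the
/// high half:
///
/// ```ignore
/// let mut buf = [0.0f64; 2];
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     let b = _mm_setr_pd(3.0, 4.0);
///     // MASK = 0b10: low half = a[0] = 1.0, high half = b[1] = 4.0.
///     let r = _mm_shuffle_pd::<0b10>(a, b);
///     _mm_storeu_pd(buf.as_mut_ptr(), r);
/// }
/// assert_eq!(buf, [1.0, 4.0]);
/// ```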
2902#[inline]
2903#[target_feature(enable = "sse2")]
2904#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2905#[rustc_legacy_const_generics(2)]
2906#[stable(feature = "simd_x86", since = "1.27.0")]
2907pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2908    static_assert_uimm_bits!(MASK, 8);
2909    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
2910}
2911
2912/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2913/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2914/// 64 bits are set to the upper 64 bits of the first parameter.
2915///
2916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
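///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// let mut buf = [0.0f64; 2];
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     let b = _mm_setr_pd(3.0, 4.0);
///     // Low lane from `b`, high lane from `a`.
///     let r = _mm_move_sd(a, b);
///     _mm_storeu_pd(buf.as_mut_ptr(), r);
/// }
/// assert_eq!(buf, [3.0, 2.0]);
/// ```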
2917#[inline]
2918#[target_feature(enable = "sse2")]
2919#[cfg_attr(test, assert_instr(movsd))]
2920#[stable(feature = "simd_x86", since = "1.27.0")]
2921pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2922    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
2923}
2924
2925/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2926/// floating-point vector of `[4 x float]`.
2927///
2928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2929#[inline]
2930#[target_feature(enable = "sse2")]
2931#[stable(feature = "simd_x86", since = "1.27.0")]
2932pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
2933    unsafe { transmute(a) }
2934}
2935
2936/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2937/// integer vector.
2938///
2939/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
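///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled `x86_64` build). The cast is a bit-for-bit
/// reinterpretation, not a numeric conversion:
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     let bits = _mm_castpd_si128(a);
///     // The low 64 bits are the IEEE 754 encoding of 1.0.
///     assert_eq!(_mm_cvtsi128_si64(bits), 1.0f64.to_bits() as i64);
/// }
/// ```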
2940#[inline]
2941#[target_feature(enable = "sse2")]
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
2944    unsafe { transmute(a) }
2945}
2946
2947/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2948/// floating-point vector of `[2 x double]`.
2949///
2950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2951#[inline]
2952#[target_feature(enable = "sse2")]
2953#[stable(feature = "simd_x86", since = "1.27.0")]
2954pub fn _mm_castps_pd(a: __m128) -> __m128d {
2955    unsafe { transmute(a) }
2956}
2957
2958/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2959/// integer vector.
2960///
2961/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2962#[inline]
2963#[target_feature(enable = "sse2")]
2964#[stable(feature = "simd_x86", since = "1.27.0")]
2965pub fn _mm_castps_si128(a: __m128) -> __m128i {
2966    unsafe { transmute(a) }
2967}
2968
2969/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2970/// of `[2 x double]`.
2971///
2972/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2973#[inline]
2974#[target_feature(enable = "sse2")]
2975#[stable(feature = "simd_x86", since = "1.27.0")]
2976pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2977    unsafe { transmute(a) }
2978}
2979
2980/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2981/// of `[4 x float]`.
2982///
2983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2984#[inline]
2985#[target_feature(enable = "sse2")]
2986#[stable(feature = "simd_x86", since = "1.27.0")]
2987pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2988    unsafe { transmute(a) }
2989}
2990
/// Returns a vector of type `__m128d` with indeterminate elements.
2992/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2993/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2994/// In practice, this is typically equivalent to [`mem::zeroed`].
2995///
2996/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
2997#[inline]
2998#[target_feature(enable = "sse2")]
2999#[stable(feature = "simd_x86", since = "1.27.0")]
3000pub fn _mm_undefined_pd() -> __m128d {
3001    const { unsafe { mem::zeroed() } }
3002}
3003
/// Returns a vector of type `__m128i` with indeterminate elements.
3005/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3006/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3007/// In practice, this is typically equivalent to [`mem::zeroed`].
3008///
3009/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
3010#[inline]
3011#[target_feature(enable = "sse2")]
3012#[stable(feature = "simd_x86", since = "1.27.0")]
3013pub fn _mm_undefined_si128() -> __m128i {
3014    const { unsafe { mem::zeroed() } }
3015}
3016
/// The resulting `__m128d` is composed of the high-order values of the two
/// `__m128d` inputs, interleaved, i.e.:
3019///
3020/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3021/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3022///
3023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
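///
/// A minimal usage sketch (illustrative only, not run as a doctest; assumes
/// an `sse2`-enabled build):
///
/// ```ignore
/// let mut buf = [0.0f64; 2];
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     let b = _mm_setr_pd(3.0, 4.0);
///     // Takes the high lane of each input: (a[1], b[1]).
///     let r = _mm_unpackhi_pd(a, b);
///     _mm_storeu_pd(buf.as_mut_ptr(), r);
/// }
/// assert_eq!(buf, [2.0, 4.0]);
/// ```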
3024#[inline]
3025#[target_feature(enable = "sse2")]
3026#[cfg_attr(test, assert_instr(unpckhpd))]
3027#[stable(feature = "simd_x86", since = "1.27.0")]
3028pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3029    unsafe { simd_shuffle!(a, b, [1, 3]) }
3030}
3031
/// The resulting `__m128d` is composed of the low-order values of the two
/// `__m128d` inputs, interleaved, i.e.:
3034///
3035/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3036/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3037///
3038/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3039#[inline]
3040#[target_feature(enable = "sse2")]
3041#[cfg_attr(test, assert_instr(movlhps))]
3042#[stable(feature = "simd_x86", since = "1.27.0")]
3043pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3044    unsafe { simd_shuffle!(a, b, [0, 2]) }
3045}
3046
3047#[allow(improper_ctypes)]
3048unsafe extern "C" {
3049    #[link_name = "llvm.x86.sse2.pause"]
3050    fn pause();
3051    #[link_name = "llvm.x86.sse2.clflush"]
3052    fn clflush(p: *const u8);
3053    #[link_name = "llvm.x86.sse2.lfence"]
3054    fn lfence();
3055    #[link_name = "llvm.x86.sse2.mfence"]
3056    fn mfence();
3057    #[link_name = "llvm.x86.sse2.psad.bw"]
3058    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3059    #[link_name = "llvm.x86.sse2.psll.w"]
3060    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3061    #[link_name = "llvm.x86.sse2.psll.d"]
3062    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3063    #[link_name = "llvm.x86.sse2.psll.q"]
3064    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3065    #[link_name = "llvm.x86.sse2.psra.w"]
3066    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3067    #[link_name = "llvm.x86.sse2.psra.d"]
3068    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3069    #[link_name = "llvm.x86.sse2.psrl.w"]
3070    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3071    #[link_name = "llvm.x86.sse2.psrl.d"]
3072    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3073    #[link_name = "llvm.x86.sse2.psrl.q"]
3074    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3075    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3076    fn cvtps2dq(a: __m128) -> i32x4;
3077    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3078    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3079    #[link_name = "llvm.x86.sse2.packsswb.128"]
3080    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3081    #[link_name = "llvm.x86.sse2.packssdw.128"]
3082    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3083    #[link_name = "llvm.x86.sse2.packuswb.128"]
3084    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3085    #[link_name = "llvm.x86.sse2.max.sd"]
3086    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3087    #[link_name = "llvm.x86.sse2.max.pd"]
3088    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3089    #[link_name = "llvm.x86.sse2.min.sd"]
3090    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3091    #[link_name = "llvm.x86.sse2.min.pd"]
3092    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3093    #[link_name = "llvm.x86.sse2.cmp.sd"]
3094    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3095    #[link_name = "llvm.x86.sse2.cmp.pd"]
3096    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3097    #[link_name = "llvm.x86.sse2.comieq.sd"]
3098    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3099    #[link_name = "llvm.x86.sse2.comilt.sd"]
3100    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3101    #[link_name = "llvm.x86.sse2.comile.sd"]
3102    fn comilesd(a: __m128d, b: __m128d) -> i32;
3103    #[link_name = "llvm.x86.sse2.comigt.sd"]
3104    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3105    #[link_name = "llvm.x86.sse2.comige.sd"]
3106    fn comigesd(a: __m128d, b: __m128d) -> i32;
3107    #[link_name = "llvm.x86.sse2.comineq.sd"]
3108    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3109    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3110    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3111    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3112    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3113    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3114    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3115    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3116    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3117    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3118    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3119    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3120    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3121    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3122    fn cvtpd2dq(a: __m128d) -> i32x4;
3123    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3124    fn cvtsd2si(a: __m128d) -> i32;
3125    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3126    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3127    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3128    fn cvttpd2dq(a: __m128d) -> i32x4;
3129    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3130    fn cvttsd2si(a: __m128d) -> i32;
3131    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3132    fn cvttps2dq(a: __m128) -> i32x4;
3133}
3134
3135#[cfg(test)]
3136mod tests {
3137    use crate::{
3138        core_arch::{simd::*, x86::*},
3139        hint::black_box,
3140    };
3141    use std::{
3142        boxed, f32, f64,
3143        mem::{self, transmute},
3144        ptr,
3145    };
3146    use stdarch_test::simd_test;
3147
3148    const NAN: f64 = f64::NAN;
3149
3150    #[test]
3151    fn test_mm_pause() {
3152        _mm_pause()
3153    }
3154
3155    #[simd_test(enable = "sse2")]
3156    unsafe fn test_mm_clflush() {
3157        let x = 0_u8;
3158        _mm_clflush(ptr::addr_of!(x));
3159    }
3160
3161    #[simd_test(enable = "sse2")]
3162    // Miri cannot support this until it is clear how it fits in the Rust memory model
3163    #[cfg_attr(miri, ignore)]
3164    unsafe fn test_mm_lfence() {
3165        _mm_lfence();
3166    }
3167
3168    #[simd_test(enable = "sse2")]
3169    // Miri cannot support this until it is clear how it fits in the Rust memory model
3170    #[cfg_attr(miri, ignore)]
3171    unsafe fn test_mm_mfence() {
3172        _mm_mfence();
3173    }
3174
3175    #[simd_test(enable = "sse2")]
3176    unsafe fn test_mm_add_epi8() {
3177        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3178        #[rustfmt::skip]
3179        let b = _mm_setr_epi8(
3180            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3181        );
3182        let r = _mm_add_epi8(a, b);
3183        #[rustfmt::skip]
3184        let e = _mm_setr_epi8(
3185            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3186        );
3187        assert_eq_m128i(r, e);
3188    }
3189
3190    #[simd_test(enable = "sse2")]
3191    unsafe fn test_mm_add_epi8_overflow() {
3192        let a = _mm_set1_epi8(0x7F);
3193        let b = _mm_set1_epi8(1);
3194        let r = _mm_add_epi8(a, b);
3195        assert_eq_m128i(r, _mm_set1_epi8(-128));
3196    }
3197
3198    #[simd_test(enable = "sse2")]
3199    unsafe fn test_mm_add_epi16() {
3200        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3201        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3202        let r = _mm_add_epi16(a, b);
3203        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3204        assert_eq_m128i(r, e);
3205    }
3206
3207    #[simd_test(enable = "sse2")]
3208    unsafe fn test_mm_add_epi32() {
3209        let a = _mm_setr_epi32(0, 1, 2, 3);
3210        let b = _mm_setr_epi32(4, 5, 6, 7);
3211        let r = _mm_add_epi32(a, b);
3212        let e = _mm_setr_epi32(4, 6, 8, 10);
3213        assert_eq_m128i(r, e);
3214    }
3215
3216    #[simd_test(enable = "sse2")]
3217    unsafe fn test_mm_add_epi64() {
3218        let a = _mm_setr_epi64x(0, 1);
3219        let b = _mm_setr_epi64x(2, 3);
3220        let r = _mm_add_epi64(a, b);
3221        let e = _mm_setr_epi64x(2, 4);
3222        assert_eq_m128i(r, e);
3223    }
3224
3225    #[simd_test(enable = "sse2")]
3226    unsafe fn test_mm_adds_epi8() {
3227        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3228        #[rustfmt::skip]
3229        let b = _mm_setr_epi8(
3230            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3231        );
3232        let r = _mm_adds_epi8(a, b);
3233        #[rustfmt::skip]
3234        let e = _mm_setr_epi8(
3235            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3236        );
3237        assert_eq_m128i(r, e);
3238    }
3239
3240    #[simd_test(enable = "sse2")]
3241    unsafe fn test_mm_adds_epi8_saturate_positive() {
3242        let a = _mm_set1_epi8(0x7F);
3243        let b = _mm_set1_epi8(1);
3244        let r = _mm_adds_epi8(a, b);
3245        assert_eq_m128i(r, a);
3246    }
3247
3248    #[simd_test(enable = "sse2")]
3249    unsafe fn test_mm_adds_epi8_saturate_negative() {
3250        let a = _mm_set1_epi8(-0x80);
3251        let b = _mm_set1_epi8(-1);
3252        let r = _mm_adds_epi8(a, b);
3253        assert_eq_m128i(r, a);
3254    }
3255
3256    #[simd_test(enable = "sse2")]
3257    unsafe fn test_mm_adds_epi16() {
3258        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3259        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3260        let r = _mm_adds_epi16(a, b);
3261        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3262        assert_eq_m128i(r, e);
3263    }
3264
3265    #[simd_test(enable = "sse2")]
3266    unsafe fn test_mm_adds_epi16_saturate_positive() {
3267        let a = _mm_set1_epi16(0x7FFF);
3268        let b = _mm_set1_epi16(1);
3269        let r = _mm_adds_epi16(a, b);
3270        assert_eq_m128i(r, a);
3271    }
3272
3273    #[simd_test(enable = "sse2")]
3274    unsafe fn test_mm_adds_epi16_saturate_negative() {
3275        let a = _mm_set1_epi16(-0x8000);
3276        let b = _mm_set1_epi16(-1);
3277        let r = _mm_adds_epi16(a, b);
3278        assert_eq_m128i(r, a);
3279    }
3280
3281    #[simd_test(enable = "sse2")]
3282    unsafe fn test_mm_adds_epu8() {
3283        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3284        #[rustfmt::skip]
3285        let b = _mm_setr_epi8(
3286            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3287        );
3288        let r = _mm_adds_epu8(a, b);
3289        #[rustfmt::skip]
3290        let e = _mm_setr_epi8(
3291            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3292        );
3293        assert_eq_m128i(r, e);
3294    }
3295
3296    #[simd_test(enable = "sse2")]
3297    unsafe fn test_mm_adds_epu8_saturate() {
3298        let a = _mm_set1_epi8(!0);
3299        let b = _mm_set1_epi8(1);
3300        let r = _mm_adds_epu8(a, b);
3301        assert_eq_m128i(r, a);
3302    }
3303
3304    #[simd_test(enable = "sse2")]
3305    unsafe fn test_mm_adds_epu16() {
3306        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3307        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3308        let r = _mm_adds_epu16(a, b);
3309        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3310        assert_eq_m128i(r, e);
3311    }
3312
3313    #[simd_test(enable = "sse2")]
3314    unsafe fn test_mm_adds_epu16_saturate() {
3315        let a = _mm_set1_epi16(!0);
3316        let b = _mm_set1_epi16(1);
3317        let r = _mm_adds_epu16(a, b);
3318        assert_eq_m128i(r, a);
3319    }
3320
3321    #[simd_test(enable = "sse2")]
3322    unsafe fn test_mm_avg_epu8() {
3323        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3324        let r = _mm_avg_epu8(a, b);
3325        assert_eq_m128i(r, _mm_set1_epi8(6));
3326    }
3327
3328    #[simd_test(enable = "sse2")]
3329    unsafe fn test_mm_avg_epu16() {
3330        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3331        let r = _mm_avg_epu16(a, b);
3332        assert_eq_m128i(r, _mm_set1_epi16(6));
3333    }
3334
3335    #[simd_test(enable = "sse2")]
3336    unsafe fn test_mm_madd_epi16() {
3337        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3338        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3339        let r = _mm_madd_epi16(a, b);
3340        let e = _mm_setr_epi32(29, 81, 149, 233);
3341        assert_eq_m128i(r, e);
3342
3343        // Test large values.
3344        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3345        let a = _mm_setr_epi16(
3346            i16::MAX,
3347            i16::MAX,
3348            i16::MIN,
3349            i16::MIN,
3350            i16::MIN,
3351            i16::MAX,
3352            0,
3353            0,
3354        );
3355        let b = _mm_setr_epi16(
3356            i16::MAX,
3357            i16::MAX,
3358            i16::MIN,
3359            i16::MIN,
3360            i16::MAX,
3361            i16::MIN,
3362            0,
3363            0,
3364        );
3365        let r = _mm_madd_epi16(a, b);
3366        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3367        assert_eq_m128i(r, e);
3368    }
3369
3370    #[simd_test(enable = "sse2")]
3371    unsafe fn test_mm_max_epi16() {
3372        let a = _mm_set1_epi16(1);
3373        let b = _mm_set1_epi16(-1);
3374        let r = _mm_max_epi16(a, b);
3375        assert_eq_m128i(r, a);
3376    }
3377
3378    #[simd_test(enable = "sse2")]
3379    unsafe fn test_mm_max_epu8() {
3380        let a = _mm_set1_epi8(1);
3381        let b = _mm_set1_epi8(!0);
3382        let r = _mm_max_epu8(a, b);
3383        assert_eq_m128i(r, b);
3384    }
3385
3386    #[simd_test(enable = "sse2")]
3387    unsafe fn test_mm_min_epi16() {
3388        let a = _mm_set1_epi16(1);
3389        let b = _mm_set1_epi16(-1);
3390        let r = _mm_min_epi16(a, b);
3391        assert_eq_m128i(r, b);
3392    }
3393
3394    #[simd_test(enable = "sse2")]
3395    unsafe fn test_mm_min_epu8() {
3396        let a = _mm_set1_epi8(1);
3397        let b = _mm_set1_epi8(!0);
3398        let r = _mm_min_epu8(a, b);
3399        assert_eq_m128i(r, a);
3400    }
3401
3402    #[simd_test(enable = "sse2")]
3403    unsafe fn test_mm_mulhi_epi16() {
3404        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3405        let r = _mm_mulhi_epi16(a, b);
3406        assert_eq_m128i(r, _mm_set1_epi16(-16));
3407    }
3408
3409    #[simd_test(enable = "sse2")]
3410    unsafe fn test_mm_mulhi_epu16() {
3411        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3412        let r = _mm_mulhi_epu16(a, b);
3413        assert_eq_m128i(r, _mm_set1_epi16(15));
3414    }
3415
3416    #[simd_test(enable = "sse2")]
3417    unsafe fn test_mm_mullo_epi16() {
3418        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3419        let r = _mm_mullo_epi16(a, b);
3420        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3421    }
3422
3423    #[simd_test(enable = "sse2")]
3424    unsafe fn test_mm_mul_epu32() {
3425        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3426        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3427        let r = _mm_mul_epu32(a, b);
3428        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3429        assert_eq_m128i(r, e);
3430    }
3431
3432    #[simd_test(enable = "sse2")]
3433    unsafe fn test_mm_sad_epu8() {
3434        #[rustfmt::skip]
3435        let a = _mm_setr_epi8(
3436            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3437            1, 2, 3, 4,
3438            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3439            1, 2, 3, 4,
3440        );
3441        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3442        let r = _mm_sad_epu8(a, b);
3443        let e = _mm_setr_epi64x(1020, 614);
3444        assert_eq_m128i(r, e);
3445    }
3446
3447    #[simd_test(enable = "sse2")]
3448    unsafe fn test_mm_sub_epi8() {
3449        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3450        let r = _mm_sub_epi8(a, b);
3451        assert_eq_m128i(r, _mm_set1_epi8(-1));
3452    }
3453
3454    #[simd_test(enable = "sse2")]
3455    unsafe fn test_mm_sub_epi16() {
3456        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3457        let r = _mm_sub_epi16(a, b);
3458        assert_eq_m128i(r, _mm_set1_epi16(-1));
3459    }
3460
3461    #[simd_test(enable = "sse2")]
3462    unsafe fn test_mm_sub_epi32() {
3463        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3464        let r = _mm_sub_epi32(a, b);
3465        assert_eq_m128i(r, _mm_set1_epi32(-1));
3466    }
3467
3468    #[simd_test(enable = "sse2")]
3469    unsafe fn test_mm_sub_epi64() {
3470        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3471        let r = _mm_sub_epi64(a, b);
3472        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3473    }
3474
3475    #[simd_test(enable = "sse2")]
3476    unsafe fn test_mm_subs_epi8() {
3477        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3478        let r = _mm_subs_epi8(a, b);
3479        assert_eq_m128i(r, _mm_set1_epi8(3));
3480    }
3481
3482    #[simd_test(enable = "sse2")]
3483    unsafe fn test_mm_subs_epi8_saturate_positive() {
3484        let a = _mm_set1_epi8(0x7F);
3485        let b = _mm_set1_epi8(-1);
3486        let r = _mm_subs_epi8(a, b);
3487        assert_eq_m128i(r, a);
3488    }
3489
3490    #[simd_test(enable = "sse2")]
3491    unsafe fn test_mm_subs_epi8_saturate_negative() {
3492        let a = _mm_set1_epi8(-0x80);
3493        let b = _mm_set1_epi8(1);
3494        let r = _mm_subs_epi8(a, b);
3495        assert_eq_m128i(r, a);
3496    }
3497
3498    #[simd_test(enable = "sse2")]
3499    unsafe fn test_mm_subs_epi16() {
3500        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3501        let r = _mm_subs_epi16(a, b);
3502        assert_eq_m128i(r, _mm_set1_epi16(3));
3503    }
3504
3505    #[simd_test(enable = "sse2")]
3506    unsafe fn test_mm_subs_epi16_saturate_positive() {
3507        let a = _mm_set1_epi16(0x7FFF);
3508        let b = _mm_set1_epi16(-1);
3509        let r = _mm_subs_epi16(a, b);
3510        assert_eq_m128i(r, a);
3511    }
3512
3513    #[simd_test(enable = "sse2")]
3514    unsafe fn test_mm_subs_epi16_saturate_negative() {
3515        let a = _mm_set1_epi16(-0x8000);
3516        let b = _mm_set1_epi16(1);
3517        let r = _mm_subs_epi16(a, b);
3518        assert_eq_m128i(r, a);
3519    }
3520
3521    #[simd_test(enable = "sse2")]
3522    unsafe fn test_mm_subs_epu8() {
3523        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3524        let r = _mm_subs_epu8(a, b);
3525        assert_eq_m128i(r, _mm_set1_epi8(3));
3526    }
3527
3528    #[simd_test(enable = "sse2")]
3529    unsafe fn test_mm_subs_epu8_saturate() {
3530        let a = _mm_set1_epi8(0);
3531        let b = _mm_set1_epi8(1);
3532        let r = _mm_subs_epu8(a, b);
3533        assert_eq_m128i(r, a);
3534    }
3535
3536    #[simd_test(enable = "sse2")]
3537    unsafe fn test_mm_subs_epu16() {
3538        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3539        let r = _mm_subs_epu16(a, b);
3540        assert_eq_m128i(r, _mm_set1_epi16(3));
3541    }
3542
3543    #[simd_test(enable = "sse2")]
3544    unsafe fn test_mm_subs_epu16_saturate() {
3545        let a = _mm_set1_epi16(0);
3546        let b = _mm_set1_epi16(1);
3547        let r = _mm_subs_epu16(a, b);
3548        assert_eq_m128i(r, a);
3549    }
3550
3551    #[simd_test(enable = "sse2")]
3552    unsafe fn test_mm_slli_si128() {
3553        #[rustfmt::skip]
3554        let a = _mm_setr_epi8(
3555            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3556        );
3557        let r = _mm_slli_si128::<1>(a);
3558        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3559        assert_eq_m128i(r, e);
3560
3561        #[rustfmt::skip]
3562        let a = _mm_setr_epi8(
3563            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3564        );
3565        let r = _mm_slli_si128::<15>(a);
3566        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3567        assert_eq_m128i(r, e);
3568
3569        #[rustfmt::skip]
3570        let a = _mm_setr_epi8(
3571            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3572        );
3573        let r = _mm_slli_si128::<16>(a);
3574        assert_eq_m128i(r, _mm_set1_epi8(0));
3575    }
3576
3577    #[simd_test(enable = "sse2")]
3578    unsafe fn test_mm_slli_epi16() {
3579        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3580        let r = _mm_slli_epi16::<4>(a);
3581        assert_eq_m128i(
3582            r,
3583            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3584        );
3585        let r = _mm_slli_epi16::<16>(a);
3586        assert_eq_m128i(r, _mm_set1_epi16(0));
3587    }
3588
3589    #[simd_test(enable = "sse2")]
3590    unsafe fn test_mm_sll_epi16() {
3591        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3592        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3593        assert_eq_m128i(
3594            r,
3595            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3596        );
3597        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3598        assert_eq_m128i(r, a);
3599        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3600        assert_eq_m128i(r, _mm_set1_epi16(0));
3601        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3602        assert_eq_m128i(r, _mm_set1_epi16(0));
3603    }
3604
3605    #[simd_test(enable = "sse2")]
3606    unsafe fn test_mm_slli_epi32() {
3607        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3608        let r = _mm_slli_epi32::<4>(a);
3609        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3610        let r = _mm_slli_epi32::<32>(a);
3611        assert_eq_m128i(r, _mm_set1_epi32(0));
3612    }
3613
3614    #[simd_test(enable = "sse2")]
3615    unsafe fn test_mm_sll_epi32() {
3616        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3617        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3618        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3619        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3620        assert_eq_m128i(r, a);
3621        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3622        assert_eq_m128i(r, _mm_set1_epi32(0));
3623        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3624        assert_eq_m128i(r, _mm_set1_epi32(0));
3625    }
3626
3627    #[simd_test(enable = "sse2")]
3628    unsafe fn test_mm_slli_epi64() {
3629        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3630        let r = _mm_slli_epi64::<4>(a);
3631        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3632        let r = _mm_slli_epi64::<64>(a);
3633        assert_eq_m128i(r, _mm_set1_epi64x(0));
3634    }
3635
3636    #[simd_test(enable = "sse2")]
3637    unsafe fn test_mm_sll_epi64() {
3638        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3639        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3640        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3641        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3642        assert_eq_m128i(r, a);
3643        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3644        assert_eq_m128i(r, _mm_set1_epi64x(0));
3645        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3646        assert_eq_m128i(r, _mm_set1_epi64x(0));
3647    }
3648
3649    #[simd_test(enable = "sse2")]
3650    unsafe fn test_mm_srai_epi16() {
3651        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3652        let r = _mm_srai_epi16::<4>(a);
3653        assert_eq_m128i(
3654            r,
3655            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3656        );
3657        let r = _mm_srai_epi16::<16>(a);
3658        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3659    }
3660
3661    #[simd_test(enable = "sse2")]
3662    unsafe fn test_mm_sra_epi16() {
3663        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3664        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3665        assert_eq_m128i(
3666            r,
3667            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3668        );
3669        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3670        assert_eq_m128i(r, a);
3671        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3672        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3673        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3674        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3675    }
3676
3677    #[simd_test(enable = "sse2")]
3678    unsafe fn test_mm_srai_epi32() {
3679        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3680        let r = _mm_srai_epi32::<4>(a);
3681        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3682        let r = _mm_srai_epi32::<32>(a);
3683        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3684    }
3685
3686    #[simd_test(enable = "sse2")]
3687    unsafe fn test_mm_sra_epi32() {
3688        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3689        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3690        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3691        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3692        assert_eq_m128i(r, a);
3693        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3694        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3695        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3696        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3697    }
3698
3699    #[simd_test(enable = "sse2")]
3700    unsafe fn test_mm_srli_si128() {
3701        #[rustfmt::skip]
3702        let a = _mm_setr_epi8(
3703            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3704        );
3705        let r = _mm_srli_si128::<1>(a);
3706        #[rustfmt::skip]
3707        let e = _mm_setr_epi8(
3708            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3709        );
3710        assert_eq_m128i(r, e);
3711
3712        #[rustfmt::skip]
3713        let a = _mm_setr_epi8(
3714            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3715        );
3716        let r = _mm_srli_si128::<15>(a);
3717        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3718        assert_eq_m128i(r, e);
3719
3720        #[rustfmt::skip]
3721        let a = _mm_setr_epi8(
3722            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3723        );
3724        let r = _mm_srli_si128::<16>(a);
3725        assert_eq_m128i(r, _mm_set1_epi8(0));
3726    }
3727
3728    #[simd_test(enable = "sse2")]
3729    unsafe fn test_mm_srli_epi16() {
3730        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3731        let r = _mm_srli_epi16::<4>(a);
3732        assert_eq_m128i(
3733            r,
3734            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3735        );
3736        let r = _mm_srli_epi16::<16>(a);
3737        assert_eq_m128i(r, _mm_set1_epi16(0));
3738    }
3739
3740    #[simd_test(enable = "sse2")]
3741    unsafe fn test_mm_srl_epi16() {
3742        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3743        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3744        assert_eq_m128i(
3745            r,
3746            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3747        );
3748        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3749        assert_eq_m128i(r, a);
3750        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3751        assert_eq_m128i(r, _mm_set1_epi16(0));
3752        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3753        assert_eq_m128i(r, _mm_set1_epi16(0));
3754    }
3755
3756    #[simd_test(enable = "sse2")]
3757    unsafe fn test_mm_srli_epi32() {
3758        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3759        let r = _mm_srli_epi32::<4>(a);
3760        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3761        let r = _mm_srli_epi32::<32>(a);
3762        assert_eq_m128i(r, _mm_set1_epi32(0));
3763    }
3764
3765    #[simd_test(enable = "sse2")]
3766    unsafe fn test_mm_srl_epi32() {
3767        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3768        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3769        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3770        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3771        assert_eq_m128i(r, a);
3772        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3773        assert_eq_m128i(r, _mm_set1_epi32(0));
3774        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3775        assert_eq_m128i(r, _mm_set1_epi32(0));
3776    }
3777
3778    #[simd_test(enable = "sse2")]
3779    unsafe fn test_mm_srli_epi64() {
3780        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3781        let r = _mm_srli_epi64::<4>(a);
3782        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3783        let r = _mm_srli_epi64::<64>(a);
3784        assert_eq_m128i(r, _mm_set1_epi64x(0));
3785    }
3786
3787    #[simd_test(enable = "sse2")]
3788    unsafe fn test_mm_srl_epi64() {
3789        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3790        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3791        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3792        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3793        assert_eq_m128i(r, a);
3794        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3795        assert_eq_m128i(r, _mm_set1_epi64x(0));
3796        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3797        assert_eq_m128i(r, _mm_set1_epi64x(0));
3798    }
3799
3800    #[simd_test(enable = "sse2")]
3801    unsafe fn test_mm_and_si128() {
3802        let a = _mm_set1_epi8(5);
3803        let b = _mm_set1_epi8(3);
3804        let r = _mm_and_si128(a, b);
3805        assert_eq_m128i(r, _mm_set1_epi8(1));
3806    }
3807
3808    #[simd_test(enable = "sse2")]
3809    unsafe fn test_mm_andnot_si128() {
3810        let a = _mm_set1_epi8(5);
3811        let b = _mm_set1_epi8(3);
3812        let r = _mm_andnot_si128(a, b);
3813        assert_eq_m128i(r, _mm_set1_epi8(2));
3814    }
3815
3816    #[simd_test(enable = "sse2")]
3817    unsafe fn test_mm_or_si128() {
3818        let a = _mm_set1_epi8(5);
3819        let b = _mm_set1_epi8(3);
3820        let r = _mm_or_si128(a, b);
3821        assert_eq_m128i(r, _mm_set1_epi8(7));
3822    }
3823
3824    #[simd_test(enable = "sse2")]
3825    unsafe fn test_mm_xor_si128() {
3826        let a = _mm_set1_epi8(5);
3827        let b = _mm_set1_epi8(3);
3828        let r = _mm_xor_si128(a, b);
3829        assert_eq_m128i(r, _mm_set1_epi8(6));
3830    }
3831
3832    #[simd_test(enable = "sse2")]
3833    unsafe fn test_mm_cmpeq_epi8() {
3834        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3835        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3836        let r = _mm_cmpeq_epi8(a, b);
3837        #[rustfmt::skip]
3838        assert_eq_m128i(
3839            r,
3840            _mm_setr_epi8(
3841                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3842            )
3843        );
3844    }
3845
3846    #[simd_test(enable = "sse2")]
3847    unsafe fn test_mm_cmpeq_epi16() {
3848        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3849        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3850        let r = _mm_cmpeq_epi16(a, b);
3851        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3852    }
3853
3854    #[simd_test(enable = "sse2")]
3855    unsafe fn test_mm_cmpeq_epi32() {
3856        let a = _mm_setr_epi32(0, 1, 2, 3);
3857        let b = _mm_setr_epi32(3, 2, 2, 0);
3858        let r = _mm_cmpeq_epi32(a, b);
3859        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3860    }
3861
3862    #[simd_test(enable = "sse2")]
3863    unsafe fn test_mm_cmpgt_epi8() {
3864        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3865        let b = _mm_set1_epi8(0);
3866        let r = _mm_cmpgt_epi8(a, b);
3867        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3868        assert_eq_m128i(r, e);
3869    }
3870
3871    #[simd_test(enable = "sse2")]
3872    unsafe fn test_mm_cmpgt_epi16() {
3873        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3874        let b = _mm_set1_epi16(0);
3875        let r = _mm_cmpgt_epi16(a, b);
3876        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3877        assert_eq_m128i(r, e);
3878    }
3879
3880    #[simd_test(enable = "sse2")]
3881    unsafe fn test_mm_cmpgt_epi32() {
3882        let a = _mm_set_epi32(5, 0, 0, 0);
3883        let b = _mm_set1_epi32(0);
3884        let r = _mm_cmpgt_epi32(a, b);
3885        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3886    }
3887
3888    #[simd_test(enable = "sse2")]
3889    unsafe fn test_mm_cmplt_epi8() {
3890        let a = _mm_set1_epi8(0);
3891        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3892        let r = _mm_cmplt_epi8(a, b);
3893        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3894        assert_eq_m128i(r, e);
3895    }
3896
3897    #[simd_test(enable = "sse2")]
3898    unsafe fn test_mm_cmplt_epi16() {
3899        let a = _mm_set1_epi16(0);
3900        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3901        let r = _mm_cmplt_epi16(a, b);
3902        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3903        assert_eq_m128i(r, e);
3904    }
3905
3906    #[simd_test(enable = "sse2")]
3907    unsafe fn test_mm_cmplt_epi32() {
3908        let a = _mm_set1_epi32(0);
3909        let b = _mm_set_epi32(5, 0, 0, 0);
3910        let r = _mm_cmplt_epi32(a, b);
3911        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3912    }
3913
3914    #[simd_test(enable = "sse2")]
3915    unsafe fn test_mm_cvtepi32_pd() {
3916        let a = _mm_set_epi32(35, 25, 15, 5);
3917        let r = _mm_cvtepi32_pd(a);
3918        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3919    }
3920
3921    #[simd_test(enable = "sse2")]
3922    unsafe fn test_mm_cvtsi32_sd() {
3923        let a = _mm_set1_pd(3.5);
3924        let r = _mm_cvtsi32_sd(a, 5);
3925        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3926    }
3927
3928    #[simd_test(enable = "sse2")]
3929    unsafe fn test_mm_cvtepi32_ps() {
3930        let a = _mm_setr_epi32(1, 2, 3, 4);
3931        let r = _mm_cvtepi32_ps(a);
3932        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3933    }
3934
3935    #[simd_test(enable = "sse2")]
3936    unsafe fn test_mm_cvtps_epi32() {
3937        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3938        let r = _mm_cvtps_epi32(a);
3939        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3940    }
3941
3942    #[simd_test(enable = "sse2")]
3943    unsafe fn test_mm_cvtsi32_si128() {
3944        let r = _mm_cvtsi32_si128(5);
3945        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3946    }
3947
3948    #[simd_test(enable = "sse2")]
3949    unsafe fn test_mm_cvtsi128_si32() {
3950        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3951        assert_eq!(r, 5);
3952    }
3953
3954    #[simd_test(enable = "sse2")]
3955    unsafe fn test_mm_set_epi64x() {
3956        let r = _mm_set_epi64x(0, 1);
3957        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3958    }
3959
3960    #[simd_test(enable = "sse2")]
3961    unsafe fn test_mm_set_epi32() {
3962        let r = _mm_set_epi32(0, 1, 2, 3);
3963        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3964    }
3965
3966    #[simd_test(enable = "sse2")]
3967    unsafe fn test_mm_set_epi16() {
3968        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3969        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3970    }
3971
3972    #[simd_test(enable = "sse2")]
3973    unsafe fn test_mm_set_epi8() {
3974        #[rustfmt::skip]
3975        let r = _mm_set_epi8(
3976            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3977        );
3978        #[rustfmt::skip]
3979        let e = _mm_setr_epi8(
3980            15, 14, 13, 12, 11, 10, 9, 8,
3981            7, 6, 5, 4, 3, 2, 1, 0,
3982        );
3983        assert_eq_m128i(r, e);
3984    }
3985
3986    #[simd_test(enable = "sse2")]
3987    unsafe fn test_mm_set1_epi64x() {
3988        let r = _mm_set1_epi64x(1);
3989        assert_eq_m128i(r, _mm_set1_epi64x(1));
3990    }
3991
3992    #[simd_test(enable = "sse2")]
3993    unsafe fn test_mm_set1_epi32() {
3994        let r = _mm_set1_epi32(1);
3995        assert_eq_m128i(r, _mm_set1_epi32(1));
3996    }
3997
3998    #[simd_test(enable = "sse2")]
3999    unsafe fn test_mm_set1_epi16() {
4000        let r = _mm_set1_epi16(1);
4001        assert_eq_m128i(r, _mm_set1_epi16(1));
4002    }
4003
4004    #[simd_test(enable = "sse2")]
4005    unsafe fn test_mm_set1_epi8() {
4006        let r = _mm_set1_epi8(1);
4007        assert_eq_m128i(r, _mm_set1_epi8(1));
4008    }
4009
4010    #[simd_test(enable = "sse2")]
4011    unsafe fn test_mm_setr_epi32() {
4012        let r = _mm_setr_epi32(0, 1, 2, 3);
4013        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4014    }
4015
4016    #[simd_test(enable = "sse2")]
4017    unsafe fn test_mm_setr_epi16() {
4018        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4019        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4020    }
4021
4022    #[simd_test(enable = "sse2")]
4023    unsafe fn test_mm_setr_epi8() {
4024        #[rustfmt::skip]
4025        let r = _mm_setr_epi8(
4026            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4027        );
4028        #[rustfmt::skip]
4029        let e = _mm_setr_epi8(
4030            0, 1, 2, 3, 4, 5, 6, 7,
4031            8, 9, 10, 11, 12, 13, 14, 15,
4032        );
4033        assert_eq_m128i(r, e);
4034    }
4035
4036    #[simd_test(enable = "sse2")]
4037    unsafe fn test_mm_setzero_si128() {
4038        let r = _mm_setzero_si128();
4039        assert_eq_m128i(r, _mm_set1_epi64x(0));
4040    }
4041
4042    #[simd_test(enable = "sse2")]
4043    unsafe fn test_mm_loadl_epi64() {
4044        let a = _mm_setr_epi64x(6, 5);
4045        let r = _mm_loadl_epi64(ptr::addr_of!(a));
4046        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4047    }
4048
4049    #[simd_test(enable = "sse2")]
4050    unsafe fn test_mm_load_si128() {
4051        let a = _mm_set_epi64x(5, 6);
4052        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
4053        assert_eq_m128i(a, r);
4054    }
4055
4056    #[simd_test(enable = "sse2")]
4057    unsafe fn test_mm_loadu_si128() {
4058        let a = _mm_set_epi64x(5, 6);
4059        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
4060        assert_eq_m128i(a, r);
4061    }
4062
4063    #[simd_test(enable = "sse2")]
4064    // Miri cannot support this until it is clear how it fits in the Rust memory model
4065    // (non-temporal store)
4066    #[cfg_attr(miri, ignore)]
4067    unsafe fn test_mm_maskmoveu_si128() {
4068        let a = _mm_set1_epi8(9);
4069        #[rustfmt::skip]
4070        let mask = _mm_set_epi8(
4071            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4072            0, 0, 0, 0, 0, 0, 0, 0,
4073        );
4074        let mut r = _mm_set1_epi8(0);
4075        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4076        _mm_sfence();
4077        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4078        assert_eq_m128i(r, e);
4079    }
4080
4081    #[simd_test(enable = "sse2")]
4082    unsafe fn test_mm_store_si128() {
4083        let a = _mm_set1_epi8(9);
4084        let mut r = _mm_set1_epi8(0);
4085        _mm_store_si128(&mut r, a);
4086        assert_eq_m128i(r, a);
4087    }
4088
4089    #[simd_test(enable = "sse2")]
4090    unsafe fn test_mm_storeu_si128() {
4091        let a = _mm_set1_epi8(9);
4092        let mut r = _mm_set1_epi8(0);
4093        _mm_storeu_si128(&mut r, a);
4094        assert_eq_m128i(r, a);
4095    }
4096
4097    #[simd_test(enable = "sse2")]
4098    unsafe fn test_mm_storel_epi64() {
4099        let a = _mm_setr_epi64x(2, 9);
4100        let mut r = _mm_set1_epi8(0);
4101        _mm_storel_epi64(&mut r, a);
4102        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4103    }
4104
4105    #[simd_test(enable = "sse2")]
4106    // Miri cannot support this until it is clear how it fits in the Rust memory model
4107    // (non-temporal store)
4108    #[cfg_attr(miri, ignore)]
4109    unsafe fn test_mm_stream_si128() {
4110        let a = _mm_setr_epi32(1, 2, 3, 4);
4111        let mut r = _mm_undefined_si128();
4112        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4113        _mm_sfence();
4114        assert_eq_m128i(r, a);
4115    }
4116
4117    #[simd_test(enable = "sse2")]
4118    // Miri cannot support this until it is clear how it fits in the Rust memory model
4119    // (non-temporal store)
4120    #[cfg_attr(miri, ignore)]
4121    unsafe fn test_mm_stream_si32() {
4122        let a: i32 = 7;
4123        let mut mem = boxed::Box::<i32>::new(-1);
4124        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4125        _mm_sfence();
4126        assert_eq!(a, *mem);
4127    }
4128
4129    #[simd_test(enable = "sse2")]
4130    unsafe fn test_mm_move_epi64() {
4131        let a = _mm_setr_epi64x(5, 6);
4132        let r = _mm_move_epi64(a);
4133        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4134    }
4135
4136    #[simd_test(enable = "sse2")]
4137    unsafe fn test_mm_packs_epi16() {
4138        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4139        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4140        let r = _mm_packs_epi16(a, b);
4141        #[rustfmt::skip]
4142        assert_eq_m128i(
4143            r,
4144            _mm_setr_epi8(
4145                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4146            )
4147        );
4148    }
4149
4150    #[simd_test(enable = "sse2")]
4151    unsafe fn test_mm_packs_epi32() {
4152        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4153        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4154        let r = _mm_packs_epi32(a, b);
4155        assert_eq_m128i(
4156            r,
4157            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4158        );
4159    }
4160
4161    #[simd_test(enable = "sse2")]
4162    unsafe fn test_mm_packus_epi16() {
4163        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4164        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4165        let r = _mm_packus_epi16(a, b);
4166        assert_eq_m128i(
4167            r,
4168            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4169        );
4170    }
4171
4172    #[simd_test(enable = "sse2")]
4173    unsafe fn test_mm_extract_epi16() {
4174        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4175        let r1 = _mm_extract_epi16::<0>(a);
4176        let r2 = _mm_extract_epi16::<3>(a);
4177        assert_eq!(r1, 0xFFFF);
4178        assert_eq!(r2, 3);
4179    }
4180
4181    #[simd_test(enable = "sse2")]
4182    unsafe fn test_mm_insert_epi16() {
4183        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4184        let r = _mm_insert_epi16::<0>(a, 9);
4185        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4186        assert_eq_m128i(r, e);
4187    }
4188
4189    #[simd_test(enable = "sse2")]
4190    unsafe fn test_mm_movemask_epi8() {
4191        #[rustfmt::skip]
4192        let a = _mm_setr_epi8(
4193            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4194            0b0101, 0b1111_0000u8 as i8, 0, 0,
4195            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4196            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4197        );
4198        let r = _mm_movemask_epi8(a);
4199        assert_eq!(r, 0b10100110_00100101);
4200    }
4201
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_epi32() {
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

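    // The unpack tests interleave lanes from the high (or low) half of the
    // two operands, e.g. `unpackhi_epi8` alternates bytes 8..16 of `a` with
    // bytes 8..16 of `b`.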
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

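    // The scalar `_sd` arithmetic tests below share one pattern: the
    // operation is applied to lane 0 only, while lane 1 of `a` passes
    // through unchanged.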
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE2-specific semantics for -0.0 handling: when the two
        // operands compare equal (as +0.0 and -0.0 do), `maxpd` returns its
        // second operand, so the results must be bit-identical to `b` and `a`.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE2-specific semantics for -0.0 handling: when the two
        // operands compare equal (as +0.0 and -0.0 do), `minpd` returns its
        // second operand, so the results must be bit-identical to `b` and `a`.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_and_pd(a, b);
        let e = transmute(u64x2::splat(1));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_andnot_pd(a, b);
        let e = transmute(u64x2::splat(2));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_or_pd(a, b);
        let e = transmute(u64x2::splat(7));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_xor_pd(a, b);
        let e = transmute(u64x2::splat(6));
        assert_eq_m128d(r, e);
    }

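    // The scalar `cmp*_sd` intrinsics write an all-ones (true) or all-zeros
    // (false) mask into lane 0 and copy lane 1 of `a` through, hence the
    // expected values pair `!0` or `0` with `2.0f64.to_bits()`.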
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

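    // `comisd`/`ucomisd` compare lane 0 and return an i32 flag instead of a
    // mask; the NaN cases check that an unordered comparison reports
    // "not equal", i.e. returns 0.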
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

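    // 16-byte-aligned backing store for the aligned load/store tests below;
    // `_mm_load_pd` and `_mm_store_pd` require a 16-byte-aligned pointer.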
    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_pd(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
    }

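    // The `storeu_si16`/`si32`/`si64` tests check that only the low 16, 32,
    // or 64 bits are written and the rest of the destination is untouched.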
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let mut d = vals.as_ptr();

        // make sure d is not aligned to 16-byte boundary
        let mut offset = 0;
        if (d as usize) & 0xf == 0 {
            offset = 1;
            d = d.add(offset);
        }

        let r = _mm_loadu_pd(d);
        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

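    // `cvtpd2ps` narrows the two f64 lanes into the low two f32 lanes and
    // zeroes the upper two; values outside the f32 range overflow to an
    // infinity of the matching sign.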
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

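    // Float-to-i32 conversions that overflow, or that start from NaN,
    // produce the "integer indefinite" value 0x8000_0000, i.e. i32::MIN.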
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

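    // The `cvtt*` variants truncate toward zero rather than using the
    // current MXCSR rounding mode; out-of-range and NaN inputs still
    // produce i32::MIN.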
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

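    // For `shuffle_pd`, bit 0 of the immediate selects the lane taken from
    // `a` and bit 1 the lane taken from `b`; an immediate of 0 picks the
    // low lane of each.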
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

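    // The `cast*` intrinsics only reinterpret the vector's bit pattern;
    // they compile to no instructions and perform no value conversion.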
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
}