core/stdarch/crates/core_arch/src/x86/
sse2.rs

1//! Streaming SIMD Extensions 2 (SSE2)
2
3#[cfg(test)]
4use stdarch_test::assert_instr;
5
6use crate::{
7    core_arch::{simd::*, x86::*},
8    intrinsics::simd::*,
9    intrinsics::sqrtf64,
10    mem, ptr,
11};
12
13/// Provides a hint to the processor that the code sequence is a spin-wait loop.
14///
15/// This can help improve the performance and power consumption of spin-wait
16/// loops.
17///
18/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
19#[inline]
20#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
21#[stable(feature = "simd_x86", since = "1.27.0")]
22pub unsafe fn _mm_pause() {
23    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
24    // the SSE2 target-feature - therefore it does not require any target features
25    pause()
26}
27
28/// Invalidates and flushes the cache line that contains `p` from all levels of
29/// the cache hierarchy.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
32#[inline]
33#[target_feature(enable = "sse2")]
34#[cfg_attr(test, assert_instr(clflush))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub unsafe fn _mm_clflush(p: *const u8) {
37    clflush(p)
38}
39
40/// Performs a serializing operation on all load-from-memory instructions
41/// that were issued prior to this instruction.
42///
43/// Guarantees that every load instruction that precedes, in program order, is
44/// globally visible before any load instruction which follows the fence in
45/// program order.
46///
47/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
48#[inline]
49#[target_feature(enable = "sse2")]
50#[cfg_attr(test, assert_instr(lfence))]
51#[stable(feature = "simd_x86", since = "1.27.0")]
52pub unsafe fn _mm_lfence() {
53    lfence()
54}
55
56/// Performs a serializing operation on all load-from-memory and store-to-memory
57/// instructions that were issued prior to this instruction.
58///
59/// Guarantees that every memory access that precedes, in program order, the
60/// memory fence instruction is globally visible before any memory instruction
61/// which follows the fence in program order.
62///
63/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
64#[inline]
65#[target_feature(enable = "sse2")]
66#[cfg_attr(test, assert_instr(mfence))]
67#[stable(feature = "simd_x86", since = "1.27.0")]
68pub unsafe fn _mm_mfence() {
69    mfence()
70}
71
72/// Adds packed 8-bit integers in `a` and `b`.
73///
74/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
75#[inline]
76#[target_feature(enable = "sse2")]
77#[cfg_attr(test, assert_instr(paddb))]
78#[stable(feature = "simd_x86", since = "1.27.0")]
79pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
80    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
81}
82
83/// Adds packed 16-bit integers in `a` and `b`.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
86#[inline]
87#[target_feature(enable = "sse2")]
88#[cfg_attr(test, assert_instr(paddw))]
89#[stable(feature = "simd_x86", since = "1.27.0")]
90pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
91    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
92}
93
94/// Adds packed 32-bit integers in `a` and `b`.
95///
96/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
97#[inline]
98#[target_feature(enable = "sse2")]
99#[cfg_attr(test, assert_instr(paddd))]
100#[stable(feature = "simd_x86", since = "1.27.0")]
101pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
102    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
103}
104
105/// Adds packed 64-bit integers in `a` and `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
108#[inline]
109#[target_feature(enable = "sse2")]
110#[cfg_attr(test, assert_instr(paddq))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
113    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
114}
115
116/// Adds packed 8-bit integers in `a` and `b` using saturation.
117///
118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
119#[inline]
120#[target_feature(enable = "sse2")]
121#[cfg_attr(test, assert_instr(paddsb))]
122#[stable(feature = "simd_x86", since = "1.27.0")]
123pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
124    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
125}
126
127/// Adds packed 16-bit integers in `a` and `b` using saturation.
128///
129/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
130#[inline]
131#[target_feature(enable = "sse2")]
132#[cfg_attr(test, assert_instr(paddsw))]
133#[stable(feature = "simd_x86", since = "1.27.0")]
134pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
135    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
136}
137
138/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
139///
140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
141#[inline]
142#[target_feature(enable = "sse2")]
143#[cfg_attr(test, assert_instr(paddusb))]
144#[stable(feature = "simd_x86", since = "1.27.0")]
145pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
146    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
147}
148
149/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
150///
151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
152#[inline]
153#[target_feature(enable = "sse2")]
154#[cfg_attr(test, assert_instr(paddusw))]
155#[stable(feature = "simd_x86", since = "1.27.0")]
156pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
157    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
158}
159
160/// Averages packed unsigned 8-bit integers in `a` and `b`.
161///
162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
163#[inline]
164#[target_feature(enable = "sse2")]
165#[cfg_attr(test, assert_instr(pavgb))]
166#[stable(feature = "simd_x86", since = "1.27.0")]
167pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
168    unsafe {
169        let a = simd_cast::<_, u16x16>(a.as_u8x16());
170        let b = simd_cast::<_, u16x16>(b.as_u8x16());
171        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
172        transmute(simd_cast::<_, u8x16>(r))
173    }
174}
175
176/// Averages packed unsigned 16-bit integers in `a` and `b`.
177///
178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
179#[inline]
180#[target_feature(enable = "sse2")]
181#[cfg_attr(test, assert_instr(pavgw))]
182#[stable(feature = "simd_x86", since = "1.27.0")]
183pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
184    unsafe {
185        let a = simd_cast::<_, u32x8>(a.as_u16x8());
186        let b = simd_cast::<_, u32x8>(b.as_u16x8());
187        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
188        transmute(simd_cast::<_, u16x8>(r))
189    }
190}
191
192/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
193///
194/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
195/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
196/// intermediate 32-bit integers.
197///
198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
199#[inline]
200#[target_feature(enable = "sse2")]
201#[cfg_attr(test, assert_instr(pmaddwd))]
202#[stable(feature = "simd_x86", since = "1.27.0")]
203pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
204    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
205}
206
207/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
208/// maximum values.
209///
210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
211#[inline]
212#[target_feature(enable = "sse2")]
213#[cfg_attr(test, assert_instr(pmaxsw))]
214#[stable(feature = "simd_x86", since = "1.27.0")]
215pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
216    unsafe {
217        let a = a.as_i16x8();
218        let b = b.as_i16x8();
219        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
220    }
221}
222
223/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
224/// packed maximum values.
225///
226/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
227#[inline]
228#[target_feature(enable = "sse2")]
229#[cfg_attr(test, assert_instr(pmaxub))]
230#[stable(feature = "simd_x86", since = "1.27.0")]
231pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
232    unsafe {
233        let a = a.as_u8x16();
234        let b = b.as_u8x16();
235        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
236    }
237}
238
239/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
240/// minimum values.
241///
242/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
243#[inline]
244#[target_feature(enable = "sse2")]
245#[cfg_attr(test, assert_instr(pminsw))]
246#[stable(feature = "simd_x86", since = "1.27.0")]
247pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
248    unsafe {
249        let a = a.as_i16x8();
250        let b = b.as_i16x8();
251        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
252    }
253}
254
255/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
256/// packed minimum values.
257///
258/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
259#[inline]
260#[target_feature(enable = "sse2")]
261#[cfg_attr(test, assert_instr(pminub))]
262#[stable(feature = "simd_x86", since = "1.27.0")]
263pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
264    unsafe {
265        let a = a.as_u8x16();
266        let b = b.as_u8x16();
267        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
268    }
269}
270
271/// Multiplies the packed 16-bit integers in `a` and `b`.
272///
273/// The multiplication produces intermediate 32-bit integers, and returns the
274/// high 16 bits of the intermediate integers.
275///
276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
277#[inline]
278#[target_feature(enable = "sse2")]
279#[cfg_attr(test, assert_instr(pmulhw))]
280#[stable(feature = "simd_x86", since = "1.27.0")]
281pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
282    unsafe {
283        let a = simd_cast::<_, i32x8>(a.as_i16x8());
284        let b = simd_cast::<_, i32x8>(b.as_i16x8());
285        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
286        transmute(simd_cast::<i32x8, i16x8>(r))
287    }
288}
289
290/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
291///
292/// The multiplication produces intermediate 32-bit integers, and returns the
293/// high 16 bits of the intermediate integers.
294///
295/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
296#[inline]
297#[target_feature(enable = "sse2")]
298#[cfg_attr(test, assert_instr(pmulhuw))]
299#[stable(feature = "simd_x86", since = "1.27.0")]
300pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
301    unsafe {
302        let a = simd_cast::<_, u32x8>(a.as_u16x8());
303        let b = simd_cast::<_, u32x8>(b.as_u16x8());
304        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
305        transmute(simd_cast::<u32x8, u16x8>(r))
306    }
307}
308
309/// Multiplies the packed 16-bit integers in `a` and `b`.
310///
311/// The multiplication produces intermediate 32-bit integers, and returns the
312/// low 16 bits of the intermediate integers.
313///
314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
315#[inline]
316#[target_feature(enable = "sse2")]
317#[cfg_attr(test, assert_instr(pmullw))]
318#[stable(feature = "simd_x86", since = "1.27.0")]
319pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
320    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
321}
322
323/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
324/// in `a` and `b`.
325///
326/// Returns the unsigned 64-bit results.
327///
328/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
329#[inline]
330#[target_feature(enable = "sse2")]
331#[cfg_attr(test, assert_instr(pmuludq))]
332#[stable(feature = "simd_x86", since = "1.27.0")]
333pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
334    unsafe {
335        let a = a.as_u64x2();
336        let b = b.as_u64x2();
337        let mask = u64x2::splat(u32::MAX.into());
338        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
339    }
340}
341
342/// Sum the absolute differences of packed unsigned 8-bit integers.
343///
344/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
345/// and `b`, then horizontally sum each consecutive 8 differences to produce
346/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
347/// the low 16 bits of 64-bit elements returned.
348///
349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
350#[inline]
351#[target_feature(enable = "sse2")]
352#[cfg_attr(test, assert_instr(psadbw))]
353#[stable(feature = "simd_x86", since = "1.27.0")]
354pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
355    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
356}
357
358/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
359///
360/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
361#[inline]
362#[target_feature(enable = "sse2")]
363#[cfg_attr(test, assert_instr(psubb))]
364#[stable(feature = "simd_x86", since = "1.27.0")]
365pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
366    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
367}
368
369/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
370///
371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
372#[inline]
373#[target_feature(enable = "sse2")]
374#[cfg_attr(test, assert_instr(psubw))]
375#[stable(feature = "simd_x86", since = "1.27.0")]
376pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
377    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
378}
379
380/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
381///
382/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
383#[inline]
384#[target_feature(enable = "sse2")]
385#[cfg_attr(test, assert_instr(psubd))]
386#[stable(feature = "simd_x86", since = "1.27.0")]
387pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
388    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
389}
390
391/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
392///
393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
394#[inline]
395#[target_feature(enable = "sse2")]
396#[cfg_attr(test, assert_instr(psubq))]
397#[stable(feature = "simd_x86", since = "1.27.0")]
398pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
399    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
400}
401
402/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
403/// using saturation.
404///
405/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
406#[inline]
407#[target_feature(enable = "sse2")]
408#[cfg_attr(test, assert_instr(psubsb))]
409#[stable(feature = "simd_x86", since = "1.27.0")]
410pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
411    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
412}
413
414/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
415/// using saturation.
416///
417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
418#[inline]
419#[target_feature(enable = "sse2")]
420#[cfg_attr(test, assert_instr(psubsw))]
421#[stable(feature = "simd_x86", since = "1.27.0")]
422pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
423    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
424}
425
426/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
427/// integers in `a` using saturation.
428///
429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
430#[inline]
431#[target_feature(enable = "sse2")]
432#[cfg_attr(test, assert_instr(psubusb))]
433#[stable(feature = "simd_x86", since = "1.27.0")]
434pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
435    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
436}
437
438/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
439/// integers in `a` using saturation.
440///
441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
442#[inline]
443#[target_feature(enable = "sse2")]
444#[cfg_attr(test, assert_instr(psubusw))]
445#[stable(feature = "simd_x86", since = "1.27.0")]
446pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
447    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
448}
449
450/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
451///
452/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
453#[inline]
454#[target_feature(enable = "sse2")]
455#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
456#[rustc_legacy_const_generics(1)]
457#[stable(feature = "simd_x86", since = "1.27.0")]
458pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
459    static_assert_uimm_bits!(IMM8, 8);
460    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
461}
462
463/// Implementation detail: converts the immediate argument of the
464/// `_mm_slli_si128` intrinsic into a compile-time constant.
465#[inline]
466#[target_feature(enable = "sse2")]
467unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
468    const fn mask(shift: i32, i: u32) -> u32 {
469        let shift = shift as u32 & 0xff;
470        if shift > 15 { i } else { 16 - shift + i }
471    }
472    transmute::<i8x16, _>(simd_shuffle!(
473        i8x16::ZERO,
474        a.as_i8x16(),
475        [
476            mask(IMM8, 0),
477            mask(IMM8, 1),
478            mask(IMM8, 2),
479            mask(IMM8, 3),
480            mask(IMM8, 4),
481            mask(IMM8, 5),
482            mask(IMM8, 6),
483            mask(IMM8, 7),
484            mask(IMM8, 8),
485            mask(IMM8, 9),
486            mask(IMM8, 10),
487            mask(IMM8, 11),
488            mask(IMM8, 12),
489            mask(IMM8, 13),
490            mask(IMM8, 14),
491            mask(IMM8, 15),
492        ],
493    ))
494}
495
496/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
497///
498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
499#[inline]
500#[target_feature(enable = "sse2")]
501#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
502#[rustc_legacy_const_generics(1)]
503#[stable(feature = "simd_x86", since = "1.27.0")]
504pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
505    unsafe {
506        static_assert_uimm_bits!(IMM8, 8);
507        _mm_slli_si128_impl::<IMM8>(a)
508    }
509}
510
511/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
512///
513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
514#[inline]
515#[target_feature(enable = "sse2")]
516#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
517#[rustc_legacy_const_generics(1)]
518#[stable(feature = "simd_x86", since = "1.27.0")]
519pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
520    unsafe {
521        static_assert_uimm_bits!(IMM8, 8);
522        _mm_srli_si128_impl::<IMM8>(a)
523    }
524}
525
526/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
527///
528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
529#[inline]
530#[target_feature(enable = "sse2")]
531#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
532#[rustc_legacy_const_generics(1)]
533#[stable(feature = "simd_x86", since = "1.27.0")]
534pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
535    static_assert_uimm_bits!(IMM8, 8);
536    unsafe {
537        if IMM8 >= 16 {
538            _mm_setzero_si128()
539        } else {
540            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
541        }
542    }
543}
544
545/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
546/// zeros.
547///
548/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
549#[inline]
550#[target_feature(enable = "sse2")]
551#[cfg_attr(test, assert_instr(psllw))]
552#[stable(feature = "simd_x86", since = "1.27.0")]
553pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
554    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
555}
556
557/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
558///
559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
560#[inline]
561#[target_feature(enable = "sse2")]
562#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
563#[rustc_legacy_const_generics(1)]
564#[stable(feature = "simd_x86", since = "1.27.0")]
565pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
566    static_assert_uimm_bits!(IMM8, 8);
567    unsafe {
568        if IMM8 >= 32 {
569            _mm_setzero_si128()
570        } else {
571            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
572        }
573    }
574}
575
576/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
577/// zeros.
578///
579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
580#[inline]
581#[target_feature(enable = "sse2")]
582#[cfg_attr(test, assert_instr(pslld))]
583#[stable(feature = "simd_x86", since = "1.27.0")]
584pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
585    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
586}
587
588/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
589///
590/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
591#[inline]
592#[target_feature(enable = "sse2")]
593#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
594#[rustc_legacy_const_generics(1)]
595#[stable(feature = "simd_x86", since = "1.27.0")]
596pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
597    static_assert_uimm_bits!(IMM8, 8);
598    unsafe {
599        if IMM8 >= 64 {
600            _mm_setzero_si128()
601        } else {
602            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
603        }
604    }
605}
606
607/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
608/// zeros.
609///
610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
611#[inline]
612#[target_feature(enable = "sse2")]
613#[cfg_attr(test, assert_instr(psllq))]
614#[stable(feature = "simd_x86", since = "1.27.0")]
615pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
616    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
617}
618
619/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
620/// bits.
621///
622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
623#[inline]
624#[target_feature(enable = "sse2")]
625#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
626#[rustc_legacy_const_generics(1)]
627#[stable(feature = "simd_x86", since = "1.27.0")]
628pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
629    static_assert_uimm_bits!(IMM8, 8);
630    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
631}
632
633/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
634/// bits.
635///
636/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
637#[inline]
638#[target_feature(enable = "sse2")]
639#[cfg_attr(test, assert_instr(psraw))]
640#[stable(feature = "simd_x86", since = "1.27.0")]
641pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
642    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
643}
644
645/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
646/// bits.
647///
648/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
649#[inline]
650#[target_feature(enable = "sse2")]
651#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
652#[rustc_legacy_const_generics(1)]
653#[stable(feature = "simd_x86", since = "1.27.0")]
654pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
655    static_assert_uimm_bits!(IMM8, 8);
656    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
657}
658
659/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
660/// bits.
661///
662/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
663#[inline]
664#[target_feature(enable = "sse2")]
665#[cfg_attr(test, assert_instr(psrad))]
666#[stable(feature = "simd_x86", since = "1.27.0")]
667pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
668    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
669}
670
671/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
672///
673/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
674#[inline]
675#[target_feature(enable = "sse2")]
676#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
677#[rustc_legacy_const_generics(1)]
678#[stable(feature = "simd_x86", since = "1.27.0")]
679pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
680    static_assert_uimm_bits!(IMM8, 8);
681    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
682}
683
684/// Implementation detail: converts the immediate argument of the
685/// `_mm_srli_si128` intrinsic into a compile-time constant.
686#[inline]
687#[target_feature(enable = "sse2")]
688unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
689    const fn mask(shift: i32, i: u32) -> u32 {
690        if (shift as u32) > 15 {
691            i + 16
692        } else {
693            i + (shift as u32)
694        }
695    }
696    let x: i8x16 = simd_shuffle!(
697        a.as_i8x16(),
698        i8x16::ZERO,
699        [
700            mask(IMM8, 0),
701            mask(IMM8, 1),
702            mask(IMM8, 2),
703            mask(IMM8, 3),
704            mask(IMM8, 4),
705            mask(IMM8, 5),
706            mask(IMM8, 6),
707            mask(IMM8, 7),
708            mask(IMM8, 8),
709            mask(IMM8, 9),
710            mask(IMM8, 10),
711            mask(IMM8, 11),
712            mask(IMM8, 12),
713            mask(IMM8, 13),
714            mask(IMM8, 14),
715            mask(IMM8, 15),
716        ],
717    );
718    transmute(x)
719}
720
721/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
722/// zeros.
723///
724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
725#[inline]
726#[target_feature(enable = "sse2")]
727#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
728#[rustc_legacy_const_generics(1)]
729#[stable(feature = "simd_x86", since = "1.27.0")]
730pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
731    static_assert_uimm_bits!(IMM8, 8);
732    unsafe {
733        if IMM8 >= 16 {
734            _mm_setzero_si128()
735        } else {
736            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
737        }
738    }
739}
740
741/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
742/// zeros.
743///
744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
745#[inline]
746#[target_feature(enable = "sse2")]
747#[cfg_attr(test, assert_instr(psrlw))]
748#[stable(feature = "simd_x86", since = "1.27.0")]
749pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
750    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
751}
752
753/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
754/// zeros.
755///
756/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
757#[inline]
758#[target_feature(enable = "sse2")]
759#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
760#[rustc_legacy_const_generics(1)]
761#[stable(feature = "simd_x86", since = "1.27.0")]
762pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
763    static_assert_uimm_bits!(IMM8, 8);
764    unsafe {
765        if IMM8 >= 32 {
766            _mm_setzero_si128()
767        } else {
768            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
769        }
770    }
771}
772
773/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
774/// zeros.
775///
776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
777#[inline]
778#[target_feature(enable = "sse2")]
779#[cfg_attr(test, assert_instr(psrld))]
780#[stable(feature = "simd_x86", since = "1.27.0")]
781pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
782    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
783}
784
785/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
786/// zeros.
787///
788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
789#[inline]
790#[target_feature(enable = "sse2")]
791#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
792#[rustc_legacy_const_generics(1)]
793#[stable(feature = "simd_x86", since = "1.27.0")]
794pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
795    static_assert_uimm_bits!(IMM8, 8);
796    unsafe {
797        if IMM8 >= 64 {
798            _mm_setzero_si128()
799        } else {
800            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
801        }
802    }
803}
804
805/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
806/// zeros.
807///
808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
809#[inline]
810#[target_feature(enable = "sse2")]
811#[cfg_attr(test, assert_instr(psrlq))]
812#[stable(feature = "simd_x86", since = "1.27.0")]
813pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
814    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
815}
816
817/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
818/// `b`.
819///
820/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
821#[inline]
822#[target_feature(enable = "sse2")]
823#[cfg_attr(test, assert_instr(andps))]
824#[stable(feature = "simd_x86", since = "1.27.0")]
825pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
826    unsafe { simd_and(a, b) }
827}
828
829/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
830/// then AND with `b`.
831///
832/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
833#[inline]
834#[target_feature(enable = "sse2")]
835#[cfg_attr(test, assert_instr(andnps))]
836#[stable(feature = "simd_x86", since = "1.27.0")]
837pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
838    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
839}
840
841/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
842/// `b`.
843///
844/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
845#[inline]
846#[target_feature(enable = "sse2")]
847#[cfg_attr(test, assert_instr(orps))]
848#[stable(feature = "simd_x86", since = "1.27.0")]
849pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
850    unsafe { simd_or(a, b) }
851}
852
853/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
854/// `b`.
855///
856/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
857#[inline]
858#[target_feature(enable = "sse2")]
859#[cfg_attr(test, assert_instr(xorps))]
860#[stable(feature = "simd_x86", since = "1.27.0")]
861pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
862    unsafe { simd_xor(a, b) }
863}
864
865/// Compares packed 8-bit integers in `a` and `b` for equality.
866///
867/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
868#[inline]
869#[target_feature(enable = "sse2")]
870#[cfg_attr(test, assert_instr(pcmpeqb))]
871#[stable(feature = "simd_x86", since = "1.27.0")]
872pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
873    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
874}
875
876/// Compares packed 16-bit integers in `a` and `b` for equality.
877///
878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
879#[inline]
880#[target_feature(enable = "sse2")]
881#[cfg_attr(test, assert_instr(pcmpeqw))]
882#[stable(feature = "simd_x86", since = "1.27.0")]
883pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
884    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
885}
886
887/// Compares packed 32-bit integers in `a` and `b` for equality.
888///
889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
890#[inline]
891#[target_feature(enable = "sse2")]
892#[cfg_attr(test, assert_instr(pcmpeqd))]
893#[stable(feature = "simd_x86", since = "1.27.0")]
894pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
895    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
896}
897
898/// Compares packed 8-bit integers in `a` and `b` for greater-than.
899///
900/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
901#[inline]
902#[target_feature(enable = "sse2")]
903#[cfg_attr(test, assert_instr(pcmpgtb))]
904#[stable(feature = "simd_x86", since = "1.27.0")]
905pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
906    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
907}
908
909/// Compares packed 16-bit integers in `a` and `b` for greater-than.
910///
911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
912#[inline]
913#[target_feature(enable = "sse2")]
914#[cfg_attr(test, assert_instr(pcmpgtw))]
915#[stable(feature = "simd_x86", since = "1.27.0")]
916pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
917    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
918}
919
920/// Compares packed 32-bit integers in `a` and `b` for greater-than.
921///
922/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
923#[inline]
924#[target_feature(enable = "sse2")]
925#[cfg_attr(test, assert_instr(pcmpgtd))]
926#[stable(feature = "simd_x86", since = "1.27.0")]
927pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
928    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
929}
930
931/// Compares packed 8-bit integers in `a` and `b` for less-than.
932///
933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
934#[inline]
935#[target_feature(enable = "sse2")]
936#[cfg_attr(test, assert_instr(pcmpgtb))]
937#[stable(feature = "simd_x86", since = "1.27.0")]
938pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
939    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
940}
941
942/// Compares packed 16-bit integers in `a` and `b` for less-than.
943///
944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
945#[inline]
946#[target_feature(enable = "sse2")]
947#[cfg_attr(test, assert_instr(pcmpgtw))]
948#[stable(feature = "simd_x86", since = "1.27.0")]
949pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
950    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
951}
952
953/// Compares packed 32-bit integers in `a` and `b` for less-than.
954///
955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
956#[inline]
957#[target_feature(enable = "sse2")]
958#[cfg_attr(test, assert_instr(pcmpgtd))]
959#[stable(feature = "simd_x86", since = "1.27.0")]
960pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
961    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
962}
963
964/// Converts the lower two packed 32-bit integers in `a` to packed
965/// double-precision (64-bit) floating-point elements.
966///
967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
968#[inline]
969#[target_feature(enable = "sse2")]
970#[cfg_attr(test, assert_instr(cvtdq2pd))]
971#[stable(feature = "simd_x86", since = "1.27.0")]
972pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
973    unsafe {
974        let a = a.as_i32x4();
975        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
976    }
977}
978
979/// Returns `a` with its lower element replaced by `b` after converting it to
980/// an `f64`.
981///
982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
983#[inline]
984#[target_feature(enable = "sse2")]
985#[cfg_attr(test, assert_instr(cvtsi2sd))]
986#[stable(feature = "simd_x86", since = "1.27.0")]
987pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
988    unsafe { simd_insert!(a, 0, b as f64) }
989}
990
991/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
992/// floating-point elements.
993///
994/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
995#[inline]
996#[target_feature(enable = "sse2")]
997#[cfg_attr(test, assert_instr(cvtdq2ps))]
998#[stable(feature = "simd_x86", since = "1.27.0")]
999pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
1000    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
1001}
1002
1003/// Converts packed single-precision (32-bit) floating-point elements in `a`
1004/// to packed 32-bit integers.
1005///
1006/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
1007#[inline]
1008#[target_feature(enable = "sse2")]
1009#[cfg_attr(test, assert_instr(cvtps2dq))]
1010#[stable(feature = "simd_x86", since = "1.27.0")]
1011pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
1012    unsafe { transmute(cvtps2dq(a)) }
1013}
1014
1015/// Returns a vector whose lowest element is `a` and all higher elements are
1016/// `0`.
1017///
1018/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
1019#[inline]
1020#[target_feature(enable = "sse2")]
1021#[stable(feature = "simd_x86", since = "1.27.0")]
1022pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
1023    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
1024}
1025
1026/// Returns the lowest element of `a`.
1027///
1028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
1029#[inline]
1030#[target_feature(enable = "sse2")]
1031#[stable(feature = "simd_x86", since = "1.27.0")]
1032pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
1033    unsafe { simd_extract!(a.as_i32x4(), 0) }
1034}
1035
1036/// Sets packed 64-bit integers with the supplied values, from highest to
1037/// lowest.
1038///
1039/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
1040#[inline]
1041#[target_feature(enable = "sse2")]
1042// no particular instruction to test
1043#[stable(feature = "simd_x86", since = "1.27.0")]
1044pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
1045    unsafe { transmute(i64x2::new(e0, e1)) }
1046}
1047
1048/// Sets packed 32-bit integers with the supplied values.
1049///
1050/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
1051#[inline]
1052#[target_feature(enable = "sse2")]
1053// no particular instruction to test
1054#[stable(feature = "simd_x86", since = "1.27.0")]
1055pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1056    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
1057}
1058
1059/// Sets packed 16-bit integers with the supplied values.
1060///
1061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
1062#[inline]
1063#[target_feature(enable = "sse2")]
1064// no particular instruction to test
1065#[stable(feature = "simd_x86", since = "1.27.0")]
1066pub fn _mm_set_epi16(
1067    e7: i16,
1068    e6: i16,
1069    e5: i16,
1070    e4: i16,
1071    e3: i16,
1072    e2: i16,
1073    e1: i16,
1074    e0: i16,
1075) -> __m128i {
1076    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
1077}
1078
1079/// Sets packed 8-bit integers with the supplied values.
1080///
1081/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
1082#[inline]
1083#[target_feature(enable = "sse2")]
1084// no particular instruction to test
1085#[stable(feature = "simd_x86", since = "1.27.0")]
1086pub fn _mm_set_epi8(
1087    e15: i8,
1088    e14: i8,
1089    e13: i8,
1090    e12: i8,
1091    e11: i8,
1092    e10: i8,
1093    e9: i8,
1094    e8: i8,
1095    e7: i8,
1096    e6: i8,
1097    e5: i8,
1098    e4: i8,
1099    e3: i8,
1100    e2: i8,
1101    e1: i8,
1102    e0: i8,
1103) -> __m128i {
1104    unsafe {
1105        #[rustfmt::skip]
1106        transmute(i8x16::new(
1107            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1108        ))
1109    }
1110}
1111
1112/// Broadcasts 64-bit integer `a` to all elements.
1113///
1114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
1115#[inline]
1116#[target_feature(enable = "sse2")]
1117// no particular instruction to test
1118#[stable(feature = "simd_x86", since = "1.27.0")]
1119pub fn _mm_set1_epi64x(a: i64) -> __m128i {
1120    _mm_set_epi64x(a, a)
1121}
1122
1123/// Broadcasts 32-bit integer `a` to all elements.
1124///
1125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
1126#[inline]
1127#[target_feature(enable = "sse2")]
1128// no particular instruction to test
1129#[stable(feature = "simd_x86", since = "1.27.0")]
1130pub fn _mm_set1_epi32(a: i32) -> __m128i {
1131    _mm_set_epi32(a, a, a, a)
1132}
1133
1134/// Broadcasts 16-bit integer `a` to all elements.
1135///
1136/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
1137#[inline]
1138#[target_feature(enable = "sse2")]
1139// no particular instruction to test
1140#[stable(feature = "simd_x86", since = "1.27.0")]
1141pub fn _mm_set1_epi16(a: i16) -> __m128i {
1142    _mm_set_epi16(a, a, a, a, a, a, a, a)
1143}
1144
1145/// Broadcasts 8-bit integer `a` to all elements.
1146///
1147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
1148#[inline]
1149#[target_feature(enable = "sse2")]
1150// no particular instruction to test
1151#[stable(feature = "simd_x86", since = "1.27.0")]
1152pub fn _mm_set1_epi8(a: i8) -> __m128i {
1153    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
1154}
1155
1156/// Sets packed 32-bit integers with the supplied values in reverse order.
1157///
1158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
1159#[inline]
1160#[target_feature(enable = "sse2")]
1161// no particular instruction to test
1162#[stable(feature = "simd_x86", since = "1.27.0")]
1163pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1164    _mm_set_epi32(e0, e1, e2, e3)
1165}
1166
1167/// Sets packed 16-bit integers with the supplied values in reverse order.
1168///
1169/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
1170#[inline]
1171#[target_feature(enable = "sse2")]
1172// no particular instruction to test
1173#[stable(feature = "simd_x86", since = "1.27.0")]
1174pub fn _mm_setr_epi16(
1175    e7: i16,
1176    e6: i16,
1177    e5: i16,
1178    e4: i16,
1179    e3: i16,
1180    e2: i16,
1181    e1: i16,
1182    e0: i16,
1183) -> __m128i {
1184    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
1185}
1186
1187/// Sets packed 8-bit integers with the supplied values in reverse order.
1188///
1189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
1190#[inline]
1191#[target_feature(enable = "sse2")]
1192// no particular instruction to test
1193#[stable(feature = "simd_x86", since = "1.27.0")]
1194pub fn _mm_setr_epi8(
1195    e15: i8,
1196    e14: i8,
1197    e13: i8,
1198    e12: i8,
1199    e11: i8,
1200    e10: i8,
1201    e9: i8,
1202    e8: i8,
1203    e7: i8,
1204    e6: i8,
1205    e5: i8,
1206    e4: i8,
1207    e3: i8,
1208    e2: i8,
1209    e1: i8,
1210    e0: i8,
1211) -> __m128i {
1212    #[rustfmt::skip]
1213    _mm_set_epi8(
1214        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1215    )
1216}
1217
1218/// Returns a vector with all elements set to zero.
1219///
1220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
1221#[inline]
1222#[target_feature(enable = "sse2")]
1223#[cfg_attr(test, assert_instr(xorps))]
1224#[stable(feature = "simd_x86", since = "1.27.0")]
1225pub fn _mm_setzero_si128() -> __m128i {
1226    const { unsafe { mem::zeroed() } }
1227}
1228
1229/// Loads 64-bit integer from memory into first element of returned vector.
1230///
1231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
1232#[inline]
1233#[target_feature(enable = "sse2")]
1234#[stable(feature = "simd_x86", since = "1.27.0")]
1235pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
1236    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
1237}
1238
1239/// Loads 128-bits of integer data from memory into a new vector.
1240///
1241/// `mem_addr` must be aligned on a 16-byte boundary.
1242///
1243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
1244#[inline]
1245#[target_feature(enable = "sse2")]
1246#[cfg_attr(
1247    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1248    assert_instr(movaps)
1249)]
1250#[stable(feature = "simd_x86", since = "1.27.0")]
1251pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
1252    *mem_addr
1253}
1254
1255/// Loads 128-bits of integer data from memory into a new vector.
1256///
1257/// `mem_addr` does not need to be aligned on any particular boundary.
1258///
1259/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
1260#[inline]
1261#[target_feature(enable = "sse2")]
1262#[cfg_attr(test, assert_instr(movups))]
1263#[stable(feature = "simd_x86", since = "1.27.0")]
1264pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
1265    let mut dst: __m128i = _mm_undefined_si128();
1266    ptr::copy_nonoverlapping(
1267        mem_addr as *const u8,
1268        ptr::addr_of_mut!(dst) as *mut u8,
1269        mem::size_of::<__m128i>(),
1270    );
1271    dst
1272}
1273
1274/// Conditionally store 8-bit integer elements from `a` into memory using
1275/// `mask` flagged as non-temporal (unlikely to be used again soon).
1276///
1277/// Elements are not stored when the highest bit is not set in the
1278/// corresponding element.
1279///
1280/// `mem_addr` should correspond to a 128-bit memory location and does not need
1281/// to be aligned on any particular boundary.
1282///
1283/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
1284///
1285/// # Safety of non-temporal stores
1286///
1287/// After using this intrinsic, but before any other access to the memory that this intrinsic
1288/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1289/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1290/// return.
1291///
1292/// See [`_mm_sfence`] for details.
1293#[inline]
1294#[target_feature(enable = "sse2")]
1295#[cfg_attr(test, assert_instr(maskmovdqu))]
1296#[stable(feature = "simd_x86", since = "1.27.0")]
1297pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
1298    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
1299}
1300
1301/// Stores 128-bits of integer data from `a` into memory.
1302///
1303/// `mem_addr` must be aligned on a 16-byte boundary.
1304///
1305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
1306#[inline]
1307#[target_feature(enable = "sse2")]
1308#[cfg_attr(
1309    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1310    assert_instr(movaps)
1311)]
1312#[stable(feature = "simd_x86", since = "1.27.0")]
1313pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
1314    *mem_addr = a;
1315}
1316
1317/// Stores 128-bits of integer data from `a` into memory.
1318///
1319/// `mem_addr` does not need to be aligned on any particular boundary.
1320///
1321/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
1322#[inline]
1323#[target_feature(enable = "sse2")]
1324#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
1325#[stable(feature = "simd_x86", since = "1.27.0")]
1326pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
1327    mem_addr.write_unaligned(a);
1328}
1329
1330/// Stores the lower 64-bit integer `a` to a memory location.
1331///
1332/// `mem_addr` does not need to be aligned on any particular boundary.
1333///
1334/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
1335#[inline]
1336#[target_feature(enable = "sse2")]
1337#[stable(feature = "simd_x86", since = "1.27.0")]
1338pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
1339    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
1340}
1341
1342/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
1343/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1344/// used again soon).
1345///
1346/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
1347///
1348/// # Safety of non-temporal stores
1349///
1350/// After using this intrinsic, but before any other access to the memory that this intrinsic
1351/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1352/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1353/// return.
1354///
1355/// See [`_mm_sfence`] for details.
1356#[inline]
1357#[target_feature(enable = "sse2")]
1358#[cfg_attr(test, assert_instr(movntdq))]
1359#[stable(feature = "simd_x86", since = "1.27.0")]
1360pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1361    crate::arch::asm!(
1362        vps!("movntdq",  ",{a}"),
1363        p = in(reg) mem_addr,
1364        a = in(xmm_reg) a,
1365        options(nostack, preserves_flags),
1366    );
1367}
1368
1369/// Stores a 32-bit integer value in the specified memory location.
1370/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1371/// used again soon).
1372///
1373/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1374///
1375/// # Safety of non-temporal stores
1376///
1377/// After using this intrinsic, but before any other access to the memory that this intrinsic
1378/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1379/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1380/// return.
1381///
1382/// See [`_mm_sfence`] for details.
1383#[inline]
1384#[target_feature(enable = "sse2")]
1385#[cfg_attr(test, assert_instr(movnti))]
1386#[stable(feature = "simd_x86", since = "1.27.0")]
1387pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1388    crate::arch::asm!(
1389        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1390        p = in(reg) mem_addr,
1391        a = in(reg) a,
1392        options(nostack, preserves_flags),
1393    );
1394}
1395
1396/// Returns a vector where the low element is extracted from `a` and its upper
1397/// element is zero.
1398///
1399/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1400#[inline]
1401#[target_feature(enable = "sse2")]
1402// FIXME movd on msvc, movd on i686
1403#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1404#[stable(feature = "simd_x86", since = "1.27.0")]
1405pub fn _mm_move_epi64(a: __m128i) -> __m128i {
1406    unsafe {
1407        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1408        transmute(r)
1409    }
1410}
1411
1412/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1413/// using signed saturation.
1414///
1415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
1416#[inline]
1417#[target_feature(enable = "sse2")]
1418#[cfg_attr(test, assert_instr(packsswb))]
1419#[stable(feature = "simd_x86", since = "1.27.0")]
1420pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1421    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
1422}
1423
1424/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1425/// using signed saturation.
1426///
1427/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1428#[inline]
1429#[target_feature(enable = "sse2")]
1430#[cfg_attr(test, assert_instr(packssdw))]
1431#[stable(feature = "simd_x86", since = "1.27.0")]
1432pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1433    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1434}
1435
1436/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1437/// using unsigned saturation.
1438///
1439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1440#[inline]
1441#[target_feature(enable = "sse2")]
1442#[cfg_attr(test, assert_instr(packuswb))]
1443#[stable(feature = "simd_x86", since = "1.27.0")]
1444pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1445    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1446}
1447
1448/// Returns the `imm8` element of `a`.
1449///
1450/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
1451#[inline]
1452#[target_feature(enable = "sse2")]
1453#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1454#[rustc_legacy_const_generics(1)]
1455#[stable(feature = "simd_x86", since = "1.27.0")]
1456pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1457    static_assert_uimm_bits!(IMM8, 3);
1458    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1459}
1460
1461/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
1462///
1463/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1464#[inline]
1465#[target_feature(enable = "sse2")]
1466#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1467#[rustc_legacy_const_generics(2)]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1470    static_assert_uimm_bits!(IMM8, 3);
1471    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1472}
1473
1474/// Returns a mask of the most significant bit of each element in `a`.
1475///
1476/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
1477#[inline]
1478#[target_feature(enable = "sse2")]
1479#[cfg_attr(test, assert_instr(pmovmskb))]
1480#[stable(feature = "simd_x86", since = "1.27.0")]
1481pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
1482    unsafe {
1483        let z = i8x16::ZERO;
1484        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1485        simd_bitmask::<_, u16>(m) as u32 as i32
1486    }
1487}
1488
1489/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1490///
1491/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
1492#[inline]
1493#[target_feature(enable = "sse2")]
1494#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1495#[rustc_legacy_const_generics(1)]
1496#[stable(feature = "simd_x86", since = "1.27.0")]
1497pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1498    static_assert_uimm_bits!(IMM8, 8);
1499    unsafe {
1500        let a = a.as_i32x4();
1501        let x: i32x4 = simd_shuffle!(
1502            a,
1503            a,
1504            [
1505                IMM8 as u32 & 0b11,
1506                (IMM8 as u32 >> 2) & 0b11,
1507                (IMM8 as u32 >> 4) & 0b11,
1508                (IMM8 as u32 >> 6) & 0b11,
1509            ],
1510        );
1511        transmute(x)
1512    }
1513}
1514
1515/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1516/// `IMM8`.
1517///
1518/// Put the results in the high 64 bits of the returned vector, with the low 64
1519/// bits being copied from `a`.
1520///
1521/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1522#[inline]
1523#[target_feature(enable = "sse2")]
1524#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1525#[rustc_legacy_const_generics(1)]
1526#[stable(feature = "simd_x86", since = "1.27.0")]
1527pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1528    static_assert_uimm_bits!(IMM8, 8);
1529    unsafe {
1530        let a = a.as_i16x8();
1531        let x: i16x8 = simd_shuffle!(
1532            a,
1533            a,
1534            [
1535                0,
1536                1,
1537                2,
1538                3,
1539                (IMM8 as u32 & 0b11) + 4,
1540                ((IMM8 as u32 >> 2) & 0b11) + 4,
1541                ((IMM8 as u32 >> 4) & 0b11) + 4,
1542                ((IMM8 as u32 >> 6) & 0b11) + 4,
1543            ],
1544        );
1545        transmute(x)
1546    }
1547}
1548
1549/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1550/// `IMM8`.
1551///
1552/// Put the results in the low 64 bits of the returned vector, with the high 64
1553/// bits being copied from `a`.
1554///
1555/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1556#[inline]
1557#[target_feature(enable = "sse2")]
1558#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1559#[rustc_legacy_const_generics(1)]
1560#[stable(feature = "simd_x86", since = "1.27.0")]
1561pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1562    static_assert_uimm_bits!(IMM8, 8);
1563    unsafe {
1564        let a = a.as_i16x8();
1565        let x: i16x8 = simd_shuffle!(
1566            a,
1567            a,
1568            [
1569                IMM8 as u32 & 0b11,
1570                (IMM8 as u32 >> 2) & 0b11,
1571                (IMM8 as u32 >> 4) & 0b11,
1572                (IMM8 as u32 >> 6) & 0b11,
1573                4,
1574                5,
1575                6,
1576                7,
1577            ],
1578        );
1579        transmute(x)
1580    }
1581}
1582
1583/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`.
1584///
1585/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1586#[inline]
1587#[target_feature(enable = "sse2")]
1588#[cfg_attr(test, assert_instr(punpckhbw))]
1589#[stable(feature = "simd_x86", since = "1.27.0")]
1590pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1591    unsafe {
1592        transmute::<i8x16, _>(simd_shuffle!(
1593            a.as_i8x16(),
1594            b.as_i8x16(),
1595            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1596        ))
1597    }
1598}
1599
1600/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`.
1601///
1602/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1603#[inline]
1604#[target_feature(enable = "sse2")]
1605#[cfg_attr(test, assert_instr(punpckhwd))]
1606#[stable(feature = "simd_x86", since = "1.27.0")]
1607pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1608    unsafe {
1609        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1610        transmute::<i16x8, _>(x)
1611    }
1612}
1613
1614/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`.
1615///
1616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1617#[inline]
1618#[target_feature(enable = "sse2")]
1619#[cfg_attr(test, assert_instr(unpckhps))]
1620#[stable(feature = "simd_x86", since = "1.27.0")]
1621pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1622    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1623}
1624
1625/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
1626///
1627/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1628#[inline]
1629#[target_feature(enable = "sse2")]
1630#[cfg_attr(test, assert_instr(unpckhpd))]
1631#[stable(feature = "simd_x86", since = "1.27.0")]
1632pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1633    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1634}
1635
1636/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
1637///
1638/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
1639#[inline]
1640#[target_feature(enable = "sse2")]
1641#[cfg_attr(test, assert_instr(punpcklbw))]
1642#[stable(feature = "simd_x86", since = "1.27.0")]
1643pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1644    unsafe {
1645        transmute::<i8x16, _>(simd_shuffle!(
1646            a.as_i8x16(),
1647            b.as_i8x16(),
1648            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1649        ))
1650    }
1651}
1652
1653/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
1654///
1655/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1656#[inline]
1657#[target_feature(enable = "sse2")]
1658#[cfg_attr(test, assert_instr(punpcklwd))]
1659#[stable(feature = "simd_x86", since = "1.27.0")]
1660pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1661    unsafe {
1662        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1663        transmute::<i16x8, _>(x)
1664    }
1665}
1666
1667/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`.
1668///
1669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1670#[inline]
1671#[target_feature(enable = "sse2")]
1672#[cfg_attr(test, assert_instr(unpcklps))]
1673#[stable(feature = "simd_x86", since = "1.27.0")]
1674pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1675    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1676}
1677
1678/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
1679///
1680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1681#[inline]
1682#[target_feature(enable = "sse2")]
1683#[cfg_attr(test, assert_instr(movlhps))]
1684#[stable(feature = "simd_x86", since = "1.27.0")]
1685pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1686    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1687}
1688
1689/// Returns a new vector with the low element of `a` replaced by the sum of the
1690/// low elements of `a` and `b`.
1691///
1692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
1693#[inline]
1694#[target_feature(enable = "sse2")]
1695#[cfg_attr(test, assert_instr(addsd))]
1696#[stable(feature = "simd_x86", since = "1.27.0")]
1697pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1698    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1699}
1700
1701/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1702/// `b`.
1703///
1704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1705#[inline]
1706#[target_feature(enable = "sse2")]
1707#[cfg_attr(test, assert_instr(addpd))]
1708#[stable(feature = "simd_x86", since = "1.27.0")]
1709pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1710    unsafe { simd_add(a, b) }
1711}
1712
1713/// Returns a new vector with the low element of `a` replaced by the result of
1714/// diving the lower element of `a` by the lower element of `b`.
1715///
1716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1717#[inline]
1718#[target_feature(enable = "sse2")]
1719#[cfg_attr(test, assert_instr(divsd))]
1720#[stable(feature = "simd_x86", since = "1.27.0")]
1721pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1722    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1723}
1724
1725/// Divide packed double-precision (64-bit) floating-point elements in `a` by
1726/// packed elements in `b`.
1727///
1728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1729#[inline]
1730#[target_feature(enable = "sse2")]
1731#[cfg_attr(test, assert_instr(divpd))]
1732#[stable(feature = "simd_x86", since = "1.27.0")]
1733pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1734    unsafe { simd_div(a, b) }
1735}
1736
1737/// Returns a new vector with the low element of `a` replaced by the maximum
1738/// of the lower elements of `a` and `b`.
1739///
1740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1741#[inline]
1742#[target_feature(enable = "sse2")]
1743#[cfg_attr(test, assert_instr(maxsd))]
1744#[stable(feature = "simd_x86", since = "1.27.0")]
1745pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1746    unsafe { maxsd(a, b) }
1747}
1748
1749/// Returns a new vector with the maximum values from corresponding elements in
1750/// `a` and `b`.
1751///
1752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1753#[inline]
1754#[target_feature(enable = "sse2")]
1755#[cfg_attr(test, assert_instr(maxpd))]
1756#[stable(feature = "simd_x86", since = "1.27.0")]
1757pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1758    unsafe { maxpd(a, b) }
1759}
1760
1761/// Returns a new vector with the low element of `a` replaced by the minimum
1762/// of the lower elements of `a` and `b`.
1763///
1764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1765#[inline]
1766#[target_feature(enable = "sse2")]
1767#[cfg_attr(test, assert_instr(minsd))]
1768#[stable(feature = "simd_x86", since = "1.27.0")]
1769pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1770    unsafe { minsd(a, b) }
1771}
1772
1773/// Returns a new vector with the minimum values from corresponding elements in
1774/// `a` and `b`.
1775///
1776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1777#[inline]
1778#[target_feature(enable = "sse2")]
1779#[cfg_attr(test, assert_instr(minpd))]
1780#[stable(feature = "simd_x86", since = "1.27.0")]
1781pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1782    unsafe { minpd(a, b) }
1783}
1784
1785/// Returns a new vector with the low element of `a` replaced by multiplying the
1786/// low elements of `a` and `b`.
1787///
1788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1789#[inline]
1790#[target_feature(enable = "sse2")]
1791#[cfg_attr(test, assert_instr(mulsd))]
1792#[stable(feature = "simd_x86", since = "1.27.0")]
1793pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1794    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1795}
1796
1797/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1798/// and `b`.
1799///
1800/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1801#[inline]
1802#[target_feature(enable = "sse2")]
1803#[cfg_attr(test, assert_instr(mulpd))]
1804#[stable(feature = "simd_x86", since = "1.27.0")]
1805pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1806    unsafe { simd_mul(a, b) }
1807}
1808
1809/// Returns a new vector with the low element of `a` replaced by the square
1810/// root of the lower element `b`.
1811///
1812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1813#[inline]
1814#[target_feature(enable = "sse2")]
1815#[cfg_attr(test, assert_instr(sqrtsd))]
1816#[stable(feature = "simd_x86", since = "1.27.0")]
1817pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1818    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1819}
1820
1821/// Returns a new vector with the square root of each of the values in `a`.
1822///
1823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1824#[inline]
1825#[target_feature(enable = "sse2")]
1826#[cfg_attr(test, assert_instr(sqrtpd))]
1827#[stable(feature = "simd_x86", since = "1.27.0")]
1828pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1829    unsafe { simd_fsqrt(a) }
1830}
1831
1832/// Returns a new vector with the low element of `a` replaced by subtracting the
1833/// low element by `b` from the low element of `a`.
1834///
1835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1836#[inline]
1837#[target_feature(enable = "sse2")]
1838#[cfg_attr(test, assert_instr(subsd))]
1839#[stable(feature = "simd_x86", since = "1.27.0")]
1840pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1841    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1842}
1843
1844/// Subtract packed double-precision (64-bit) floating-point elements in `b`
1845/// from `a`.
1846///
1847/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1848#[inline]
1849#[target_feature(enable = "sse2")]
1850#[cfg_attr(test, assert_instr(subpd))]
1851#[stable(feature = "simd_x86", since = "1.27.0")]
1852pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1853    unsafe { simd_sub(a, b) }
1854}
1855
1856/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1857/// elements in `a` and `b`.
1858///
1859/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
1860#[inline]
1861#[target_feature(enable = "sse2")]
1862#[cfg_attr(test, assert_instr(andps))]
1863#[stable(feature = "simd_x86", since = "1.27.0")]
1864pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1865    unsafe {
1866        let a: __m128i = transmute(a);
1867        let b: __m128i = transmute(b);
1868        transmute(_mm_and_si128(a, b))
1869    }
1870}
1871
1872/// Computes the bitwise NOT of `a` and then AND with `b`.
1873///
1874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1875#[inline]
1876#[target_feature(enable = "sse2")]
1877#[cfg_attr(test, assert_instr(andnps))]
1878#[stable(feature = "simd_x86", since = "1.27.0")]
1879pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1880    unsafe {
1881        let a: __m128i = transmute(a);
1882        let b: __m128i = transmute(b);
1883        transmute(_mm_andnot_si128(a, b))
1884    }
1885}
1886
1887/// Computes the bitwise OR of `a` and `b`.
1888///
1889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1890#[inline]
1891#[target_feature(enable = "sse2")]
1892#[cfg_attr(test, assert_instr(orps))]
1893#[stable(feature = "simd_x86", since = "1.27.0")]
1894pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1895    unsafe {
1896        let a: __m128i = transmute(a);
1897        let b: __m128i = transmute(b);
1898        transmute(_mm_or_si128(a, b))
1899    }
1900}
1901
1902/// Computes the bitwise XOR of `a` and `b`.
1903///
1904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
1905#[inline]
1906#[target_feature(enable = "sse2")]
1907#[cfg_attr(test, assert_instr(xorps))]
1908#[stable(feature = "simd_x86", since = "1.27.0")]
1909pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
1910    unsafe {
1911        let a: __m128i = transmute(a);
1912        let b: __m128i = transmute(b);
1913        transmute(_mm_xor_si128(a, b))
1914    }
1915}
1916
1917/// Returns a new vector with the low element of `a` replaced by the equality
1918/// comparison of the lower elements of `a` and `b`.
1919///
1920/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
1921#[inline]
1922#[target_feature(enable = "sse2")]
1923#[cfg_attr(test, assert_instr(cmpeqsd))]
1924#[stable(feature = "simd_x86", since = "1.27.0")]
1925pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
1926    unsafe { cmpsd(a, b, 0) }
1927}
1928
1929/// Returns a new vector with the low element of `a` replaced by the less-than
1930/// comparison of the lower elements of `a` and `b`.
1931///
1932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
1933#[inline]
1934#[target_feature(enable = "sse2")]
1935#[cfg_attr(test, assert_instr(cmpltsd))]
1936#[stable(feature = "simd_x86", since = "1.27.0")]
1937pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
1938    unsafe { cmpsd(a, b, 1) }
1939}
1940
1941/// Returns a new vector with the low element of `a` replaced by the
1942/// less-than-or-equal comparison of the lower elements of `a` and `b`.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
1945#[inline]
1946#[target_feature(enable = "sse2")]
1947#[cfg_attr(test, assert_instr(cmplesd))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
1950    unsafe { cmpsd(a, b, 2) }
1951}
1952
1953/// Returns a new vector with the low element of `a` replaced by the
1954/// greater-than comparison of the lower elements of `a` and `b`.
1955///
1956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
1957#[inline]
1958#[target_feature(enable = "sse2")]
1959#[cfg_attr(test, assert_instr(cmpltsd))]
1960#[stable(feature = "simd_x86", since = "1.27.0")]
1961pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
1962    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1963}
1964
1965/// Returns a new vector with the low element of `a` replaced by the
1966/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
1967///
1968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
1969#[inline]
1970#[target_feature(enable = "sse2")]
1971#[cfg_attr(test, assert_instr(cmplesd))]
1972#[stable(feature = "simd_x86", since = "1.27.0")]
1973pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
1974    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1975}
1976
1977/// Returns a new vector with the low element of `a` replaced by the result
1978/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
1979/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
1980/// otherwise.
1981///
1982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
1983#[inline]
1984#[target_feature(enable = "sse2")]
1985#[cfg_attr(test, assert_instr(cmpordsd))]
1986#[stable(feature = "simd_x86", since = "1.27.0")]
1987pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
1988    unsafe { cmpsd(a, b, 7) }
1989}
1990
1991/// Returns a new vector with the low element of `a` replaced by the result of
1992/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
1993/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
1994///
1995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
1996#[inline]
1997#[target_feature(enable = "sse2")]
1998#[cfg_attr(test, assert_instr(cmpunordsd))]
1999#[stable(feature = "simd_x86", since = "1.27.0")]
2000pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
2001    unsafe { cmpsd(a, b, 3) }
2002}
2003
2004/// Returns a new vector with the low element of `a` replaced by the not-equal
2005/// comparison of the lower elements of `a` and `b`.
2006///
2007/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
2008#[inline]
2009#[target_feature(enable = "sse2")]
2010#[cfg_attr(test, assert_instr(cmpneqsd))]
2011#[stable(feature = "simd_x86", since = "1.27.0")]
2012pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2013    unsafe { cmpsd(a, b, 4) }
2014}
2015
2016/// Returns a new vector with the low element of `a` replaced by the
2017/// not-less-than comparison of the lower elements of `a` and `b`.
2018///
2019/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2020#[inline]
2021#[target_feature(enable = "sse2")]
2022#[cfg_attr(test, assert_instr(cmpnltsd))]
2023#[stable(feature = "simd_x86", since = "1.27.0")]
2024pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2025    unsafe { cmpsd(a, b, 5) }
2026}
2027
2028/// Returns a new vector with the low element of `a` replaced by the
2029/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2030///
2031/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2032#[inline]
2033#[target_feature(enable = "sse2")]
2034#[cfg_attr(test, assert_instr(cmpnlesd))]
2035#[stable(feature = "simd_x86", since = "1.27.0")]
2036pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2037    unsafe { cmpsd(a, b, 6) }
2038}
2039
2040/// Returns a new vector with the low element of `a` replaced by the
2041/// not-greater-than comparison of the lower elements of `a` and `b`.
2042///
2043/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2044#[inline]
2045#[target_feature(enable = "sse2")]
2046#[cfg_attr(test, assert_instr(cmpnltsd))]
2047#[stable(feature = "simd_x86", since = "1.27.0")]
2048pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2049    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2050}
2051
2052/// Returns a new vector with the low element of `a` replaced by the
2053/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2054///
2055/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2056#[inline]
2057#[target_feature(enable = "sse2")]
2058#[cfg_attr(test, assert_instr(cmpnlesd))]
2059#[stable(feature = "simd_x86", since = "1.27.0")]
2060pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2061    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2062}
2063
2064/// Compares corresponding elements in `a` and `b` for equality.
2065///
2066/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
2067#[inline]
2068#[target_feature(enable = "sse2")]
2069#[cfg_attr(test, assert_instr(cmpeqpd))]
2070#[stable(feature = "simd_x86", since = "1.27.0")]
2071pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2072    unsafe { cmppd(a, b, 0) }
2073}
2074
2075/// Compares corresponding elements in `a` and `b` for less-than.
2076///
2077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
2078#[inline]
2079#[target_feature(enable = "sse2")]
2080#[cfg_attr(test, assert_instr(cmpltpd))]
2081#[stable(feature = "simd_x86", since = "1.27.0")]
2082pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2083    unsafe { cmppd(a, b, 1) }
2084}
2085
2086/// Compares corresponding elements in `a` and `b` for less-than-or-equal
2087///
2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2089#[inline]
2090#[target_feature(enable = "sse2")]
2091#[cfg_attr(test, assert_instr(cmplepd))]
2092#[stable(feature = "simd_x86", since = "1.27.0")]
2093pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2094    unsafe { cmppd(a, b, 2) }
2095}
2096
2097/// Compares corresponding elements in `a` and `b` for greater-than.
2098///
2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2100#[inline]
2101#[target_feature(enable = "sse2")]
2102#[cfg_attr(test, assert_instr(cmpltpd))]
2103#[stable(feature = "simd_x86", since = "1.27.0")]
2104pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2105    _mm_cmplt_pd(b, a)
2106}
2107
2108/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2109///
2110/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2111#[inline]
2112#[target_feature(enable = "sse2")]
2113#[cfg_attr(test, assert_instr(cmplepd))]
2114#[stable(feature = "simd_x86", since = "1.27.0")]
2115pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2116    _mm_cmple_pd(b, a)
2117}
2118
2119/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2120///
2121/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2122#[inline]
2123#[target_feature(enable = "sse2")]
2124#[cfg_attr(test, assert_instr(cmpordpd))]
2125#[stable(feature = "simd_x86", since = "1.27.0")]
2126pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2127    unsafe { cmppd(a, b, 7) }
2128}
2129
2130/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2133#[inline]
2134#[target_feature(enable = "sse2")]
2135#[cfg_attr(test, assert_instr(cmpunordpd))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2138    unsafe { cmppd(a, b, 3) }
2139}
2140
2141/// Compares corresponding elements in `a` and `b` for not-equal.
2142///
2143/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2144#[inline]
2145#[target_feature(enable = "sse2")]
2146#[cfg_attr(test, assert_instr(cmpneqpd))]
2147#[stable(feature = "simd_x86", since = "1.27.0")]
2148pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2149    unsafe { cmppd(a, b, 4) }
2150}
2151
2152/// Compares corresponding elements in `a` and `b` for not-less-than.
2153///
2154/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2155#[inline]
2156#[target_feature(enable = "sse2")]
2157#[cfg_attr(test, assert_instr(cmpnltpd))]
2158#[stable(feature = "simd_x86", since = "1.27.0")]
2159pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2160    unsafe { cmppd(a, b, 5) }
2161}
2162
2163/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2164///
2165/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2166#[inline]
2167#[target_feature(enable = "sse2")]
2168#[cfg_attr(test, assert_instr(cmpnlepd))]
2169#[stable(feature = "simd_x86", since = "1.27.0")]
2170pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2171    unsafe { cmppd(a, b, 6) }
2172}
2173
2174/// Compares corresponding elements in `a` and `b` for not-greater-than.
2175///
2176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2177#[inline]
2178#[target_feature(enable = "sse2")]
2179#[cfg_attr(test, assert_instr(cmpnltpd))]
2180#[stable(feature = "simd_x86", since = "1.27.0")]
2181pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2182    _mm_cmpnlt_pd(b, a)
2183}
2184
2185/// Compares corresponding elements in `a` and `b` for
2186/// not-greater-than-or-equal.
2187///
2188/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2189#[inline]
2190#[target_feature(enable = "sse2")]
2191#[cfg_attr(test, assert_instr(cmpnlepd))]
2192#[stable(feature = "simd_x86", since = "1.27.0")]
2193pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2194    _mm_cmpnle_pd(b, a)
2195}
2196
2197/// Compares the lower element of `a` and `b` for equality.
2198///
2199/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2200#[inline]
2201#[target_feature(enable = "sse2")]
2202#[cfg_attr(test, assert_instr(comisd))]
2203#[stable(feature = "simd_x86", since = "1.27.0")]
2204pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2205    unsafe { comieqsd(a, b) }
2206}
2207
2208/// Compares the lower element of `a` and `b` for less-than.
2209///
2210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2211#[inline]
2212#[target_feature(enable = "sse2")]
2213#[cfg_attr(test, assert_instr(comisd))]
2214#[stable(feature = "simd_x86", since = "1.27.0")]
2215pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2216    unsafe { comiltsd(a, b) }
2217}
2218
2219/// Compares the lower element of `a` and `b` for less-than-or-equal.
2220///
2221/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2222#[inline]
2223#[target_feature(enable = "sse2")]
2224#[cfg_attr(test, assert_instr(comisd))]
2225#[stable(feature = "simd_x86", since = "1.27.0")]
2226pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2227    unsafe { comilesd(a, b) }
2228}
2229
2230/// Compares the lower element of `a` and `b` for greater-than.
2231///
2232/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2233#[inline]
2234#[target_feature(enable = "sse2")]
2235#[cfg_attr(test, assert_instr(comisd))]
2236#[stable(feature = "simd_x86", since = "1.27.0")]
2237pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2238    unsafe { comigtsd(a, b) }
2239}
2240
2241/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2242///
2243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2244#[inline]
2245#[target_feature(enable = "sse2")]
2246#[cfg_attr(test, assert_instr(comisd))]
2247#[stable(feature = "simd_x86", since = "1.27.0")]
2248pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2249    unsafe { comigesd(a, b) }
2250}
2251
2252/// Compares the lower element of `a` and `b` for not-equal.
2253///
2254/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2255#[inline]
2256#[target_feature(enable = "sse2")]
2257#[cfg_attr(test, assert_instr(comisd))]
2258#[stable(feature = "simd_x86", since = "1.27.0")]
2259pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2260    unsafe { comineqsd(a, b) }
2261}
2262
2263/// Compares the lower element of `a` and `b` for equality.
2264///
2265/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2266#[inline]
2267#[target_feature(enable = "sse2")]
2268#[cfg_attr(test, assert_instr(ucomisd))]
2269#[stable(feature = "simd_x86", since = "1.27.0")]
2270pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2271    unsafe { ucomieqsd(a, b) }
2272}
2273
2274/// Compares the lower element of `a` and `b` for less-than.
2275///
2276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2277#[inline]
2278#[target_feature(enable = "sse2")]
2279#[cfg_attr(test, assert_instr(ucomisd))]
2280#[stable(feature = "simd_x86", since = "1.27.0")]
2281pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2282    unsafe { ucomiltsd(a, b) }
2283}
2284
2285/// Compares the lower element of `a` and `b` for less-than-or-equal.
2286///
2287/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2288#[inline]
2289#[target_feature(enable = "sse2")]
2290#[cfg_attr(test, assert_instr(ucomisd))]
2291#[stable(feature = "simd_x86", since = "1.27.0")]
2292pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2293    unsafe { ucomilesd(a, b) }
2294}
2295
2296/// Compares the lower element of `a` and `b` for greater-than.
2297///
2298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2299#[inline]
2300#[target_feature(enable = "sse2")]
2301#[cfg_attr(test, assert_instr(ucomisd))]
2302#[stable(feature = "simd_x86", since = "1.27.0")]
2303pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2304    unsafe { ucomigtsd(a, b) }
2305}
2306
2307/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2308///
2309/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2310#[inline]
2311#[target_feature(enable = "sse2")]
2312#[cfg_attr(test, assert_instr(ucomisd))]
2313#[stable(feature = "simd_x86", since = "1.27.0")]
2314pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2315    unsafe { ucomigesd(a, b) }
2316}
2317
2318/// Compares the lower element of `a` and `b` for not-equal.
2319///
2320/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2321#[inline]
2322#[target_feature(enable = "sse2")]
2323#[cfg_attr(test, assert_instr(ucomisd))]
2324#[stable(feature = "simd_x86", since = "1.27.0")]
2325pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2326    unsafe { ucomineqsd(a, b) }
2327}
2328
2329/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2330/// packed single-precision (32-bit) floating-point elements
2331///
2332/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2333#[inline]
2334#[target_feature(enable = "sse2")]
2335#[cfg_attr(test, assert_instr(cvtpd2ps))]
2336#[stable(feature = "simd_x86", since = "1.27.0")]
2337pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2338    unsafe {
2339        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2340        let zero = f32x2::ZERO;
2341        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2342    }
2343}
2344
2345/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2346/// packed
2347/// double-precision (64-bit) floating-point elements.
2348///
2349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2350#[inline]
2351#[target_feature(enable = "sse2")]
2352#[cfg_attr(test, assert_instr(cvtps2pd))]
2353#[stable(feature = "simd_x86", since = "1.27.0")]
2354pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
2355    unsafe {
2356        let a = a.as_f32x4();
2357        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2358    }
2359}
2360
2361/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2362/// packed 32-bit integers.
2363///
2364/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2365#[inline]
2366#[target_feature(enable = "sse2")]
2367#[cfg_attr(test, assert_instr(cvtpd2dq))]
2368#[stable(feature = "simd_x86", since = "1.27.0")]
2369pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2370    unsafe { transmute(cvtpd2dq(a)) }
2371}
2372
2373/// Converts the lower double-precision (64-bit) floating-point element in a to
2374/// a 32-bit integer.
2375///
2376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
2377#[inline]
2378#[target_feature(enable = "sse2")]
2379#[cfg_attr(test, assert_instr(cvtsd2si))]
2380#[stable(feature = "simd_x86", since = "1.27.0")]
2381pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2382    unsafe { cvtsd2si(a) }
2383}
2384
2385/// Converts the lower double-precision (64-bit) floating-point element in `b`
2386/// to a single-precision (32-bit) floating-point element, store the result in
2387/// the lower element of the return value, and copies the upper element from `a`
2388/// to the upper element the return value.
2389///
2390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2391#[inline]
2392#[target_feature(enable = "sse2")]
2393#[cfg_attr(test, assert_instr(cvtsd2ss))]
2394#[stable(feature = "simd_x86", since = "1.27.0")]
2395pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2396    unsafe { cvtsd2ss(a, b) }
2397}
2398
2399/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2400///
2401/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2402#[inline]
2403#[target_feature(enable = "sse2")]
2404#[stable(feature = "simd_x86", since = "1.27.0")]
2405pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2406    unsafe { simd_extract!(a, 0) }
2407}
2408
2409/// Converts the lower single-precision (32-bit) floating-point element in `b`
2410/// to a double-precision (64-bit) floating-point element, store the result in
2411/// the lower element of the return value, and copies the upper element from `a`
2412/// to the upper element the return value.
2413///
2414/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2415#[inline]
2416#[target_feature(enable = "sse2")]
2417#[cfg_attr(test, assert_instr(cvtss2sd))]
2418#[stable(feature = "simd_x86", since = "1.27.0")]
2419pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2420    unsafe { cvtss2sd(a, b) }
2421}
2422
2423/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2424/// packed 32-bit integers with truncation.
2425///
2426/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2427#[inline]
2428#[target_feature(enable = "sse2")]
2429#[cfg_attr(test, assert_instr(cvttpd2dq))]
2430#[stable(feature = "simd_x86", since = "1.27.0")]
2431pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2432    unsafe { transmute(cvttpd2dq(a)) }
2433}
2434
2435/// Converts the lower double-precision (64-bit) floating-point element in `a`
2436/// to a 32-bit integer with truncation.
2437///
2438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
2439#[inline]
2440#[target_feature(enable = "sse2")]
2441#[cfg_attr(test, assert_instr(cvttsd2si))]
2442#[stable(feature = "simd_x86", since = "1.27.0")]
2443pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2444    unsafe { cvttsd2si(a) }
2445}
2446
2447/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2448/// packed 32-bit integers with truncation.
2449///
2450/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2451#[inline]
2452#[target_feature(enable = "sse2")]
2453#[cfg_attr(test, assert_instr(cvttps2dq))]
2454#[stable(feature = "simd_x86", since = "1.27.0")]
2455pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2456    unsafe { transmute(cvttps2dq(a)) }
2457}
2458
2459/// Copies double-precision (64-bit) floating-point element `a` to the lower
2460/// element of the packed 64-bit return value.
2461///
2462/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2463#[inline]
2464#[target_feature(enable = "sse2")]
2465#[stable(feature = "simd_x86", since = "1.27.0")]
2466pub fn _mm_set_sd(a: f64) -> __m128d {
2467    _mm_set_pd(0.0, a)
2468}
2469
2470/// Broadcasts double-precision (64-bit) floating-point value a to all elements
2471/// of the return value.
2472///
2473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2474#[inline]
2475#[target_feature(enable = "sse2")]
2476#[stable(feature = "simd_x86", since = "1.27.0")]
2477pub fn _mm_set1_pd(a: f64) -> __m128d {
2478    _mm_set_pd(a, a)
2479}
2480
2481/// Broadcasts double-precision (64-bit) floating-point value a to all elements
2482/// of the return value.
2483///
2484/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2485#[inline]
2486#[target_feature(enable = "sse2")]
2487#[stable(feature = "simd_x86", since = "1.27.0")]
2488pub fn _mm_set_pd1(a: f64) -> __m128d {
2489    _mm_set_pd(a, a)
2490}
2491
2492/// Sets packed double-precision (64-bit) floating-point elements in the return
2493/// value with the supplied values.
2494///
2495/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
2496#[inline]
2497#[target_feature(enable = "sse2")]
2498#[stable(feature = "simd_x86", since = "1.27.0")]
2499pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2500    __m128d([b, a])
2501}
2502
2503/// Sets packed double-precision (64-bit) floating-point elements in the return
2504/// value with the supplied values in reverse order.
2505///
2506/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2507#[inline]
2508#[target_feature(enable = "sse2")]
2509#[stable(feature = "simd_x86", since = "1.27.0")]
2510pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2511    _mm_set_pd(b, a)
2512}
2513
2514/// Returns packed double-precision (64-bit) floating-point elements with all
2515/// zeros.
2516///
2517/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2518#[inline]
2519#[target_feature(enable = "sse2")]
2520#[cfg_attr(test, assert_instr(xorp))]
2521#[stable(feature = "simd_x86", since = "1.27.0")]
2522pub fn _mm_setzero_pd() -> __m128d {
2523    const { unsafe { mem::zeroed() } }
2524}
2525
2526/// Returns a mask of the most significant bit of each element in `a`.
2527///
2528/// The mask is stored in the 2 least significant bits of the return value.
2529/// All other bits are set to `0`.
2530///
2531/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2532#[inline]
2533#[target_feature(enable = "sse2")]
2534#[cfg_attr(test, assert_instr(movmskpd))]
2535#[stable(feature = "simd_x86", since = "1.27.0")]
2536pub fn _mm_movemask_pd(a: __m128d) -> i32 {
2537    // Propagate the highest bit to the rest, because simd_bitmask
2538    // requires all-1 or all-0.
2539    unsafe {
2540        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2541        simd_bitmask::<i64x2, u8>(mask).into()
2542    }
2543}
2544
2545/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2546/// floating-point elements) from memory into the returned vector.
2547/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2548/// exception may be generated.
2549///
2550/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
2551#[inline]
2552#[target_feature(enable = "sse2")]
2553#[cfg_attr(
2554    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2555    assert_instr(movaps)
2556)]
2557#[stable(feature = "simd_x86", since = "1.27.0")]
2558#[allow(clippy::cast_ptr_alignment)]
2559pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2560    *(mem_addr as *const __m128d)
2561}
2562
2563/// Loads a 64-bit double-precision value to the low element of a
2564/// 128-bit integer vector and clears the upper element.
2565///
2566/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2567#[inline]
2568#[target_feature(enable = "sse2")]
2569#[cfg_attr(test, assert_instr(movsd))]
2570#[stable(feature = "simd_x86", since = "1.27.0")]
2571pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2572    _mm_setr_pd(*mem_addr, 0.)
2573}
2574
2575/// Loads a double-precision value into the high-order bits of a 128-bit
2576/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2577/// bits of the first operand.
2578///
2579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2580#[inline]
2581#[target_feature(enable = "sse2")]
2582#[cfg_attr(test, assert_instr(movhps))]
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2585    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2586}
2587
2588/// Loads a double-precision value into the low-order bits of a 128-bit
2589/// vector of `[2 x double]`. The high-order bits are copied from the
2590/// high-order bits of the first operand.
2591///
2592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2593#[inline]
2594#[target_feature(enable = "sse2")]
2595#[cfg_attr(test, assert_instr(movlps))]
2596#[stable(feature = "simd_x86", since = "1.27.0")]
2597pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2598    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2599}
2600
2601/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2602/// aligned memory location.
2603/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2604/// used again soon).
2605///
2606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2607///
2608/// # Safety of non-temporal stores
2609///
2610/// After using this intrinsic, but before any other access to the memory that this intrinsic
2611/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2612/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2613/// return.
2614///
2615/// See [`_mm_sfence`] for details.
2616#[inline]
2617#[target_feature(enable = "sse2")]
2618#[cfg_attr(test, assert_instr(movntpd))]
2619#[stable(feature = "simd_x86", since = "1.27.0")]
2620#[allow(clippy::cast_ptr_alignment)]
2621pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2622    crate::arch::asm!(
2623        vps!("movntpd", ",{a}"),
2624        p = in(reg) mem_addr,
2625        a = in(xmm_reg) a,
2626        options(nostack, preserves_flags),
2627    );
2628}
2629
2630/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2631/// memory location.
2632///
2633/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2634#[inline]
2635#[target_feature(enable = "sse2")]
2636#[cfg_attr(test, assert_instr(movlps))]
2637#[stable(feature = "simd_x86", since = "1.27.0")]
2638pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2639    *mem_addr = simd_extract!(a, 0)
2640}
2641
2642/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2643/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2644/// on a 16-byte boundary or a general-protection exception may be generated.
2645///
2646/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2647#[inline]
2648#[target_feature(enable = "sse2")]
2649#[cfg_attr(
2650    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2651    assert_instr(movaps)
2652)]
2653#[stable(feature = "simd_x86", since = "1.27.0")]
2654#[allow(clippy::cast_ptr_alignment)]
2655pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2656    *(mem_addr as *mut __m128d) = a;
2657}
2658
2659/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2660/// floating-point elements) from `a` into memory.
2661/// `mem_addr` does not need to be aligned on any particular boundary.
2662///
2663/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2664#[inline]
2665#[target_feature(enable = "sse2")]
2666#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2667#[stable(feature = "simd_x86", since = "1.27.0")]
2668pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2669    mem_addr.cast::<__m128d>().write_unaligned(a);
2670}
2671
2672/// Store 16-bit integer from the first element of a into memory.
2673///
2674/// `mem_addr` does not need to be aligned on any particular boundary.
2675///
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2677#[inline]
2678#[target_feature(enable = "sse2")]
2679#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2680pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2681    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2682}
2683
2684/// Store 32-bit integer from the first element of a into memory.
2685///
2686/// `mem_addr` does not need to be aligned on any particular boundary.
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2689#[inline]
2690#[target_feature(enable = "sse2")]
2691#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2692pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2693    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2694}
2695
2696/// Store 64-bit integer from the first element of a into memory.
2697///
2698/// `mem_addr` does not need to be aligned on any particular boundary.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2701#[inline]
2702#[target_feature(enable = "sse2")]
2703#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2704pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2705    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2706}
2707
2708/// Stores the lower double-precision (64-bit) floating-point element from `a`
2709/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2710/// 16-byte boundary or a general-protection exception may be generated.
2711///
2712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2713#[inline]
2714#[target_feature(enable = "sse2")]
2715#[stable(feature = "simd_x86", since = "1.27.0")]
2716#[allow(clippy::cast_ptr_alignment)]
2717pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2718    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2719    *(mem_addr as *mut __m128d) = b;
2720}
2721
2722/// Stores the lower double-precision (64-bit) floating-point element from `a`
2723/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2724/// 16-byte boundary or a general-protection exception may be generated.
2725///
2726/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2727#[inline]
2728#[target_feature(enable = "sse2")]
2729#[stable(feature = "simd_x86", since = "1.27.0")]
2730#[allow(clippy::cast_ptr_alignment)]
2731pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2732    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2733    *(mem_addr as *mut __m128d) = b;
2734}
2735
2736/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2737/// memory in reverse order.
2738/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2739/// exception may be generated.
2740///
2741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2742#[inline]
2743#[target_feature(enable = "sse2")]
2744#[stable(feature = "simd_x86", since = "1.27.0")]
2745#[allow(clippy::cast_ptr_alignment)]
2746pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2747    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2748    *(mem_addr as *mut __m128d) = b;
2749}
2750
2751/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2752/// memory location.
2753///
2754/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2755#[inline]
2756#[target_feature(enable = "sse2")]
2757#[cfg_attr(test, assert_instr(movhps))]
2758#[stable(feature = "simd_x86", since = "1.27.0")]
2759pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2760    *mem_addr = simd_extract!(a, 1);
2761}
2762
2763/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2764/// memory location.
2765///
2766/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2767#[inline]
2768#[target_feature(enable = "sse2")]
2769#[cfg_attr(test, assert_instr(movlps))]
2770#[stable(feature = "simd_x86", since = "1.27.0")]
2771pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2772    *mem_addr = simd_extract!(a, 0);
2773}
2774
2775/// Loads a double-precision (64-bit) floating-point element from memory
2776/// into both elements of returned vector.
2777///
2778/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2779#[inline]
2780#[target_feature(enable = "sse2")]
2781// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2782#[stable(feature = "simd_x86", since = "1.27.0")]
2783pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2784    let d = *mem_addr;
2785    _mm_setr_pd(d, d)
2786}
2787
2788/// Loads a double-precision (64-bit) floating-point element from memory
2789/// into both elements of returned vector.
2790///
2791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2792#[inline]
2793#[target_feature(enable = "sse2")]
2794// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2795#[stable(feature = "simd_x86", since = "1.27.0")]
2796pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2797    _mm_load1_pd(mem_addr)
2798}
2799
2800/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2801/// the returned vector in reverse order. `mem_addr` must be aligned on a
2802/// 16-byte boundary or a general-protection exception may be generated.
2803///
2804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
2805#[inline]
2806#[target_feature(enable = "sse2")]
2807#[cfg_attr(
2808    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2809    assert_instr(movaps)
2810)]
2811#[stable(feature = "simd_x86", since = "1.27.0")]
2812pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2813    let a = _mm_load_pd(mem_addr);
2814    simd_shuffle!(a, a, [1, 0])
2815}
2816
2817/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2818/// floating-point elements) from memory into the returned vector.
2819/// `mem_addr` does not need to be aligned on any particular boundary.
2820///
2821/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2822#[inline]
2823#[target_feature(enable = "sse2")]
2824#[cfg_attr(test, assert_instr(movups))]
2825#[stable(feature = "simd_x86", since = "1.27.0")]
2826pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2827    let mut dst = _mm_undefined_pd();
2828    ptr::copy_nonoverlapping(
2829        mem_addr as *const u8,
2830        ptr::addr_of_mut!(dst) as *mut u8,
2831        mem::size_of::<__m128d>(),
2832    );
2833    dst
2834}
2835
2836/// Loads unaligned 16-bits of integer data from memory into new vector.
2837///
2838/// `mem_addr` does not need to be aligned on any particular boundary.
2839///
2840/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2841#[inline]
2842#[target_feature(enable = "sse2")]
2843#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2844pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2845    transmute(i16x8::new(
2846        ptr::read_unaligned(mem_addr as *const i16),
2847        0,
2848        0,
2849        0,
2850        0,
2851        0,
2852        0,
2853        0,
2854    ))
2855}
2856
2857/// Loads unaligned 32-bits of integer data from memory into new vector.
2858///
2859/// `mem_addr` does not need to be aligned on any particular boundary.
2860///
2861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2862#[inline]
2863#[target_feature(enable = "sse2")]
2864#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2865pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2866    transmute(i32x4::new(
2867        ptr::read_unaligned(mem_addr as *const i32),
2868        0,
2869        0,
2870        0,
2871    ))
2872}
2873
2874/// Loads unaligned 64-bits of integer data from memory into new vector.
2875///
2876/// `mem_addr` does not need to be aligned on any particular boundary.
2877///
2878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
2879#[inline]
2880#[target_feature(enable = "sse2")]
2881#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2882pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2883    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2884}
2885
2886/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2887/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2888/// parameter as a specifier.
2889///
2890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
2891#[inline]
2892#[target_feature(enable = "sse2")]
2893#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2894#[rustc_legacy_const_generics(2)]
2895#[stable(feature = "simd_x86", since = "1.27.0")]
2896pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2897    static_assert_uimm_bits!(MASK, 8);
2898    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
2899}
2900
2901/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2902/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2903/// 64 bits are set to the upper 64 bits of the first parameter.
2904///
2905/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
2906#[inline]
2907#[target_feature(enable = "sse2")]
2908#[cfg_attr(test, assert_instr(movsd))]
2909#[stable(feature = "simd_x86", since = "1.27.0")]
2910pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2911    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
2912}
2913
2914/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2915/// floating-point vector of `[4 x float]`.
2916///
2917/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2918#[inline]
2919#[target_feature(enable = "sse2")]
2920#[stable(feature = "simd_x86", since = "1.27.0")]
2921pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
2922    unsafe { transmute(a) }
2923}
2924
2925/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2926/// integer vector.
2927///
2928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
2929#[inline]
2930#[target_feature(enable = "sse2")]
2931#[stable(feature = "simd_x86", since = "1.27.0")]
2932pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
2933    unsafe { transmute(a) }
2934}
2935
2936/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2937/// floating-point vector of `[2 x double]`.
2938///
2939/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2940#[inline]
2941#[target_feature(enable = "sse2")]
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub fn _mm_castps_pd(a: __m128) -> __m128d {
2944    unsafe { transmute(a) }
2945}
2946
2947/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2948/// integer vector.
2949///
2950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2951#[inline]
2952#[target_feature(enable = "sse2")]
2953#[stable(feature = "simd_x86", since = "1.27.0")]
2954pub fn _mm_castps_si128(a: __m128) -> __m128i {
2955    unsafe { transmute(a) }
2956}
2957
2958/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2959/// of `[2 x double]`.
2960///
2961/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2962#[inline]
2963#[target_feature(enable = "sse2")]
2964#[stable(feature = "simd_x86", since = "1.27.0")]
2965pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2966    unsafe { transmute(a) }
2967}
2968
2969/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2970/// of `[4 x float]`.
2971///
2972/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2973#[inline]
2974#[target_feature(enable = "sse2")]
2975#[stable(feature = "simd_x86", since = "1.27.0")]
2976pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2977    unsafe { transmute(a) }
2978}
2979
2980/// Returns vector of type __m128d with indeterminate elements.with indetermination elements.
2981/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2982/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2983/// In practice, this is typically equivalent to [`mem::zeroed`].
2984///
2985/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
2986#[inline]
2987#[target_feature(enable = "sse2")]
2988#[stable(feature = "simd_x86", since = "1.27.0")]
2989pub fn _mm_undefined_pd() -> __m128d {
2990    const { unsafe { mem::zeroed() } }
2991}
2992
2993/// Returns vector of type __m128i with indeterminate elements.with indetermination elements.
2994/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2995/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2996/// In practice, this is typically equivalent to [`mem::zeroed`].
2997///
2998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
2999#[inline]
3000#[target_feature(enable = "sse2")]
3001#[stable(feature = "simd_x86", since = "1.27.0")]
3002pub fn _mm_undefined_si128() -> __m128i {
3003    const { unsafe { mem::zeroed() } }
3004}
3005
3006/// The resulting `__m128d` element is composed by the low-order values of
3007/// the two `__m128d` interleaved input elements, i.e.:
3008///
3009/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3010/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3011///
3012/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
3013#[inline]
3014#[target_feature(enable = "sse2")]
3015#[cfg_attr(test, assert_instr(unpckhpd))]
3016#[stable(feature = "simd_x86", since = "1.27.0")]
3017pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3018    unsafe { simd_shuffle!(a, b, [1, 3]) }
3019}
3020
3021/// The resulting `__m128d` element is composed by the high-order values of
3022/// the two `__m128d` interleaved input elements, i.e.:
3023///
3024/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3025/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3026///
3027/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3028#[inline]
3029#[target_feature(enable = "sse2")]
3030#[cfg_attr(test, assert_instr(movlhps))]
3031#[stable(feature = "simd_x86", since = "1.27.0")]
3032pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3033    unsafe { simd_shuffle!(a, b, [0, 2]) }
3034}
3035
3036#[allow(improper_ctypes)]
3037unsafe extern "C" {
3038    #[link_name = "llvm.x86.sse2.pause"]
3039    fn pause();
3040    #[link_name = "llvm.x86.sse2.clflush"]
3041    fn clflush(p: *const u8);
3042    #[link_name = "llvm.x86.sse2.lfence"]
3043    fn lfence();
3044    #[link_name = "llvm.x86.sse2.mfence"]
3045    fn mfence();
3046    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3047    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3048    #[link_name = "llvm.x86.sse2.psad.bw"]
3049    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3050    #[link_name = "llvm.x86.sse2.psll.w"]
3051    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3052    #[link_name = "llvm.x86.sse2.psll.d"]
3053    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3054    #[link_name = "llvm.x86.sse2.psll.q"]
3055    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3056    #[link_name = "llvm.x86.sse2.psra.w"]
3057    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3058    #[link_name = "llvm.x86.sse2.psra.d"]
3059    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3060    #[link_name = "llvm.x86.sse2.psrl.w"]
3061    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3062    #[link_name = "llvm.x86.sse2.psrl.d"]
3063    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3064    #[link_name = "llvm.x86.sse2.psrl.q"]
3065    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3066    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3067    fn cvtps2dq(a: __m128) -> i32x4;
3068    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3069    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3070    #[link_name = "llvm.x86.sse2.packsswb.128"]
3071    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3072    #[link_name = "llvm.x86.sse2.packssdw.128"]
3073    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3074    #[link_name = "llvm.x86.sse2.packuswb.128"]
3075    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3076    #[link_name = "llvm.x86.sse2.max.sd"]
3077    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3078    #[link_name = "llvm.x86.sse2.max.pd"]
3079    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3080    #[link_name = "llvm.x86.sse2.min.sd"]
3081    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3082    #[link_name = "llvm.x86.sse2.min.pd"]
3083    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3084    #[link_name = "llvm.x86.sse2.cmp.sd"]
3085    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3086    #[link_name = "llvm.x86.sse2.cmp.pd"]
3087    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3088    #[link_name = "llvm.x86.sse2.comieq.sd"]
3089    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3090    #[link_name = "llvm.x86.sse2.comilt.sd"]
3091    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3092    #[link_name = "llvm.x86.sse2.comile.sd"]
3093    fn comilesd(a: __m128d, b: __m128d) -> i32;
3094    #[link_name = "llvm.x86.sse2.comigt.sd"]
3095    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3096    #[link_name = "llvm.x86.sse2.comige.sd"]
3097    fn comigesd(a: __m128d, b: __m128d) -> i32;
3098    #[link_name = "llvm.x86.sse2.comineq.sd"]
3099    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3100    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3101    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3102    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3103    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3104    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3105    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3106    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3107    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3108    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3109    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3110    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3111    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3112    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3113    fn cvtpd2dq(a: __m128d) -> i32x4;
3114    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3115    fn cvtsd2si(a: __m128d) -> i32;
3116    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3117    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3118    #[link_name = "llvm.x86.sse2.cvtss2sd"]
3119    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
3120    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3121    fn cvttpd2dq(a: __m128d) -> i32x4;
3122    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3123    fn cvttsd2si(a: __m128d) -> i32;
3124    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3125    fn cvttps2dq(a: __m128) -> i32x4;
3126}
3127
3128#[cfg(test)]
3129mod tests {
3130    use crate::{
3131        core_arch::{simd::*, x86::*},
3132        hint::black_box,
3133    };
3134    use std::{
3135        boxed, f32, f64,
3136        mem::{self, transmute},
3137        ptr,
3138    };
3139    use stdarch_test::simd_test;
3140
3141    const NAN: f64 = f64::NAN;
3142
3143    #[test]
3144    fn test_mm_pause() {
3145        unsafe { _mm_pause() }
3146    }
3147
3148    #[simd_test(enable = "sse2")]
3149    unsafe fn test_mm_clflush() {
3150        let x = 0_u8;
3151        _mm_clflush(ptr::addr_of!(x));
3152    }
3153
3154    #[simd_test(enable = "sse2")]
3155    // Miri cannot support this until it is clear how it fits in the Rust memory model
3156    #[cfg_attr(miri, ignore)]
3157    unsafe fn test_mm_lfence() {
3158        _mm_lfence();
3159    }
3160
3161    #[simd_test(enable = "sse2")]
3162    // Miri cannot support this until it is clear how it fits in the Rust memory model
3163    #[cfg_attr(miri, ignore)]
3164    unsafe fn test_mm_mfence() {
3165        _mm_mfence();
3166    }
3167
3168    #[simd_test(enable = "sse2")]
3169    unsafe fn test_mm_add_epi8() {
3170        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3171        #[rustfmt::skip]
3172        let b = _mm_setr_epi8(
3173            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3174        );
3175        let r = _mm_add_epi8(a, b);
3176        #[rustfmt::skip]
3177        let e = _mm_setr_epi8(
3178            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3179        );
3180        assert_eq_m128i(r, e);
3181    }
3182
3183    #[simd_test(enable = "sse2")]
3184    unsafe fn test_mm_add_epi8_overflow() {
3185        let a = _mm_set1_epi8(0x7F);
3186        let b = _mm_set1_epi8(1);
3187        let r = _mm_add_epi8(a, b);
3188        assert_eq_m128i(r, _mm_set1_epi8(-128));
3189    }
3190
3191    #[simd_test(enable = "sse2")]
3192    unsafe fn test_mm_add_epi16() {
3193        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3194        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3195        let r = _mm_add_epi16(a, b);
3196        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3197        assert_eq_m128i(r, e);
3198    }
3199
3200    #[simd_test(enable = "sse2")]
3201    unsafe fn test_mm_add_epi32() {
3202        let a = _mm_setr_epi32(0, 1, 2, 3);
3203        let b = _mm_setr_epi32(4, 5, 6, 7);
3204        let r = _mm_add_epi32(a, b);
3205        let e = _mm_setr_epi32(4, 6, 8, 10);
3206        assert_eq_m128i(r, e);
3207    }
3208
3209    #[simd_test(enable = "sse2")]
3210    unsafe fn test_mm_add_epi64() {
3211        let a = _mm_setr_epi64x(0, 1);
3212        let b = _mm_setr_epi64x(2, 3);
3213        let r = _mm_add_epi64(a, b);
3214        let e = _mm_setr_epi64x(2, 4);
3215        assert_eq_m128i(r, e);
3216    }
3217
3218    #[simd_test(enable = "sse2")]
3219    unsafe fn test_mm_adds_epi8() {
3220        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3221        #[rustfmt::skip]
3222        let b = _mm_setr_epi8(
3223            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3224        );
3225        let r = _mm_adds_epi8(a, b);
3226        #[rustfmt::skip]
3227        let e = _mm_setr_epi8(
3228            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3229        );
3230        assert_eq_m128i(r, e);
3231    }
3232
3233    #[simd_test(enable = "sse2")]
3234    unsafe fn test_mm_adds_epi8_saturate_positive() {
3235        let a = _mm_set1_epi8(0x7F);
3236        let b = _mm_set1_epi8(1);
3237        let r = _mm_adds_epi8(a, b);
3238        assert_eq_m128i(r, a);
3239    }
3240
3241    #[simd_test(enable = "sse2")]
3242    unsafe fn test_mm_adds_epi8_saturate_negative() {
3243        let a = _mm_set1_epi8(-0x80);
3244        let b = _mm_set1_epi8(-1);
3245        let r = _mm_adds_epi8(a, b);
3246        assert_eq_m128i(r, a);
3247    }
3248
3249    #[simd_test(enable = "sse2")]
3250    unsafe fn test_mm_adds_epi16() {
3251        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3252        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3253        let r = _mm_adds_epi16(a, b);
3254        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3255        assert_eq_m128i(r, e);
3256    }
3257
3258    #[simd_test(enable = "sse2")]
3259    unsafe fn test_mm_adds_epi16_saturate_positive() {
3260        let a = _mm_set1_epi16(0x7FFF);
3261        let b = _mm_set1_epi16(1);
3262        let r = _mm_adds_epi16(a, b);
3263        assert_eq_m128i(r, a);
3264    }
3265
3266    #[simd_test(enable = "sse2")]
3267    unsafe fn test_mm_adds_epi16_saturate_negative() {
3268        let a = _mm_set1_epi16(-0x8000);
3269        let b = _mm_set1_epi16(-1);
3270        let r = _mm_adds_epi16(a, b);
3271        assert_eq_m128i(r, a);
3272    }
3273
3274    #[simd_test(enable = "sse2")]
3275    unsafe fn test_mm_adds_epu8() {
3276        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3277        #[rustfmt::skip]
3278        let b = _mm_setr_epi8(
3279            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3280        );
3281        let r = _mm_adds_epu8(a, b);
3282        #[rustfmt::skip]
3283        let e = _mm_setr_epi8(
3284            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3285        );
3286        assert_eq_m128i(r, e);
3287    }
3288
3289    #[simd_test(enable = "sse2")]
3290    unsafe fn test_mm_adds_epu8_saturate() {
3291        let a = _mm_set1_epi8(!0);
3292        let b = _mm_set1_epi8(1);
3293        let r = _mm_adds_epu8(a, b);
3294        assert_eq_m128i(r, a);
3295    }
3296
3297    #[simd_test(enable = "sse2")]
3298    unsafe fn test_mm_adds_epu16() {
3299        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3300        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3301        let r = _mm_adds_epu16(a, b);
3302        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3303        assert_eq_m128i(r, e);
3304    }
3305
3306    #[simd_test(enable = "sse2")]
3307    unsafe fn test_mm_adds_epu16_saturate() {
3308        let a = _mm_set1_epi16(!0);
3309        let b = _mm_set1_epi16(1);
3310        let r = _mm_adds_epu16(a, b);
3311        assert_eq_m128i(r, a);
3312    }
3313
3314    #[simd_test(enable = "sse2")]
3315    unsafe fn test_mm_avg_epu8() {
3316        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3317        let r = _mm_avg_epu8(a, b);
3318        assert_eq_m128i(r, _mm_set1_epi8(6));
3319    }
3320
3321    #[simd_test(enable = "sse2")]
3322    unsafe fn test_mm_avg_epu16() {
3323        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3324        let r = _mm_avg_epu16(a, b);
3325        assert_eq_m128i(r, _mm_set1_epi16(6));
3326    }
3327
3328    #[simd_test(enable = "sse2")]
3329    unsafe fn test_mm_madd_epi16() {
3330        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3331        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3332        let r = _mm_madd_epi16(a, b);
3333        let e = _mm_setr_epi32(29, 81, 149, 233);
3334        assert_eq_m128i(r, e);
3335
3336        // Test large values.
3337        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3338        let a = _mm_setr_epi16(
3339            i16::MAX,
3340            i16::MAX,
3341            i16::MIN,
3342            i16::MIN,
3343            i16::MIN,
3344            i16::MAX,
3345            0,
3346            0,
3347        );
3348        let b = _mm_setr_epi16(
3349            i16::MAX,
3350            i16::MAX,
3351            i16::MIN,
3352            i16::MIN,
3353            i16::MAX,
3354            i16::MIN,
3355            0,
3356            0,
3357        );
3358        let r = _mm_madd_epi16(a, b);
3359        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3360        assert_eq_m128i(r, e);
3361    }
3362
3363    #[simd_test(enable = "sse2")]
3364    unsafe fn test_mm_max_epi16() {
3365        let a = _mm_set1_epi16(1);
3366        let b = _mm_set1_epi16(-1);
3367        let r = _mm_max_epi16(a, b);
3368        assert_eq_m128i(r, a);
3369    }
3370
3371    #[simd_test(enable = "sse2")]
3372    unsafe fn test_mm_max_epu8() {
3373        let a = _mm_set1_epi8(1);
3374        let b = _mm_set1_epi8(!0);
3375        let r = _mm_max_epu8(a, b);
3376        assert_eq_m128i(r, b);
3377    }
3378
3379    #[simd_test(enable = "sse2")]
3380    unsafe fn test_mm_min_epi16() {
3381        let a = _mm_set1_epi16(1);
3382        let b = _mm_set1_epi16(-1);
3383        let r = _mm_min_epi16(a, b);
3384        assert_eq_m128i(r, b);
3385    }
3386
3387    #[simd_test(enable = "sse2")]
3388    unsafe fn test_mm_min_epu8() {
3389        let a = _mm_set1_epi8(1);
3390        let b = _mm_set1_epi8(!0);
3391        let r = _mm_min_epu8(a, b);
3392        assert_eq_m128i(r, a);
3393    }
3394
3395    #[simd_test(enable = "sse2")]
3396    unsafe fn test_mm_mulhi_epi16() {
3397        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3398        let r = _mm_mulhi_epi16(a, b);
3399        assert_eq_m128i(r, _mm_set1_epi16(-16));
3400    }
3401
3402    #[simd_test(enable = "sse2")]
3403    unsafe fn test_mm_mulhi_epu16() {
3404        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3405        let r = _mm_mulhi_epu16(a, b);
3406        assert_eq_m128i(r, _mm_set1_epi16(15));
3407    }
3408
3409    #[simd_test(enable = "sse2")]
3410    unsafe fn test_mm_mullo_epi16() {
3411        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3412        let r = _mm_mullo_epi16(a, b);
3413        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3414    }
3415
3416    #[simd_test(enable = "sse2")]
3417    unsafe fn test_mm_mul_epu32() {
3418        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3419        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3420        let r = _mm_mul_epu32(a, b);
3421        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3422        assert_eq_m128i(r, e);
3423    }
3424
3425    #[simd_test(enable = "sse2")]
3426    unsafe fn test_mm_sad_epu8() {
3427        #[rustfmt::skip]
3428        let a = _mm_setr_epi8(
3429            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3430            1, 2, 3, 4,
3431            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3432            1, 2, 3, 4,
3433        );
3434        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3435        let r = _mm_sad_epu8(a, b);
3436        let e = _mm_setr_epi64x(1020, 614);
3437        assert_eq_m128i(r, e);
3438    }
3439
3440    #[simd_test(enable = "sse2")]
3441    unsafe fn test_mm_sub_epi8() {
3442        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3443        let r = _mm_sub_epi8(a, b);
3444        assert_eq_m128i(r, _mm_set1_epi8(-1));
3445    }
3446
3447    #[simd_test(enable = "sse2")]
3448    unsafe fn test_mm_sub_epi16() {
3449        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3450        let r = _mm_sub_epi16(a, b);
3451        assert_eq_m128i(r, _mm_set1_epi16(-1));
3452    }
3453
3454    #[simd_test(enable = "sse2")]
3455    unsafe fn test_mm_sub_epi32() {
3456        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3457        let r = _mm_sub_epi32(a, b);
3458        assert_eq_m128i(r, _mm_set1_epi32(-1));
3459    }
3460
3461    #[simd_test(enable = "sse2")]
3462    unsafe fn test_mm_sub_epi64() {
3463        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3464        let r = _mm_sub_epi64(a, b);
3465        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3466    }
3467
3468    #[simd_test(enable = "sse2")]
3469    unsafe fn test_mm_subs_epi8() {
3470        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3471        let r = _mm_subs_epi8(a, b);
3472        assert_eq_m128i(r, _mm_set1_epi8(3));
3473    }
3474
3475    #[simd_test(enable = "sse2")]
3476    unsafe fn test_mm_subs_epi8_saturate_positive() {
3477        let a = _mm_set1_epi8(0x7F);
3478        let b = _mm_set1_epi8(-1);
3479        let r = _mm_subs_epi8(a, b);
3480        assert_eq_m128i(r, a);
3481    }
3482
3483    #[simd_test(enable = "sse2")]
3484    unsafe fn test_mm_subs_epi8_saturate_negative() {
3485        let a = _mm_set1_epi8(-0x80);
3486        let b = _mm_set1_epi8(1);
3487        let r = _mm_subs_epi8(a, b);
3488        assert_eq_m128i(r, a);
3489    }
3490
3491    #[simd_test(enable = "sse2")]
3492    unsafe fn test_mm_subs_epi16() {
3493        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3494        let r = _mm_subs_epi16(a, b);
3495        assert_eq_m128i(r, _mm_set1_epi16(3));
3496    }
3497
3498    #[simd_test(enable = "sse2")]
3499    unsafe fn test_mm_subs_epi16_saturate_positive() {
3500        let a = _mm_set1_epi16(0x7FFF);
3501        let b = _mm_set1_epi16(-1);
3502        let r = _mm_subs_epi16(a, b);
3503        assert_eq_m128i(r, a);
3504    }
3505
3506    #[simd_test(enable = "sse2")]
3507    unsafe fn test_mm_subs_epi16_saturate_negative() {
3508        let a = _mm_set1_epi16(-0x8000);
3509        let b = _mm_set1_epi16(1);
3510        let r = _mm_subs_epi16(a, b);
3511        assert_eq_m128i(r, a);
3512    }
3513
3514    #[simd_test(enable = "sse2")]
3515    unsafe fn test_mm_subs_epu8() {
3516        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3517        let r = _mm_subs_epu8(a, b);
3518        assert_eq_m128i(r, _mm_set1_epi8(3));
3519    }
3520
3521    #[simd_test(enable = "sse2")]
3522    unsafe fn test_mm_subs_epu8_saturate() {
3523        let a = _mm_set1_epi8(0);
3524        let b = _mm_set1_epi8(1);
3525        let r = _mm_subs_epu8(a, b);
3526        assert_eq_m128i(r, a);
3527    }
3528
3529    #[simd_test(enable = "sse2")]
3530    unsafe fn test_mm_subs_epu16() {
3531        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3532        let r = _mm_subs_epu16(a, b);
3533        assert_eq_m128i(r, _mm_set1_epi16(3));
3534    }
3535
3536    #[simd_test(enable = "sse2")]
3537    unsafe fn test_mm_subs_epu16_saturate() {
3538        let a = _mm_set1_epi16(0);
3539        let b = _mm_set1_epi16(1);
3540        let r = _mm_subs_epu16(a, b);
3541        assert_eq_m128i(r, a);
3542    }
3543
3544    #[simd_test(enable = "sse2")]
3545    unsafe fn test_mm_slli_si128() {
3546        #[rustfmt::skip]
3547        let a = _mm_setr_epi8(
3548            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3549        );
3550        let r = _mm_slli_si128::<1>(a);
3551        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3552        assert_eq_m128i(r, e);
3553
3554        #[rustfmt::skip]
3555        let a = _mm_setr_epi8(
3556            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3557        );
3558        let r = _mm_slli_si128::<15>(a);
3559        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3560        assert_eq_m128i(r, e);
3561
3562        #[rustfmt::skip]
3563        let a = _mm_setr_epi8(
3564            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3565        );
3566        let r = _mm_slli_si128::<16>(a);
3567        assert_eq_m128i(r, _mm_set1_epi8(0));
3568    }
3569
3570    #[simd_test(enable = "sse2")]
3571    unsafe fn test_mm_slli_epi16() {
3572        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3573        let r = _mm_slli_epi16::<4>(a);
3574        assert_eq_m128i(
3575            r,
3576            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3577        );
3578        let r = _mm_slli_epi16::<16>(a);
3579        assert_eq_m128i(r, _mm_set1_epi16(0));
3580    }
3581
3582    #[simd_test(enable = "sse2")]
3583    unsafe fn test_mm_sll_epi16() {
3584        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3585        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3586        assert_eq_m128i(
3587            r,
3588            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3589        );
3590        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3591        assert_eq_m128i(r, a);
3592        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3593        assert_eq_m128i(r, _mm_set1_epi16(0));
3594        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3595        assert_eq_m128i(r, _mm_set1_epi16(0));
3596    }
3597
3598    #[simd_test(enable = "sse2")]
3599    unsafe fn test_mm_slli_epi32() {
3600        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3601        let r = _mm_slli_epi32::<4>(a);
3602        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3603        let r = _mm_slli_epi32::<32>(a);
3604        assert_eq_m128i(r, _mm_set1_epi32(0));
3605    }
3606
3607    #[simd_test(enable = "sse2")]
3608    unsafe fn test_mm_sll_epi32() {
3609        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3610        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3611        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3612        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3613        assert_eq_m128i(r, a);
3614        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3615        assert_eq_m128i(r, _mm_set1_epi32(0));
3616        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3617        assert_eq_m128i(r, _mm_set1_epi32(0));
3618    }
3619
3620    #[simd_test(enable = "sse2")]
3621    unsafe fn test_mm_slli_epi64() {
3622        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3623        let r = _mm_slli_epi64::<4>(a);
3624        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3625        let r = _mm_slli_epi64::<64>(a);
3626        assert_eq_m128i(r, _mm_set1_epi64x(0));
3627    }
3628
3629    #[simd_test(enable = "sse2")]
3630    unsafe fn test_mm_sll_epi64() {
3631        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3632        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3633        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3634        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3635        assert_eq_m128i(r, a);
3636        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3637        assert_eq_m128i(r, _mm_set1_epi64x(0));
3638        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3639        assert_eq_m128i(r, _mm_set1_epi64x(0));
3640    }
3641
3642    #[simd_test(enable = "sse2")]
3643    unsafe fn test_mm_srai_epi16() {
3644        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3645        let r = _mm_srai_epi16::<4>(a);
3646        assert_eq_m128i(
3647            r,
3648            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3649        );
3650        let r = _mm_srai_epi16::<16>(a);
3651        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3652    }
3653
3654    #[simd_test(enable = "sse2")]
3655    unsafe fn test_mm_sra_epi16() {
3656        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3657        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3658        assert_eq_m128i(
3659            r,
3660            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3661        );
3662        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3663        assert_eq_m128i(r, a);
3664        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3665        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3666        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3667        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3668    }
3669
3670    #[simd_test(enable = "sse2")]
3671    unsafe fn test_mm_srai_epi32() {
3672        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3673        let r = _mm_srai_epi32::<4>(a);
3674        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3675        let r = _mm_srai_epi32::<32>(a);
3676        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3677    }
3678
3679    #[simd_test(enable = "sse2")]
3680    unsafe fn test_mm_sra_epi32() {
3681        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3682        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3683        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3684        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3685        assert_eq_m128i(r, a);
3686        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3687        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3688        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3689        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3690    }
3691
3692    #[simd_test(enable = "sse2")]
3693    unsafe fn test_mm_srli_si128() {
3694        #[rustfmt::skip]
3695        let a = _mm_setr_epi8(
3696            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3697        );
3698        let r = _mm_srli_si128::<1>(a);
3699        #[rustfmt::skip]
3700        let e = _mm_setr_epi8(
3701            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3702        );
3703        assert_eq_m128i(r, e);
3704
3705        #[rustfmt::skip]
3706        let a = _mm_setr_epi8(
3707            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3708        );
3709        let r = _mm_srli_si128::<15>(a);
3710        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3711        assert_eq_m128i(r, e);
3712
3713        #[rustfmt::skip]
3714        let a = _mm_setr_epi8(
3715            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3716        );
3717        let r = _mm_srli_si128::<16>(a);
3718        assert_eq_m128i(r, _mm_set1_epi8(0));
3719    }
3720
3721    #[simd_test(enable = "sse2")]
3722    unsafe fn test_mm_srli_epi16() {
3723        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3724        let r = _mm_srli_epi16::<4>(a);
3725        assert_eq_m128i(
3726            r,
3727            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3728        );
3729        let r = _mm_srli_epi16::<16>(a);
3730        assert_eq_m128i(r, _mm_set1_epi16(0));
3731    }
3732
3733    #[simd_test(enable = "sse2")]
3734    unsafe fn test_mm_srl_epi16() {
3735        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3736        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3737        assert_eq_m128i(
3738            r,
3739            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3740        );
3741        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3742        assert_eq_m128i(r, a);
3743        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3744        assert_eq_m128i(r, _mm_set1_epi16(0));
3745        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3746        assert_eq_m128i(r, _mm_set1_epi16(0));
3747    }
3748
3749    #[simd_test(enable = "sse2")]
3750    unsafe fn test_mm_srli_epi32() {
3751        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3752        let r = _mm_srli_epi32::<4>(a);
3753        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3754        let r = _mm_srli_epi32::<32>(a);
3755        assert_eq_m128i(r, _mm_set1_epi32(0));
3756    }
3757
3758    #[simd_test(enable = "sse2")]
3759    unsafe fn test_mm_srl_epi32() {
3760        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3761        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3762        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3763        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3764        assert_eq_m128i(r, a);
3765        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3766        assert_eq_m128i(r, _mm_set1_epi32(0));
3767        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3768        assert_eq_m128i(r, _mm_set1_epi32(0));
3769    }
3770
3771    #[simd_test(enable = "sse2")]
3772    unsafe fn test_mm_srli_epi64() {
3773        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3774        let r = _mm_srli_epi64::<4>(a);
3775        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3776        let r = _mm_srli_epi64::<64>(a);
3777        assert_eq_m128i(r, _mm_set1_epi64x(0));
3778    }
3779
3780    #[simd_test(enable = "sse2")]
3781    unsafe fn test_mm_srl_epi64() {
3782        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3783        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3784        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3785        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3786        assert_eq_m128i(r, a);
3787        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3788        assert_eq_m128i(r, _mm_set1_epi64x(0));
3789        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3790        assert_eq_m128i(r, _mm_set1_epi64x(0));
3791    }
3792
3793    #[simd_test(enable = "sse2")]
3794    unsafe fn test_mm_and_si128() {
3795        let a = _mm_set1_epi8(5);
3796        let b = _mm_set1_epi8(3);
3797        let r = _mm_and_si128(a, b);
3798        assert_eq_m128i(r, _mm_set1_epi8(1));
3799    }
3800
3801    #[simd_test(enable = "sse2")]
3802    unsafe fn test_mm_andnot_si128() {
3803        let a = _mm_set1_epi8(5);
3804        let b = _mm_set1_epi8(3);
3805        let r = _mm_andnot_si128(a, b);
3806        assert_eq_m128i(r, _mm_set1_epi8(2));
3807    }
3808
3809    #[simd_test(enable = "sse2")]
3810    unsafe fn test_mm_or_si128() {
3811        let a = _mm_set1_epi8(5);
3812        let b = _mm_set1_epi8(3);
3813        let r = _mm_or_si128(a, b);
3814        assert_eq_m128i(r, _mm_set1_epi8(7));
3815    }
3816
3817    #[simd_test(enable = "sse2")]
3818    unsafe fn test_mm_xor_si128() {
3819        let a = _mm_set1_epi8(5);
3820        let b = _mm_set1_epi8(3);
3821        let r = _mm_xor_si128(a, b);
3822        assert_eq_m128i(r, _mm_set1_epi8(6));
3823    }
3824
3825    #[simd_test(enable = "sse2")]
3826    unsafe fn test_mm_cmpeq_epi8() {
3827        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3828        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3829        let r = _mm_cmpeq_epi8(a, b);
3830        #[rustfmt::skip]
3831        assert_eq_m128i(
3832            r,
3833            _mm_setr_epi8(
3834                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3835            )
3836        );
3837    }
3838
3839    #[simd_test(enable = "sse2")]
3840    unsafe fn test_mm_cmpeq_epi16() {
3841        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3842        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3843        let r = _mm_cmpeq_epi16(a, b);
3844        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3845    }
3846
3847    #[simd_test(enable = "sse2")]
3848    unsafe fn test_mm_cmpeq_epi32() {
3849        let a = _mm_setr_epi32(0, 1, 2, 3);
3850        let b = _mm_setr_epi32(3, 2, 2, 0);
3851        let r = _mm_cmpeq_epi32(a, b);
3852        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3853    }
3854
3855    #[simd_test(enable = "sse2")]
3856    unsafe fn test_mm_cmpgt_epi8() {
3857        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3858        let b = _mm_set1_epi8(0);
3859        let r = _mm_cmpgt_epi8(a, b);
3860        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3861        assert_eq_m128i(r, e);
3862    }
3863
3864    #[simd_test(enable = "sse2")]
3865    unsafe fn test_mm_cmpgt_epi16() {
3866        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3867        let b = _mm_set1_epi16(0);
3868        let r = _mm_cmpgt_epi16(a, b);
3869        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3870        assert_eq_m128i(r, e);
3871    }
3872
3873    #[simd_test(enable = "sse2")]
3874    unsafe fn test_mm_cmpgt_epi32() {
3875        let a = _mm_set_epi32(5, 0, 0, 0);
3876        let b = _mm_set1_epi32(0);
3877        let r = _mm_cmpgt_epi32(a, b);
3878        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3879    }
3880
3881    #[simd_test(enable = "sse2")]
3882    unsafe fn test_mm_cmplt_epi8() {
3883        let a = _mm_set1_epi8(0);
3884        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3885        let r = _mm_cmplt_epi8(a, b);
3886        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3887        assert_eq_m128i(r, e);
3888    }
3889
3890    #[simd_test(enable = "sse2")]
3891    unsafe fn test_mm_cmplt_epi16() {
3892        let a = _mm_set1_epi16(0);
3893        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3894        let r = _mm_cmplt_epi16(a, b);
3895        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3896        assert_eq_m128i(r, e);
3897    }
3898
3899    #[simd_test(enable = "sse2")]
3900    unsafe fn test_mm_cmplt_epi32() {
3901        let a = _mm_set1_epi32(0);
3902        let b = _mm_set_epi32(5, 0, 0, 0);
3903        let r = _mm_cmplt_epi32(a, b);
3904        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3905    }
3906
3907    #[simd_test(enable = "sse2")]
3908    unsafe fn test_mm_cvtepi32_pd() {
3909        let a = _mm_set_epi32(35, 25, 15, 5);
3910        let r = _mm_cvtepi32_pd(a);
3911        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3912    }
3913
3914    #[simd_test(enable = "sse2")]
3915    unsafe fn test_mm_cvtsi32_sd() {
3916        let a = _mm_set1_pd(3.5);
3917        let r = _mm_cvtsi32_sd(a, 5);
3918        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3919    }
3920
3921    #[simd_test(enable = "sse2")]
3922    unsafe fn test_mm_cvtepi32_ps() {
3923        let a = _mm_setr_epi32(1, 2, 3, 4);
3924        let r = _mm_cvtepi32_ps(a);
3925        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3926    }
3927
3928    #[simd_test(enable = "sse2")]
3929    unsafe fn test_mm_cvtps_epi32() {
3930        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3931        let r = _mm_cvtps_epi32(a);
3932        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3933    }
3934
3935    #[simd_test(enable = "sse2")]
3936    unsafe fn test_mm_cvtsi32_si128() {
3937        let r = _mm_cvtsi32_si128(5);
3938        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3939    }
3940
3941    #[simd_test(enable = "sse2")]
3942    unsafe fn test_mm_cvtsi128_si32() {
3943        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3944        assert_eq!(r, 5);
3945    }
3946
3947    #[simd_test(enable = "sse2")]
3948    unsafe fn test_mm_set_epi64x() {
3949        let r = _mm_set_epi64x(0, 1);
3950        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3951    }
3952
3953    #[simd_test(enable = "sse2")]
3954    unsafe fn test_mm_set_epi32() {
3955        let r = _mm_set_epi32(0, 1, 2, 3);
3956        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3957    }
3958
3959    #[simd_test(enable = "sse2")]
3960    unsafe fn test_mm_set_epi16() {
3961        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3962        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3963    }
3964
3965    #[simd_test(enable = "sse2")]
3966    unsafe fn test_mm_set_epi8() {
3967        #[rustfmt::skip]
3968        let r = _mm_set_epi8(
3969            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3970        );
3971        #[rustfmt::skip]
3972        let e = _mm_setr_epi8(
3973            15, 14, 13, 12, 11, 10, 9, 8,
3974            7, 6, 5, 4, 3, 2, 1, 0,
3975        );
3976        assert_eq_m128i(r, e);
3977    }
3978
3979    #[simd_test(enable = "sse2")]
3980    unsafe fn test_mm_set1_epi64x() {
3981        let r = _mm_set1_epi64x(1);
3982        assert_eq_m128i(r, _mm_set1_epi64x(1));
3983    }
3984
3985    #[simd_test(enable = "sse2")]
3986    unsafe fn test_mm_set1_epi32() {
3987        let r = _mm_set1_epi32(1);
3988        assert_eq_m128i(r, _mm_set1_epi32(1));
3989    }
3990
3991    #[simd_test(enable = "sse2")]
3992    unsafe fn test_mm_set1_epi16() {
3993        let r = _mm_set1_epi16(1);
3994        assert_eq_m128i(r, _mm_set1_epi16(1));
3995    }
3996
3997    #[simd_test(enable = "sse2")]
3998    unsafe fn test_mm_set1_epi8() {
3999        let r = _mm_set1_epi8(1);
4000        assert_eq_m128i(r, _mm_set1_epi8(1));
4001    }
4002
4003    #[simd_test(enable = "sse2")]
4004    unsafe fn test_mm_setr_epi32() {
4005        let r = _mm_setr_epi32(0, 1, 2, 3);
4006        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4007    }
4008
4009    #[simd_test(enable = "sse2")]
4010    unsafe fn test_mm_setr_epi16() {
4011        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4012        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4013    }
4014
4015    #[simd_test(enable = "sse2")]
4016    unsafe fn test_mm_setr_epi8() {
4017        #[rustfmt::skip]
4018        let r = _mm_setr_epi8(
4019            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4020        );
4021        #[rustfmt::skip]
4022        let e = _mm_setr_epi8(
4023            0, 1, 2, 3, 4, 5, 6, 7,
4024            8, 9, 10, 11, 12, 13, 14, 15,
4025        );
4026        assert_eq_m128i(r, e);
4027    }
4028
4029    #[simd_test(enable = "sse2")]
4030    unsafe fn test_mm_setzero_si128() {
4031        let r = _mm_setzero_si128();
4032        assert_eq_m128i(r, _mm_set1_epi64x(0));
4033    }
4034
4035    #[simd_test(enable = "sse2")]
4036    unsafe fn test_mm_loadl_epi64() {
4037        let a = _mm_setr_epi64x(6, 5);
4038        let r = _mm_loadl_epi64(ptr::addr_of!(a));
4039        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4040    }
4041
4042    #[simd_test(enable = "sse2")]
4043    unsafe fn test_mm_load_si128() {
4044        let a = _mm_set_epi64x(5, 6);
4045        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
4046        assert_eq_m128i(a, r);
4047    }
4048
4049    #[simd_test(enable = "sse2")]
4050    unsafe fn test_mm_loadu_si128() {
4051        let a = _mm_set_epi64x(5, 6);
4052        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
4053        assert_eq_m128i(a, r);
4054    }
4055
4056    #[simd_test(enable = "sse2")]
4057    // Miri cannot support this until it is clear how it fits in the Rust memory model
4058    // (non-temporal store)
4059    #[cfg_attr(miri, ignore)]
4060    unsafe fn test_mm_maskmoveu_si128() {
4061        let a = _mm_set1_epi8(9);
4062        #[rustfmt::skip]
4063        let mask = _mm_set_epi8(
4064            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4065            0, 0, 0, 0, 0, 0, 0, 0,
4066        );
4067        let mut r = _mm_set1_epi8(0);
4068        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4069        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4070        assert_eq_m128i(r, e);
4071    }
4072
4073    #[simd_test(enable = "sse2")]
4074    unsafe fn test_mm_store_si128() {
4075        let a = _mm_set1_epi8(9);
4076        let mut r = _mm_set1_epi8(0);
4077        _mm_store_si128(&mut r, a);
4078        assert_eq_m128i(r, a);
4079    }
4080
4081    #[simd_test(enable = "sse2")]
4082    unsafe fn test_mm_storeu_si128() {
4083        let a = _mm_set1_epi8(9);
4084        let mut r = _mm_set1_epi8(0);
4085        _mm_storeu_si128(&mut r, a);
4086        assert_eq_m128i(r, a);
4087    }
4088
4089    #[simd_test(enable = "sse2")]
4090    unsafe fn test_mm_storel_epi64() {
4091        let a = _mm_setr_epi64x(2, 9);
4092        let mut r = _mm_set1_epi8(0);
4093        _mm_storel_epi64(&mut r, a);
4094        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4095    }
4096
4097    #[simd_test(enable = "sse2")]
4098    // Miri cannot support this until it is clear how it fits in the Rust memory model
4099    // (non-temporal store)
4100    #[cfg_attr(miri, ignore)]
4101    unsafe fn test_mm_stream_si128() {
4102        let a = _mm_setr_epi32(1, 2, 3, 4);
4103        let mut r = _mm_undefined_si128();
4104        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4105        assert_eq_m128i(r, a);
4106    }
4107
4108    #[simd_test(enable = "sse2")]
4109    // Miri cannot support this until it is clear how it fits in the Rust memory model
4110    // (non-temporal store)
4111    #[cfg_attr(miri, ignore)]
4112    unsafe fn test_mm_stream_si32() {
4113        let a: i32 = 7;
4114        let mut mem = boxed::Box::<i32>::new(-1);
4115        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4116        assert_eq!(a, *mem);
4117    }
4118
4119    #[simd_test(enable = "sse2")]
4120    unsafe fn test_mm_move_epi64() {
4121        let a = _mm_setr_epi64x(5, 6);
4122        let r = _mm_move_epi64(a);
4123        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4124    }
4125
4126    #[simd_test(enable = "sse2")]
4127    unsafe fn test_mm_packs_epi16() {
4128        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4129        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4130        let r = _mm_packs_epi16(a, b);
4131        #[rustfmt::skip]
4132        assert_eq_m128i(
4133            r,
4134            _mm_setr_epi8(
4135                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4136            )
4137        );
4138    }
4139
4140    #[simd_test(enable = "sse2")]
4141    unsafe fn test_mm_packs_epi32() {
4142        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4143        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4144        let r = _mm_packs_epi32(a, b);
4145        assert_eq_m128i(
4146            r,
4147            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4148        );
4149    }
4150
4151    #[simd_test(enable = "sse2")]
4152    unsafe fn test_mm_packus_epi16() {
4153        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4154        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4155        let r = _mm_packus_epi16(a, b);
4156        assert_eq_m128i(
4157            r,
4158            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4159        );
4160    }
4161
4162    #[simd_test(enable = "sse2")]
4163    unsafe fn test_mm_extract_epi16() {
4164        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4165        let r1 = _mm_extract_epi16::<0>(a);
4166        let r2 = _mm_extract_epi16::<3>(a);
4167        assert_eq!(r1, 0xFFFF);
4168        assert_eq!(r2, 3);
4169    }
4170
4171    #[simd_test(enable = "sse2")]
4172    unsafe fn test_mm_insert_epi16() {
4173        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4174        let r = _mm_insert_epi16::<0>(a, 9);
4175        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4176        assert_eq_m128i(r, e);
4177    }
4178
4179    #[simd_test(enable = "sse2")]
4180    unsafe fn test_mm_movemask_epi8() {
4181        #[rustfmt::skip]
4182        let a = _mm_setr_epi8(
4183            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4184            0b0101, 0b1111_0000u8 as i8, 0, 0,
4185            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4186            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4187        );
4188        let r = _mm_movemask_epi8(a);
4189        assert_eq!(r, 0b10100110_00100101);
4190    }
4191
4192    #[simd_test(enable = "sse2")]
4193    unsafe fn test_mm_shuffle_epi32() {
4194        let a = _mm_setr_epi32(5, 10, 15, 20);
4195        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
4196        let e = _mm_setr_epi32(20, 10, 10, 5);
4197        assert_eq_m128i(r, e);
4198    }
4199
4200    #[simd_test(enable = "sse2")]
4201    unsafe fn test_mm_shufflehi_epi16() {
4202        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
4203        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
4204        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
4205        assert_eq_m128i(r, e);
4206    }
4207
4208    #[simd_test(enable = "sse2")]
4209    unsafe fn test_mm_shufflelo_epi16() {
4210        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
4211        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
4212        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
4213        assert_eq_m128i(r, e);
4214    }
4215
4216    #[simd_test(enable = "sse2")]
4217    unsafe fn test_mm_unpackhi_epi8() {
4218        #[rustfmt::skip]
4219        let a = _mm_setr_epi8(
4220            0, 1, 2, 3, 4, 5, 6, 7,
4221            8, 9, 10, 11, 12, 13, 14, 15,
4222        );
4223        #[rustfmt::skip]
4224        let b = _mm_setr_epi8(
4225            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4226        );
4227        let r = _mm_unpackhi_epi8(a, b);
4228        #[rustfmt::skip]
4229        let e = _mm_setr_epi8(
4230            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
4231        );
4232        assert_eq_m128i(r, e);
4233    }
4234
4235    #[simd_test(enable = "sse2")]
4236    unsafe fn test_mm_unpackhi_epi16() {
4237        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4238        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4239        let r = _mm_unpackhi_epi16(a, b);
4240        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
4241        assert_eq_m128i(r, e);
4242    }
4243
4244    #[simd_test(enable = "sse2")]
4245    unsafe fn test_mm_unpackhi_epi32() {
4246        let a = _mm_setr_epi32(0, 1, 2, 3);
4247        let b = _mm_setr_epi32(4, 5, 6, 7);
4248        let r = _mm_unpackhi_epi32(a, b);
4249        let e = _mm_setr_epi32(2, 6, 3, 7);
4250        assert_eq_m128i(r, e);
4251    }
4252
4253    #[simd_test(enable = "sse2")]
4254    unsafe fn test_mm_unpackhi_epi64() {
4255        let a = _mm_setr_epi64x(0, 1);
4256        let b = _mm_setr_epi64x(2, 3);
4257        let r = _mm_unpackhi_epi64(a, b);
4258        let e = _mm_setr_epi64x(1, 3);
4259        assert_eq_m128i(r, e);
4260    }
4261
4262    #[simd_test(enable = "sse2")]
4263    unsafe fn test_mm_unpacklo_epi8() {
4264        #[rustfmt::skip]
4265        let a = _mm_setr_epi8(
4266            0, 1, 2, 3, 4, 5, 6, 7,
4267            8, 9, 10, 11, 12, 13, 14, 15,
4268        );
4269        #[rustfmt::skip]
4270        let b = _mm_setr_epi8(
4271            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4272        );
4273        let r = _mm_unpacklo_epi8(a, b);
4274        #[rustfmt::skip]
4275        let e = _mm_setr_epi8(
4276            0, 16, 1, 17, 2, 18, 3, 19,
4277            4, 20, 5, 21, 6, 22, 7, 23,
4278        );
4279        assert_eq_m128i(r, e);
4280    }
4281
4282    #[simd_test(enable = "sse2")]
4283    unsafe fn test_mm_unpacklo_epi16() {
4284        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4285        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4286        let r = _mm_unpacklo_epi16(a, b);
4287        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
4288        assert_eq_m128i(r, e);
4289    }
4290
4291    #[simd_test(enable = "sse2")]
4292    unsafe fn test_mm_unpacklo_epi32() {
4293        let a = _mm_setr_epi32(0, 1, 2, 3);
4294        let b = _mm_setr_epi32(4, 5, 6, 7);
4295        let r = _mm_unpacklo_epi32(a, b);
4296        let e = _mm_setr_epi32(0, 4, 1, 5);
4297        assert_eq_m128i(r, e);
4298    }
4299
4300    #[simd_test(enable = "sse2")]
4301    unsafe fn test_mm_unpacklo_epi64() {
4302        let a = _mm_setr_epi64x(0, 1);
4303        let b = _mm_setr_epi64x(2, 3);
4304        let r = _mm_unpacklo_epi64(a, b);
4305        let e = _mm_setr_epi64x(0, 2);
4306        assert_eq_m128i(r, e);
4307    }
4308
4309    #[simd_test(enable = "sse2")]
4310    unsafe fn test_mm_add_sd() {
4311        let a = _mm_setr_pd(1.0, 2.0);
4312        let b = _mm_setr_pd(5.0, 10.0);
4313        let r = _mm_add_sd(a, b);
4314        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4315    }
4316
4317    #[simd_test(enable = "sse2")]
4318    unsafe fn test_mm_add_pd() {
4319        let a = _mm_setr_pd(1.0, 2.0);
4320        let b = _mm_setr_pd(5.0, 10.0);
4321        let r = _mm_add_pd(a, b);
4322        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4323    }
4324
4325    #[simd_test(enable = "sse2")]
4326    unsafe fn test_mm_div_sd() {
4327        let a = _mm_setr_pd(1.0, 2.0);
4328        let b = _mm_setr_pd(5.0, 10.0);
4329        let r = _mm_div_sd(a, b);
4330        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4331    }
4332
4333    #[simd_test(enable = "sse2")]
4334    unsafe fn test_mm_div_pd() {
4335        let a = _mm_setr_pd(1.0, 2.0);
4336        let b = _mm_setr_pd(5.0, 10.0);
4337        let r = _mm_div_pd(a, b);
4338        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4339    }
4340
4341    #[simd_test(enable = "sse2")]
4342    unsafe fn test_mm_max_sd() {
4343        let a = _mm_setr_pd(1.0, 2.0);
4344        let b = _mm_setr_pd(5.0, 10.0);
4345        let r = _mm_max_sd(a, b);
4346        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4347    }
4348
4349    #[simd_test(enable = "sse2")]
4350    unsafe fn test_mm_max_pd() {
4351        let a = _mm_setr_pd(1.0, 2.0);
4352        let b = _mm_setr_pd(5.0, 10.0);
4353        let r = _mm_max_pd(a, b);
4354        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
4355
4356        // Check SSE(2)-specific semantics for -0.0 handling.
4357        let a = _mm_setr_pd(-0.0, 0.0);
4358        let b = _mm_setr_pd(0.0, 0.0);
4359        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
4360        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
4361        let a: [u8; 16] = transmute(a);
4362        let b: [u8; 16] = transmute(b);
4363        assert_eq!(r1, b);
4364        assert_eq!(r2, a);
4365        assert_ne!(a, b); // sanity check that -0.0 is actually present
4366    }
4367
4368    #[simd_test(enable = "sse2")]
4369    unsafe fn test_mm_min_sd() {
4370        let a = _mm_setr_pd(1.0, 2.0);
4371        let b = _mm_setr_pd(5.0, 10.0);
4372        let r = _mm_min_sd(a, b);
4373        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4374    }
4375
4376    #[simd_test(enable = "sse2")]
4377    unsafe fn test_mm_min_pd() {
4378        let a = _mm_setr_pd(1.0, 2.0);
4379        let b = _mm_setr_pd(5.0, 10.0);
4380        let r = _mm_min_pd(a, b);
4381        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4382
4383        // Check SSE(2)-specific semantics for -0.0 handling.
4384        let a = _mm_setr_pd(-0.0, 0.0);
4385        let b = _mm_setr_pd(0.0, 0.0);
4386        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
4387        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
4388        let a: [u8; 16] = transmute(a);
4389        let b: [u8; 16] = transmute(b);
4390        assert_eq!(r1, b);
4391        assert_eq!(r2, a);
4392        assert_ne!(a, b); // sanity check that -0.0 is actually present
4393    }
4394
4395    #[simd_test(enable = "sse2")]
4396    unsafe fn test_mm_mul_sd() {
4397        let a = _mm_setr_pd(1.0, 2.0);
4398        let b = _mm_setr_pd(5.0, 10.0);
4399        let r = _mm_mul_sd(a, b);
4400        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4401    }
4402
4403    #[simd_test(enable = "sse2")]
4404    unsafe fn test_mm_mul_pd() {
4405        let a = _mm_setr_pd(1.0, 2.0);
4406        let b = _mm_setr_pd(5.0, 10.0);
4407        let r = _mm_mul_pd(a, b);
4408        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
4409    }
4410
4411    #[simd_test(enable = "sse2")]
4412    unsafe fn test_mm_sqrt_sd() {
4413        let a = _mm_setr_pd(1.0, 2.0);
4414        let b = _mm_setr_pd(5.0, 10.0);
4415        let r = _mm_sqrt_sd(a, b);
4416        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
4417    }
4418
4419    #[simd_test(enable = "sse2")]
4420    unsafe fn test_mm_sqrt_pd() {
4421        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
4422        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
4423    }
4424
4425    #[simd_test(enable = "sse2")]
4426    unsafe fn test_mm_sub_sd() {
4427        let a = _mm_setr_pd(1.0, 2.0);
4428        let b = _mm_setr_pd(5.0, 10.0);
4429        let r = _mm_sub_sd(a, b);
4430        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
4431    }
4432
4433    #[simd_test(enable = "sse2")]
4434    unsafe fn test_mm_sub_pd() {
4435        let a = _mm_setr_pd(1.0, 2.0);
4436        let b = _mm_setr_pd(5.0, 10.0);
4437        let r = _mm_sub_pd(a, b);
4438        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
4439    }
4440
4441    #[simd_test(enable = "sse2")]
4442    unsafe fn test_mm_and_pd() {
4443        let a = transmute(u64x2::splat(5));
4444        let b = transmute(u64x2::splat(3));
4445        let r = _mm_and_pd(a, b);
4446        let e = transmute(u64x2::splat(1));
4447        assert_eq_m128d(r, e);
4448    }
4449
4450    #[simd_test(enable = "sse2")]
4451    unsafe fn test_mm_andnot_pd() {
4452        let a = transmute(u64x2::splat(5));
4453        let b = transmute(u64x2::splat(3));
4454        let r = _mm_andnot_pd(a, b);
4455        let e = transmute(u64x2::splat(2));
4456        assert_eq_m128d(r, e);
4457    }
4458
4459    #[simd_test(enable = "sse2")]
4460    unsafe fn test_mm_or_pd() {
4461        let a = transmute(u64x2::splat(5));
4462        let b = transmute(u64x2::splat(3));
4463        let r = _mm_or_pd(a, b);
4464        let e = transmute(u64x2::splat(7));
4465        assert_eq_m128d(r, e);
4466    }
4467
4468    #[simd_test(enable = "sse2")]
4469    unsafe fn test_mm_xor_pd() {
4470        let a = transmute(u64x2::splat(5));
4471        let b = transmute(u64x2::splat(3));
4472        let r = _mm_xor_pd(a, b);
4473        let e = transmute(u64x2::splat(6));
4474        assert_eq_m128d(r, e);
4475    }
4476
4477    #[simd_test(enable = "sse2")]
4478    unsafe fn test_mm_cmpeq_sd() {
4479        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4480        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4481        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
4482        assert_eq_m128i(r, e);
4483    }
4484
4485    #[simd_test(enable = "sse2")]
4486    unsafe fn test_mm_cmplt_sd() {
4487        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4488        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4489        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
4490        assert_eq_m128i(r, e);
4491    }
4492
4493    #[simd_test(enable = "sse2")]
4494    unsafe fn test_mm_cmple_sd() {
4495        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4496        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4497        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
4498        assert_eq_m128i(r, e);
4499    }
4500
4501    #[simd_test(enable = "sse2")]
4502    unsafe fn test_mm_cmpgt_sd() {
4503        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4504        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4505        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
4506        assert_eq_m128i(r, e);
4507    }
4508
4509    #[simd_test(enable = "sse2")]
4510    unsafe fn test_mm_cmpge_sd() {
4511        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4512        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4513        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
4514        assert_eq_m128i(r, e);
4515    }
4516
4517    #[simd_test(enable = "sse2")]
4518    unsafe fn test_mm_cmpord_sd() {
4519        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4520        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4521        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
4522        assert_eq_m128i(r, e);
4523    }
4524
4525    #[simd_test(enable = "sse2")]
4526    unsafe fn test_mm_cmpunord_sd() {
4527        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4528        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4529        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
4530        assert_eq_m128i(r, e);
4531    }
4532
4533    #[simd_test(enable = "sse2")]
4534    unsafe fn test_mm_cmpneq_sd() {
4535        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4536        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4537        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
4538        assert_eq_m128i(r, e);
4539    }
4540
4541    #[simd_test(enable = "sse2")]
4542    unsafe fn test_mm_cmpnlt_sd() {
4543        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4544        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4545        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
4546        assert_eq_m128i(r, e);
4547    }
4548
4549    #[simd_test(enable = "sse2")]
4550    unsafe fn test_mm_cmpnle_sd() {
4551        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4552        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4553        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
4554        assert_eq_m128i(r, e);
4555    }
4556
4557    #[simd_test(enable = "sse2")]
4558    unsafe fn test_mm_cmpngt_sd() {
4559        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4560        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4561        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
4562        assert_eq_m128i(r, e);
4563    }
4564
4565    #[simd_test(enable = "sse2")]
4566    unsafe fn test_mm_cmpnge_sd() {
4567        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4568        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4569        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
4570        assert_eq_m128i(r, e);
4571    }
4572
4573    #[simd_test(enable = "sse2")]
4574    unsafe fn test_mm_cmpeq_pd() {
4575        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4576        let e = _mm_setr_epi64x(!0, 0);
4577        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
4578        assert_eq_m128i(r, e);
4579    }
4580
4581    #[simd_test(enable = "sse2")]
4582    unsafe fn test_mm_cmplt_pd() {
4583        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4584        let e = _mm_setr_epi64x(0, !0);
4585        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
4586        assert_eq_m128i(r, e);
4587    }
4588
4589    #[simd_test(enable = "sse2")]
4590    unsafe fn test_mm_cmple_pd() {
4591        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4592        let e = _mm_setr_epi64x(!0, !0);
4593        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
4594        assert_eq_m128i(r, e);
4595    }
4596
4597    #[simd_test(enable = "sse2")]
4598    unsafe fn test_mm_cmpgt_pd() {
4599        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4600        let e = _mm_setr_epi64x(0, 0);
4601        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
4602        assert_eq_m128i(r, e);
4603    }
4604
4605    #[simd_test(enable = "sse2")]
4606    unsafe fn test_mm_cmpge_pd() {
4607        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4608        let e = _mm_setr_epi64x(!0, 0);
4609        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
4610        assert_eq_m128i(r, e);
4611    }
4612
4613    #[simd_test(enable = "sse2")]
4614    unsafe fn test_mm_cmpord_pd() {
4615        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4616        let e = _mm_setr_epi64x(0, !0);
4617        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
4618        assert_eq_m128i(r, e);
4619    }
4620
4621    #[simd_test(enable = "sse2")]
4622    unsafe fn test_mm_cmpunord_pd() {
4623        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4624        let e = _mm_setr_epi64x(!0, 0);
4625        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
4626        assert_eq_m128i(r, e);
4627    }
4628
4629    #[simd_test(enable = "sse2")]
4630    unsafe fn test_mm_cmpneq_pd() {
4631        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4632        let e = _mm_setr_epi64x(!0, !0);
4633        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
4634        assert_eq_m128i(r, e);
4635    }
4636
4637    #[simd_test(enable = "sse2")]
4638    unsafe fn test_mm_cmpnlt_pd() {
4639        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4640        let e = _mm_setr_epi64x(0, 0);
4641        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
4642        assert_eq_m128i(r, e);
4643    }
4644
4645    #[simd_test(enable = "sse2")]
4646    unsafe fn test_mm_cmpnle_pd() {
4647        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4648        let e = _mm_setr_epi64x(0, 0);
4649        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
4650        assert_eq_m128i(r, e);
4651    }
4652
4653    #[simd_test(enable = "sse2")]
4654    unsafe fn test_mm_cmpngt_pd() {
4655        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4656        let e = _mm_setr_epi64x(0, !0);
4657        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
4658        assert_eq_m128i(r, e);
4659    }
4660
4661    #[simd_test(enable = "sse2")]
4662    unsafe fn test_mm_cmpnge_pd() {
4663        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4664        let e = _mm_setr_epi64x(0, !0);
4665        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
4666        assert_eq_m128i(r, e);
4667    }
4668
4669    #[simd_test(enable = "sse2")]
4670    unsafe fn test_mm_comieq_sd() {
4671        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4672        assert!(_mm_comieq_sd(a, b) != 0);
4673
4674        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
4675        assert!(_mm_comieq_sd(a, b) == 0);
4676    }
4677
4678    #[simd_test(enable = "sse2")]
4679    unsafe fn test_mm_comilt_sd() {
4680        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4681        assert!(_mm_comilt_sd(a, b) == 0);
4682    }
4683
4684    #[simd_test(enable = "sse2")]
4685    unsafe fn test_mm_comile_sd() {
4686        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4687        assert!(_mm_comile_sd(a, b) != 0);
4688    }
4689
4690    #[simd_test(enable = "sse2")]
4691    unsafe fn test_mm_comigt_sd() {
4692        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4693        assert!(_mm_comigt_sd(a, b) == 0);
4694    }
4695
4696    #[simd_test(enable = "sse2")]
4697    unsafe fn test_mm_comige_sd() {
4698        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4699        assert!(_mm_comige_sd(a, b) != 0);
4700    }
4701
4702    #[simd_test(enable = "sse2")]
4703    unsafe fn test_mm_comineq_sd() {
4704        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4705        assert!(_mm_comineq_sd(a, b) == 0);
4706    }
4707
4708    #[simd_test(enable = "sse2")]
4709    unsafe fn test_mm_ucomieq_sd() {
4710        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4711        assert!(_mm_ucomieq_sd(a, b) != 0);
4712
4713        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
4714        assert!(_mm_ucomieq_sd(a, b) == 0);
4715    }
4716
4717    #[simd_test(enable = "sse2")]
4718    unsafe fn test_mm_ucomilt_sd() {
4719        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4720        assert!(_mm_ucomilt_sd(a, b) == 0);
4721    }
4722
4723    #[simd_test(enable = "sse2")]
4724    unsafe fn test_mm_ucomile_sd() {
4725        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4726        assert!(_mm_ucomile_sd(a, b) != 0);
4727    }
4728
4729    #[simd_test(enable = "sse2")]
4730    unsafe fn test_mm_ucomigt_sd() {
4731        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4732        assert!(_mm_ucomigt_sd(a, b) == 0);
4733    }
4734
4735    #[simd_test(enable = "sse2")]
4736    unsafe fn test_mm_ucomige_sd() {
4737        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4738        assert!(_mm_ucomige_sd(a, b) != 0);
4739    }
4740
4741    #[simd_test(enable = "sse2")]
4742    unsafe fn test_mm_ucomineq_sd() {
4743        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4744        assert!(_mm_ucomineq_sd(a, b) == 0);
4745    }
4746
4747    #[simd_test(enable = "sse2")]
4748    unsafe fn test_mm_movemask_pd() {
4749        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
4750        assert_eq!(r, 0b01);
4751
4752        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
4753        assert_eq!(r, 0b11);
4754    }
4755
4756    #[repr(align(16))]
4757    struct Memory {
4758        data: [f64; 4],
4759    }
4760
4761    #[simd_test(enable = "sse2")]
4762    unsafe fn test_mm_load_pd() {
4763        let mem = Memory {
4764            data: [1.0f64, 2.0, 3.0, 4.0],
4765        };
4766        let vals = &mem.data;
4767        let d = vals.as_ptr();
4768
4769        let r = _mm_load_pd(d);
4770        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4771    }
4772
4773    #[simd_test(enable = "sse2")]
4774    unsafe fn test_mm_load_sd() {
4775        let a = 1.;
4776        let expected = _mm_setr_pd(a, 0.);
4777        let r = _mm_load_sd(&a);
4778        assert_eq_m128d(r, expected);
4779    }
4780
4781    #[simd_test(enable = "sse2")]
4782    unsafe fn test_mm_loadh_pd() {
4783        let a = _mm_setr_pd(1., 2.);
4784        let b = 3.;
4785        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
4786        let r = _mm_loadh_pd(a, &b);
4787        assert_eq_m128d(r, expected);
4788    }
4789
4790    #[simd_test(enable = "sse2")]
4791    unsafe fn test_mm_loadl_pd() {
4792        let a = _mm_setr_pd(1., 2.);
4793        let b = 3.;
4794        let expected = _mm_setr_pd(3., get_m128d(a, 1));
4795        let r = _mm_loadl_pd(a, &b);
4796        assert_eq_m128d(r, expected);
4797    }
4798
4799    #[simd_test(enable = "sse2")]
4800    // Miri cannot support this until it is clear how it fits in the Rust memory model
4801    // (non-temporal store)
4802    #[cfg_attr(miri, ignore)]
4803    unsafe fn test_mm_stream_pd() {
4804        #[repr(align(128))]
4805        struct Memory {
4806            pub data: [f64; 2],
4807        }
4808        let a = _mm_set1_pd(7.0);
4809        let mut mem = Memory { data: [-1.0; 2] };
4810
4811        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4812        for i in 0..2 {
4813            assert_eq!(mem.data[i], get_m128d(a, i));
4814        }
4815    }
4816
4817    #[simd_test(enable = "sse2")]
4818    unsafe fn test_mm_store_sd() {
4819        let mut dest = 0.;
4820        let a = _mm_setr_pd(1., 2.);
4821        _mm_store_sd(&mut dest, a);
4822        assert_eq!(dest, _mm_cvtsd_f64(a));
4823    }
4824
4825    #[simd_test(enable = "sse2")]
4826    unsafe fn test_mm_store_pd() {
4827        let mut mem = Memory { data: [0.0f64; 4] };
4828        let vals = &mut mem.data;
4829        let a = _mm_setr_pd(1.0, 2.0);
4830        let d = vals.as_mut_ptr();
4831
4832        _mm_store_pd(d, *black_box(&a));
4833        assert_eq!(vals[0], 1.0);
4834        assert_eq!(vals[1], 2.0);
4835    }
4836
4837    #[simd_test(enable = "sse2")]
4838    unsafe fn test_mm_storeu_pd() {
4839        let mut mem = Memory { data: [0.0f64; 4] };
4840        let vals = &mut mem.data;
4841        let a = _mm_setr_pd(1.0, 2.0);
4842
4843        let mut ofs = 0;
4844        let mut p = vals.as_mut_ptr();
4845
4846        // Make sure p is **not** aligned to 16-byte boundary
4847        if (p as usize) & 0xf == 0 {
4848            ofs = 1;
4849            p = p.add(1);
4850        }
4851
4852        _mm_storeu_pd(p, *black_box(&a));
4853
4854        if ofs > 0 {
4855            assert_eq!(vals[ofs - 1], 0.0);
4856        }
4857        assert_eq!(vals[ofs + 0], 1.0);
4858        assert_eq!(vals[ofs + 1], 2.0);
4859    }
4860
4861    #[simd_test(enable = "sse2")]
4862    unsafe fn test_mm_storeu_si16() {
4863        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4864        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
4865        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
4866        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
4867        assert_eq_m128i(r, e);
4868    }
4869
4870    #[simd_test(enable = "sse2")]
4871    unsafe fn test_mm_storeu_si32() {
4872        let a = _mm_setr_epi32(1, 2, 3, 4);
4873        let mut r = _mm_setr_epi32(5, 6, 7, 8);
4874        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
4875        let e = _mm_setr_epi32(1, 6, 7, 8);
4876        assert_eq_m128i(r, e);
4877    }
4878
4879    #[simd_test(enable = "sse2")]
4880    unsafe fn test_mm_storeu_si64() {
4881        let a = _mm_setr_epi64x(1, 2);
4882        let mut r = _mm_setr_epi64x(3, 4);
4883        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
4884        let e = _mm_setr_epi64x(1, 4);
4885        assert_eq_m128i(r, e);
4886    }
4887
4888    #[simd_test(enable = "sse2")]
4889    unsafe fn test_mm_store1_pd() {
4890        let mut mem = Memory { data: [0.0f64; 4] };
4891        let vals = &mut mem.data;
4892        let a = _mm_setr_pd(1.0, 2.0);
4893        let d = vals.as_mut_ptr();
4894
4895        _mm_store1_pd(d, *black_box(&a));
4896        assert_eq!(vals[0], 1.0);
4897        assert_eq!(vals[1], 1.0);
4898    }
4899
4900    #[simd_test(enable = "sse2")]
4901    unsafe fn test_mm_store_pd1() {
4902        let mut mem = Memory { data: [0.0f64; 4] };
4903        let vals = &mut mem.data;
4904        let a = _mm_setr_pd(1.0, 2.0);
4905        let d = vals.as_mut_ptr();
4906
4907        _mm_store_pd1(d, *black_box(&a));
4908        assert_eq!(vals[0], 1.0);
4909        assert_eq!(vals[1], 1.0);
4910    }
4911
4912    #[simd_test(enable = "sse2")]
4913    unsafe fn test_mm_storer_pd() {
4914        let mut mem = Memory { data: [0.0f64; 4] };
4915        let vals = &mut mem.data;
4916        let a = _mm_setr_pd(1.0, 2.0);
4917        let d = vals.as_mut_ptr();
4918
4919        _mm_storer_pd(d, *black_box(&a));
4920        assert_eq!(vals[0], 2.0);
4921        assert_eq!(vals[1], 1.0);
4922    }
4923
4924    #[simd_test(enable = "sse2")]
4925    unsafe fn test_mm_storeh_pd() {
4926        let mut dest = 0.;
4927        let a = _mm_setr_pd(1., 2.);
4928        _mm_storeh_pd(&mut dest, a);
4929        assert_eq!(dest, get_m128d(a, 1));
4930    }
4931
4932    #[simd_test(enable = "sse2")]
4933    unsafe fn test_mm_storel_pd() {
4934        let mut dest = 0.;
4935        let a = _mm_setr_pd(1., 2.);
4936        _mm_storel_pd(&mut dest, a);
4937        assert_eq!(dest, _mm_cvtsd_f64(a));
4938    }
4939
4940    #[simd_test(enable = "sse2")]
4941    unsafe fn test_mm_loadr_pd() {
4942        let mut mem = Memory {
4943            data: [1.0f64, 2.0, 3.0, 4.0],
4944        };
4945        let vals = &mut mem.data;
4946        let d = vals.as_ptr();
4947
4948        let r = _mm_loadr_pd(d);
4949        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
4950    }
4951
4952    #[simd_test(enable = "sse2")]
4953    unsafe fn test_mm_loadu_pd() {
4954        let mut mem = Memory {
4955            data: [1.0f64, 2.0, 3.0, 4.0],
4956        };
4957        let vals = &mut mem.data;
4958        let mut d = vals.as_ptr();
4959
4960        // make sure d is not aligned to 16-byte boundary
4961        let mut offset = 0;
4962        if (d as usize) & 0xf == 0 {
4963            offset = 1;
4964            d = d.add(offset);
4965        }
4966
4967        let r = _mm_loadu_pd(d);
4968        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
4969        assert_eq_m128d(r, e);
4970    }
4971
4972    #[simd_test(enable = "sse2")]
4973    unsafe fn test_mm_loadu_si16() {
4974        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4975        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
4976        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
4977    }
4978
4979    #[simd_test(enable = "sse2")]
4980    unsafe fn test_mm_loadu_si32() {
4981        let a = _mm_setr_epi32(1, 2, 3, 4);
4982        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
4983        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
4984    }
4985
4986    #[simd_test(enable = "sse2")]
4987    unsafe fn test_mm_loadu_si64() {
4988        let a = _mm_setr_epi64x(5, 6);
4989        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
4990        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4991    }
4992
4993    #[simd_test(enable = "sse2")]
4994    unsafe fn test_mm_cvtpd_ps() {
4995        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
4996        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
4997
4998        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
4999        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
5000
5001        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
5002        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
5003
5004        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
5005        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
5006    }
5007
5008    #[simd_test(enable = "sse2")]
5009    unsafe fn test_mm_cvtps_pd() {
5010        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
5011        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
5012
5013        let r = _mm_cvtps_pd(_mm_setr_ps(
5014            f32::MAX,
5015            f32::INFINITY,
5016            f32::NEG_INFINITY,
5017            f32::MIN,
5018        ));
5019        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
5020    }
5021
5022    #[simd_test(enable = "sse2")]
5023    unsafe fn test_mm_cvtpd_epi32() {
5024        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
5025        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
5026
5027        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
5028        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
5029
5030        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
5031        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5032
5033        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
5034        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5035
5036        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
5037        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5038    }
5039
5040    #[simd_test(enable = "sse2")]
5041    unsafe fn test_mm_cvtsd_si32() {
5042        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
5043        assert_eq!(r, -2);
5044
5045        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
5046        assert_eq!(r, i32::MIN);
5047
5048        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
5049        assert_eq!(r, i32::MIN);
5050    }
5051
5052    #[simd_test(enable = "sse2")]
5053    unsafe fn test_mm_cvtsd_ss() {
5054        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
5055        let b = _mm_setr_pd(2.0, -5.0);
5056
5057        let r = _mm_cvtsd_ss(a, b);
5058
5059        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
5060
5061        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
5062        let b = _mm_setr_pd(f64::INFINITY, -5.0);
5063
5064        let r = _mm_cvtsd_ss(a, b);
5065
5066        assert_eq_m128(
5067            r,
5068            _mm_setr_ps(
5069                f32::INFINITY,
5070                f32::NEG_INFINITY,
5071                f32::MAX,
5072                f32::NEG_INFINITY,
5073            ),
5074        );
5075    }
5076
5077    #[simd_test(enable = "sse2")]
5078    unsafe fn test_mm_cvtsd_f64() {
5079        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
5080        assert_eq!(r, -1.1);
5081    }
5082
5083    #[simd_test(enable = "sse2")]
5084    unsafe fn test_mm_cvtss_sd() {
5085        let a = _mm_setr_pd(-1.1, 2.2);
5086        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
5087
5088        let r = _mm_cvtss_sd(a, b);
5089        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
5090
5091        let a = _mm_setr_pd(-1.1, f64::INFINITY);
5092        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
5093
5094        let r = _mm_cvtss_sd(a, b);
5095        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
5096    }
5097
5098    #[simd_test(enable = "sse2")]
5099    unsafe fn test_mm_cvttpd_epi32() {
5100        let a = _mm_setr_pd(-1.1, 2.2);
5101        let r = _mm_cvttpd_epi32(a);
5102        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
5103
5104        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5105        let r = _mm_cvttpd_epi32(a);
5106        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5107    }
5108
5109    #[simd_test(enable = "sse2")]
5110    unsafe fn test_mm_cvttsd_si32() {
5111        let a = _mm_setr_pd(-1.1, 2.2);
5112        let r = _mm_cvttsd_si32(a);
5113        assert_eq!(r, -1);
5114
5115        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5116        let r = _mm_cvttsd_si32(a);
5117        assert_eq!(r, i32::MIN);
5118    }
5119
5120    #[simd_test(enable = "sse2")]
5121    unsafe fn test_mm_cvttps_epi32() {
5122        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
5123        let r = _mm_cvttps_epi32(a);
5124        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
5125
5126        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
5127        let r = _mm_cvttps_epi32(a);
5128        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
5129    }
5130
5131    #[simd_test(enable = "sse2")]
5132    unsafe fn test_mm_set_sd() {
5133        let r = _mm_set_sd(-1.0_f64);
5134        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
5135    }
5136
5137    #[simd_test(enable = "sse2")]
5138    unsafe fn test_mm_set1_pd() {
5139        let r = _mm_set1_pd(-1.0_f64);
5140        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
5141    }
5142
5143    #[simd_test(enable = "sse2")]
5144    unsafe fn test_mm_set_pd1() {
5145        let r = _mm_set_pd1(-2.0_f64);
5146        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
5147    }
5148
5149    #[simd_test(enable = "sse2")]
5150    unsafe fn test_mm_set_pd() {
5151        let r = _mm_set_pd(1.0_f64, 5.0_f64);
5152        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
5153    }
5154
5155    #[simd_test(enable = "sse2")]
5156    unsafe fn test_mm_setr_pd() {
5157        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
5158        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
5159    }
5160
5161    #[simd_test(enable = "sse2")]
5162    unsafe fn test_mm_setzero_pd() {
5163        let r = _mm_setzero_pd();
5164        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
5165    }
5166
5167    #[simd_test(enable = "sse2")]
5168    unsafe fn test_mm_load1_pd() {
5169        let d = -5.0;
5170        let r = _mm_load1_pd(&d);
5171        assert_eq_m128d(r, _mm_setr_pd(d, d));
5172    }
5173
5174    #[simd_test(enable = "sse2")]
5175    unsafe fn test_mm_load_pd1() {
5176        let d = -5.0;
5177        let r = _mm_load_pd1(&d);
5178        assert_eq_m128d(r, _mm_setr_pd(d, d));
5179    }
5180
5181    #[simd_test(enable = "sse2")]
5182    unsafe fn test_mm_unpackhi_pd() {
5183        let a = _mm_setr_pd(1.0, 2.0);
5184        let b = _mm_setr_pd(3.0, 4.0);
5185        let r = _mm_unpackhi_pd(a, b);
5186        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
5187    }
5188
5189    #[simd_test(enable = "sse2")]
5190    unsafe fn test_mm_unpacklo_pd() {
5191        let a = _mm_setr_pd(1.0, 2.0);
5192        let b = _mm_setr_pd(3.0, 4.0);
5193        let r = _mm_unpacklo_pd(a, b);
5194        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
5195    }
5196
5197    #[simd_test(enable = "sse2")]
5198    unsafe fn test_mm_shuffle_pd() {
5199        let a = _mm_setr_pd(1., 2.);
5200        let b = _mm_setr_pd(3., 4.);
5201        let expected = _mm_setr_pd(1., 3.);
5202        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
5203        assert_eq_m128d(r, expected);
5204    }
5205
5206    #[simd_test(enable = "sse2")]
5207    unsafe fn test_mm_move_sd() {
5208        let a = _mm_setr_pd(1., 2.);
5209        let b = _mm_setr_pd(3., 4.);
5210        let expected = _mm_setr_pd(3., 2.);
5211        let r = _mm_move_sd(a, b);
5212        assert_eq_m128d(r, expected);
5213    }
5214
5215    #[simd_test(enable = "sse2")]
5216    unsafe fn test_mm_castpd_ps() {
5217        let a = _mm_set1_pd(0.);
5218        let expected = _mm_set1_ps(0.);
5219        let r = _mm_castpd_ps(a);
5220        assert_eq_m128(r, expected);
5221    }
5222
5223    #[simd_test(enable = "sse2")]
5224    unsafe fn test_mm_castpd_si128() {
5225        let a = _mm_set1_pd(0.);
5226        let expected = _mm_set1_epi64x(0);
5227        let r = _mm_castpd_si128(a);
5228        assert_eq_m128i(r, expected);
5229    }
5230
5231    #[simd_test(enable = "sse2")]
5232    unsafe fn test_mm_castps_pd() {
5233        let a = _mm_set1_ps(0.);
5234        let expected = _mm_set1_pd(0.);
5235        let r = _mm_castps_pd(a);
5236        assert_eq_m128d(r, expected);
5237    }
5238
5239    #[simd_test(enable = "sse2")]
5240    unsafe fn test_mm_castps_si128() {
5241        let a = _mm_set1_ps(0.);
5242        let expected = _mm_set1_epi32(0);
5243        let r = _mm_castps_si128(a);
5244        assert_eq_m128i(r, expected);
5245    }
5246
5247    #[simd_test(enable = "sse2")]
5248    unsafe fn test_mm_castsi128_pd() {
5249        let a = _mm_set1_epi64x(0);
5250        let expected = _mm_set1_pd(0.);
5251        let r = _mm_castsi128_pd(a);
5252        assert_eq_m128d(r, expected);
5253    }
5254
5255    #[simd_test(enable = "sse2")]
5256    unsafe fn test_mm_castsi128_ps() {
5257        let a = _mm_set1_epi32(0);
5258        let expected = _mm_set1_ps(0.);
5259        let r = _mm_castsi128_ps(a);
5260        assert_eq_m128(r, expected);
5261    }
5262}