core/stdarch/crates/core_arch/src/nvptx/
packed.rs

Help
1//! NVPTX Packed data types (SIMD)
2//!
3//! Packed Data Types is what PTX calls SIMD types. See [PTX ISA (Packed Data Types)](https://docs.nvidia.com/cuda/parallel-thread-execution/#packed-data-types) for a full reference.
4
// Note: `#[assert_instr]` tests are not actually run on nvptx, because it is a `no_std` target that cannot run tests. Something like FileCheck would be appropriate for verifying that the correct instruction is used.
6
7use crate::intrinsics::simd::*;
8
9#[allow(improper_ctypes)]
10unsafe extern "C" {
11    #[link_name = "llvm.minimum.v2f16"]
12    fn llvm_f16x2_minimum(a: f16x2, b: f16x2) -> f16x2;
13    #[link_name = "llvm.maximum.v2f16"]
14    fn llvm_f16x2_maximum(a: f16x2, b: f16x2) -> f16x2;
15}
16
17types! {
18    #![unstable(feature = "stdarch_nvptx", issue = "111199")]
19
20    /// PTX-specific 32-bit wide floating point (f16 x 2) vector type
21    pub struct f16x2(2 x f16);
22
23}
24
25/// Add two values, round to nearest even
26///
27/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add>
28///
29/// Corresponds to the CUDA C intrinsics:
30///  - [`__hadd2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g921c795176eaa31265bd80ef4fe4b8e6)
31///  - [`__hadd2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g6cd8ddb2c3d670e1a10c3eb2e7644f82)
32#[inline]
33#[cfg_attr(test, assert_instr(add.rn.f16x22))]
34#[unstable(feature = "stdarch_nvptx", issue = "111199")]
35pub unsafe fn f16x2_add(a: f16x2, b: f16x2) -> f16x2 {
36    simd_add(a, b)
37}
38
39/// Subtract two values, round to nearest even
40///
41/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub>
42///
43/// Corresponds to the CUDA C intrinsics:
44///  - [`__hsub2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1ga5536c9c3d853d8c8b9de60e18b41e54)
45///  - [`__hsub2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g8adc164c68d553354f749f0f0645a874)
46#[inline]
47#[cfg_attr(test, assert_instr(sub.rn.f16x2))]
48#[unstable(feature = "stdarch_nvptx", issue = "111199")]
49pub unsafe fn f16x2_sub(a: f16x2, b: f16x2) -> f16x2 {
50    simd_sub(a, b)
51}
52
53/// Multiply two values, round to nearest even
54///
55/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-mul>
56///
57/// Corresponds to the CUDA C intrinsics:
58///  - [`__hmul2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g70de3f2ee48babe4e0969397ac17708e)
59///  - [`__hmul2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g99f8fe23a4b4c6898d6faf999afaa76e)
60#[inline]
61#[cfg_attr(test, assert_instr(mul.rn.f16x2))]
62#[unstable(feature = "stdarch_nvptx", issue = "111199")]
63pub unsafe fn f16x2_mul(a: f16x2, b: f16x2) -> f16x2 {
64    simd_mul(a, b)
65}
66
67/// Fused multiply-add, round to nearest even
68///
69/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-fma>
70///
71/// Corresponds to the CUDA C intrinsics:
72///  - [`__fma2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
73///  - [`__fma2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
74#[inline]
75#[cfg_attr(test, assert_instr(fma.rn.f16x2))]
76#[unstable(feature = "stdarch_nvptx", issue = "111199")]
77pub unsafe fn f16x2_fma(a: f16x2, b: f16x2, c: f16x2) -> f16x2 {
78    simd_fma(a, b, c)
79}
80
81/// Arithmetic negate
82///
83/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-neg>
84///
85/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
86#[inline]
87#[cfg_attr(test, assert_instr(neg.f16x2))]
88#[unstable(feature = "stdarch_nvptx", issue = "111199")]
89pub unsafe fn f16x2_neg(a: f16x2) -> f16x2 {
90    simd_neg(a)
91}
92
93/// Find the minimum of two values
94///
95/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
96///
97/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
98#[inline]
99#[cfg_attr(test, assert_instr(min.f16x2))]
100#[unstable(feature = "stdarch_nvptx", issue = "111199")]
101pub unsafe fn f16x2_min(a: f16x2, b: f16x2) -> f16x2 {
102    simd_fmin(a, b)
103}
104
105/// Find the minimum of two values, NaNs pass through.
106///
107/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
108///
109/// Corresponds to the CUDA C intrinsic [`__hmin2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g8bb8f58e9294cc261d2f42c4d5aecd6b)
110#[inline]
111#[cfg_attr(test, assert_instr(min.NaN.f16x2))]
112#[unstable(feature = "stdarch_nvptx", issue = "111199")]
113pub unsafe fn f16x2_min_nan(a: f16x2, b: f16x2) -> f16x2 {
114    llvm_f16x2_minimum(a, b)
115}
116
117/// Find the maximum of two values
118///
119/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
120///
121/// Corresponds to the CUDA C intrinsic [`__hmax2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g59fc7fc7975d8127b202444a05e57e3d)
122#[inline]
123#[cfg_attr(test, assert_instr(max.f16x2))]
124#[unstable(feature = "stdarch_nvptx", issue = "111199")]
125pub unsafe fn f16x2_max(a: f16x2, b: f16x2) -> f16x2 {
126    simd_fmax(a, b)
127}
128
129/// Find the maximum of two values, NaNs pass through.
130///
131/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
132///
133/// Corresponds to the CUDA C intrinsic [`__hmax2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g41623db7850e3074fd9daa80a14c3897)
134#[inline]
135#[cfg_attr(test, assert_instr(max.NaN.f16x2))]
136#[unstable(feature = "stdarch_nvptx", issue = "111199")]
137pub unsafe fn f16x2_max_nan(a: f16x2, b: f16x2) -> f16x2 {
138    llvm_f16x2_maximum(a, b)
139}
core/stdarch/crates/core_arch/src/nvptx/packed.rs

core/stdarch/crates/core_arch/src/nvptx/
packed.rs