core/stdarch/crates/core_arch/src/x86/avxneconvert.rs

use crate::arch::asm;
use crate::core_arch::x86::*;

#[cfg(test)]
use stdarch_test::assert_instr;
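
// AVX-NE-CONVERT intrinsics. The extension provides VEX-encoded conversions
// between 16-bit floating-point formats and f32 without requiring AVX-512:
// scalar bf16/f16 loads that broadcast-convert to packed f32 (`vbcstne*`),
// packed loads that convert only the even- or odd-indexed 16-bit elements
// (`vcvtnee*` / `vcvtneo*`), and a packed f32 -> bf16 down-conversion
// (`vcvtneps2bf16`).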

/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnebf162ps))]
#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
    bcstnebf162ps_128(a)
}

/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnebf162ps))]
#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
    bcstnebf162ps_256(a)
}

/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
/// (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnesh2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 {
    bcstnesh2ps_128(a)
}

/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
/// (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnesh2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 {
    bcstnesh2ps_256(a)
}

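// The `cvtnee*` intrinsics below load a full 128/256-bit vector of 16-bit
// elements and convert only the even-indexed lanes (0, 2, 4, ...) to f32;
// the matching `cvtneo*` intrinsics convert the odd-indexed lanes
// (1, 3, 5, ...). For lanes [a0, a1, a2, a3, ...] the results are
// [f32(a0), f32(a2), ...] and [f32(a1), f32(a3), ...] respectively.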
/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneebf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
    transmute(cvtneebf162ps_128(a))
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneebf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
    transmute(cvtneebf162ps_256(a))
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneeph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 {
    transmute(cvtneeph2ps_128(a))
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneeph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 {
    transmute(cvtneeph2ps_256(a))
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneobf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
    transmute(cvtneobf162ps_128(a))
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneobf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
    transmute(cvtneobf162ps_256(a))
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneoph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 {
    transmute(cvtneoph2ps_128(a))
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneoph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 {
    transmute(cvtneoph2ps_256(a))
}

/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
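///
/// # Examples
///
/// A minimal usage sketch (illustrative only; marked `ignore` so it is not
/// run as a doctest, and it assumes runtime feature detection):
///
/// ```ignore
/// if is_x86_feature_detected!("avxneconvert") {
///     let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///     // SAFETY: the `avxneconvert` feature was detected above.
///     let packed_bf16: __m128bh = unsafe { _mm_cvtneps_avx_pbh(a) };
/// }
/// ```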
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneps2bf16))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
    unsafe {
        let mut dst: __m128bh;
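        // The `{vex}` pseudo-prefix asks the assembler for the VEX-encoded
        // form of `vcvtneps2bf16` (the AVX-NE-CONVERT encoding) rather than
        // the EVEX-encoded AVX-512 BF16 form of the same mnemonic.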
        asm!(
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            dst = lateout(xmm_reg) dst,
            src = in(xmm_reg) a,
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}

/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneps2bf16))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
    unsafe {
        let mut dst: __m128bh;
        asm!(
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            dst = lateout(xmm_reg) dst,
            src = in(ymm_reg) a,
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}

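// Bindings to the LLVM builtins that back the load-and-convert intrinsics above.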
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.vbcstnebf162ps128"]
    fn bcstnebf162ps_128(a: *const bf16) -> __m128;
    #[link_name = "llvm.x86.vbcstnebf162ps256"]
    fn bcstnebf162ps_256(a: *const bf16) -> __m256;
    #[link_name = "llvm.x86.vbcstnesh2ps128"]
    fn bcstnesh2ps_128(a: *const f16) -> __m128;
    #[link_name = "llvm.x86.vbcstnesh2ps256"]
    fn bcstnesh2ps_256(a: *const f16) -> __m256;

    #[link_name = "llvm.x86.vcvtneebf162ps128"]
    fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneebf162ps256"]
    fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneeph2ps128"]
    fn cvtneeph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneeph2ps256"]
    fn cvtneeph2ps_256(a: *const __m256h) -> __m256;

    #[link_name = "llvm.x86.vcvtneobf162ps128"]
    fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneobf162ps256"]
    fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneoph2ps128"]
    fn cvtneoph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneoph2ps256"]
    fn cvtneoph2ps_256(a: *const __m256h) -> __m256;
}

#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

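    // The constants below are bf16 bit patterns laid out as
    // sign (1 bit) | exponent (8 bits) | mantissa (7 bits), i.e. the upper
    // 16 bits of the corresponding f32 encoding. For example, 3.0f32 is
    // 0x4040_0000, whose upper half is 0b0_10000000_1000000 (BF16_THREE).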
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm_bcstnebf16_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm256_bcstnebf16_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm_bcstnesh_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm256_bcstnesh_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneebf16_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneebf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneeph_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneeph_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneobf16_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneobf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneoph_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneoph_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
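        // The 128-bit result holds the four bf16 values in its low 64 bits,
        // so `transmute_copy` reads just those 8 bytes into a `u16x4`.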
        let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
        let e = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(r, e);
    }
}
371}