core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
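///
/// A minimal usage sketch (not a doctest; assumes the `avx512fp16` target feature and unstable
/// `f16` support are available):
///
/// ```ignore
/// // Arguments are listed from the highest lane (e7) down to the lowest (e0),
/// // so lane 0 of the result holds 0.0 and lane 7 holds 7.0.
/// let v = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// ```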
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy the half-precision (16-bit) floating-point value a to the lower element of dst and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
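///
/// A minimal usage sketch (not a doctest; assumes `avx512fp16` is available):
///
/// ```ignore
/// let v = _mm_set1_ph(3.5); // all 8 lanes contain 3.5
/// ```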
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set1_ph(a: f16) -> __m128h {
    unsafe { transmute(f16x8::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set1_ph(a: f16) -> __m256h {
    unsafe { transmute(f16x16::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set1_ph(a: f16) -> __m512h {
    unsafe { transmute(f16x32::splat(a)) }
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
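///
/// A minimal usage sketch (not a doctest): `_mm_setr_ph` lists elements from the lowest lane
/// (e0) to the highest (e7), so this produces the same vector as
/// `_mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)`:
///
/// ```ignore
/// let v = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
/// ```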
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setzero_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setzero_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setzero_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Return vector of type `__m128h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_undefined_ph() -> __m128h {
    f16x8::ZERO.as_m128h()
}

/// Return vector of type `__m256h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_undefined_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type `__m512h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_undefined_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
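///
/// A minimal usage sketch (not a doctest): casts reinterpret the 128 bits without conversion,
/// so a round trip through `_mm_castph_pd` recovers the original bits:
///
/// ```ignore
/// let d = _mm_set1_pd(1.0);
/// let h = _mm_castpd_ph(d);    // reinterpret as 8 x f16
/// let back = _mm_castph_pd(h); // bit-identical to `d`
/// ```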
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castpd_ph(a: __m128d) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_pd(a: __m128h) -> __m128d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_pd(a: __m256h) -> __m256d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_pd(a: __m512h) -> __m512d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castps_ph(a: __m128) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castps_ph(a: __m256) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castps_ph(a: __m512) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_ps(a: __m128h) -> __m128 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_ps(a: __m256h) -> __m256 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_ps(a: __m512h) -> __m512 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_si128(a: __m128h) -> __m128i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_si256(a: __m256h) -> __m256i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_si512(a: __m512h) -> __m512i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
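///
/// A minimal usage sketch (not a doctest): the upper lanes are not guaranteed, so prefer
/// `_mm256_zextph128_ph256` when they must be zero:
///
/// ```ignore
/// let lo = _mm_set1_ph(1.0);
/// let wide = _mm256_castph128_ph256(lo); // lanes 0..8 hold 1.0, lanes 8..16 are unspecified
/// ```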
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
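///
/// A minimal usage sketch (not a doctest):
///
/// ```ignore
/// let lo = _mm_set1_ph(1.0);
/// let wide = _mm256_zextph128_ph256(lo); // lanes 0..8 hold 1.0, lanes 8..16 are zero
/// ```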
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
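///
/// A minimal usage sketch (not a doctest): bit i of the returned mask is set when the
/// comparison holds for lane i:
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// let k = _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b); // all lanes satisfy a < b, so k == 0xff
/// ```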
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
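///
/// A minimal usage sketch (not a doctest):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(1.0);
/// // compare for equality with floating-point exceptions suppressed
/// let k = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b); // 0xffffffff
/// ```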
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcomish(a, b, IMM5, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
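///
/// A minimal usage sketch (not a doctest):
///
/// ```ignore
/// let a = _mm_set_sh(1.0);
/// let b = _mm_set_sh(1.0);
/// assert_eq!(_mm_comieq_sh(a, b), 1); // lower elements compare equal
/// ```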
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
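///
/// A minimal usage sketch (not a doctest; `Aligned` is a hypothetical wrapper used only to
/// guarantee 16-byte alignment):
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f16; 8]);
///
/// let data = Aligned([1.0; 8]);
/// let v = unsafe { _mm_load_ph(data.0.as_ptr()) };
/// ```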
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
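///
/// A minimal usage sketch (not a doctest): unaligned load/store round trip through a plain
/// array:
///
/// ```ignore
/// let src: [f16; 8] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
/// let v = unsafe { _mm_loadu_ph(src.as_ptr()) };
/// let mut out: [f16; 8] = [0.0; 8];
/// unsafe { _mm_storeu_ph(out.as_mut_ptr(), v) };
/// assert_eq!(src, out);
/// ```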
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
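///
/// A minimal usage sketch (not a doctest):
///
/// ```ignore
/// let src = _mm_set_sh(9.0);
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set_sh(5.0);
/// // mask bit 0 is set, so the lower lane comes from b; the upper 7 lanes come from a
/// let r = _mm_mask_move_sh(src, 0b1, a, b);
/// ```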
1199#[inline]
1200#[target_feature(enable = "avx512fp16")]
1201#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1202pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1203    unsafe {
1204        let mut mov: f16 = simd_extract!(src, 0);
1205        if (k & 1) != 0 {
1206            mov = simd_extract!(b, 0);
1207        }
1208        simd_insert!(a, 0, mov)
1209    }
1210}
1211
1212/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1213/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
1214/// elements from a to the upper elements of dst.
1215///
1216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
1217#[inline]
1218#[target_feature(enable = "avx512fp16")]
1219#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1220pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1221    unsafe {
1222        let mut mov: f16 = 0.;
1223        if (k & 1) != 0 {
1224            mov = simd_extract!(b, 0);
1225        }
1226        simd_insert!(a, 0, mov)
1227    }
1228}
1229
1230/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
1231/// and copy the upper 7 packed elements from a to the upper elements of dst.
1232///
1233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
1234#[inline]
1235#[target_feature(enable = "avx512fp16")]
1236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1237pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
1238    unsafe {
1239        let mov: f16 = simd_extract!(b, 0);
1240        simd_insert!(a, 0, mov)
1241    }
1242}
1243
1244/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1245/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1246///
1247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1248#[inline]
1249#[target_feature(enable = "avx512fp16,avx512vl")]
1250#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1251pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
1252    *mem_addr.cast() = a;
1253}
1254
1255/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1256/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1257///
1258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1259#[inline]
1260#[target_feature(enable = "avx512fp16,avx512vl")]
1261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1262pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1263    *mem_addr.cast() = a;
1264}
1265
1266/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1267/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1268///
1269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1270#[inline]
1271#[target_feature(enable = "avx512fp16")]
1272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1273pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1274    *mem_addr.cast() = a;
1275}
1276
1277/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1278///
1279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1280#[inline]
1281#[target_feature(enable = "avx512fp16")]
1282#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1283pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
1284    *mem_addr = simd_extract!(a, 0);
1285}
1286
1287/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1288///
1289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1290#[inline]
1291#[target_feature(enable = "avx512fp16")]
1292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1293pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1294    asm!(
1295        vps!("vmovdqu16", "{{{k}}}, {src}"),
1296        p = in(reg) mem_addr,
1297        k = in(kreg) k,
1298        src = in(xmm_reg) a,
1299        options(nostack, preserves_flags)
1300    );
1301}
1302
1303/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1304/// The address does not need to be aligned to any particular boundary.
1305///
1306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1307#[inline]
1308#[target_feature(enable = "avx512fp16,avx512vl")]
1309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1310pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1311    ptr::write_unaligned(mem_addr.cast(), a);
1312}
1313
1314/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1315/// The address does not need to be aligned to any particular boundary.
1316///
1317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1318#[inline]
1319#[target_feature(enable = "avx512fp16,avx512vl")]
1320#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1321pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1322    ptr::write_unaligned(mem_addr.cast(), a);
1323}
1324
1325/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1326/// The address does not need to be aligned to any particular boundary.
1327///
1328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1329#[inline]
1330#[target_feature(enable = "avx512fp16")]
1331#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1332pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1333    ptr::write_unaligned(mem_addr.cast(), a);
1334}
1335
1336/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1337///
1338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
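///
/// An illustrative sketch (not part of the original documentation), assuming a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting AVX512-FP16 and
/// AVX512-VL; `demo` is a hypothetical name.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(1.5);
///     let b = _mm_set1_ph(2.0);
///     let sum = _mm_add_ph(a, b);
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), sum) };
///     assert!(out.iter().all(|&x| x == 3.5)); // 1.5 + 2.0 in every lane
/// }
/// ```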
1339#[inline]
1340#[target_feature(enable = "avx512fp16,avx512vl")]
1341#[cfg_attr(test, assert_instr(vaddph))]
1342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1343pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
1344    unsafe { simd_add(a, b) }
1345}
1346
1347/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1348/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1349///
1350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
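///
/// An illustrative sketch (not part of the original documentation), under the same assumptions
/// as the other sketches in this section (nightly toolchain with `stdarch_x86_avx512_f16` and
/// `f16`, CPU with AVX512-FP16 and AVX512-VL; `demo` is a hypothetical name):
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let src = _mm_set1_ph(9.0);
///     let a = _mm_set1_ph(1.0);
///     let b = _mm_set1_ph(2.0);
///     // Only lanes 0 and 1 have their mask bit set and receive the sum;
///     // the remaining lanes are copied from `src`.
///     let r = _mm_mask_add_ph(src, 0b0000_0011, a, b);
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     assert_eq!(out, [3.0, 3.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]);
/// }
/// ```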
1351#[inline]
1352#[target_feature(enable = "avx512fp16,avx512vl")]
1353#[cfg_attr(test, assert_instr(vaddph))]
1354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1355pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1356    unsafe {
1357        let r = _mm_add_ph(a, b);
1358        simd_select_bitmask(k, r, src)
1359    }
1360}
1361
1362/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1363/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1364///
1365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
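///
/// An illustrative sketch (not part of the original documentation), under the same assumptions
/// as the other sketches in this section (nightly toolchain, `stdarch_x86_avx512_f16` and `f16`
/// features, CPU with AVX512-FP16 and AVX512-VL; `demo` is a hypothetical name):
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     // Lanes whose mask bit is clear are zeroed instead of being copied from a source vector.
///     let r = _mm_maskz_add_ph(0b0000_0011, _mm_set1_ph(1.0), _mm_set1_ph(2.0));
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     assert_eq!(out, [3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
/// }
/// ```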
1366#[inline]
1367#[target_feature(enable = "avx512fp16,avx512vl")]
1368#[cfg_attr(test, assert_instr(vaddph))]
1369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1370pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1371    unsafe {
1372        let r = _mm_add_ph(a, b);
1373        simd_select_bitmask(k, r, _mm_setzero_ph())
1374    }
1375}
1376
1377/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1378///
1379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
1380#[inline]
1381#[target_feature(enable = "avx512fp16,avx512vl")]
1382#[cfg_attr(test, assert_instr(vaddph))]
1383#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1384pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
1385    unsafe { simd_add(a, b) }
1386}
1387
1388/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1389/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1390///
1391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1392#[inline]
1393#[target_feature(enable = "avx512fp16,avx512vl")]
1394#[cfg_attr(test, assert_instr(vaddph))]
1395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1396pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1397    unsafe {
1398        let r = _mm256_add_ph(a, b);
1399        simd_select_bitmask(k, r, src)
1400    }
1401}
1402
1403/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1404/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1405///
1406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1407#[inline]
1408#[target_feature(enable = "avx512fp16,avx512vl")]
1409#[cfg_attr(test, assert_instr(vaddph))]
1410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1411pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1412    unsafe {
1413        let r = _mm256_add_ph(a, b);
1414        simd_select_bitmask(k, r, _mm256_setzero_ph())
1415    }
1416}
1417
1418/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1419///
1420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
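///
/// An illustrative sketch (not part of the original documentation): it assumes a nightly
/// toolchain with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting
/// AVX512-FP16; `demo` is a hypothetical name.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm512_set1_ph(1.25);
///     let b = _mm512_set1_ph(0.5);
///     let sum = _mm512_add_ph(a, b);
///     let mut out: [f16; 32] = [0.0; 32];
///     unsafe { _mm512_storeu_ph(out.as_mut_ptr(), sum) };
///     assert!(out.iter().all(|&x| x == 1.75)); // 1.25 + 0.5 in each of the 32 lanes
/// }
/// ```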
1421#[inline]
1422#[target_feature(enable = "avx512fp16")]
1423#[cfg_attr(test, assert_instr(vaddph))]
1424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1425pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
1426    unsafe { simd_add(a, b) }
1427}
1428
1429/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1430/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1431///
1432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1433#[inline]
1434#[target_feature(enable = "avx512fp16")]
1435#[cfg_attr(test, assert_instr(vaddph))]
1436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1437pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1438    unsafe {
1439        let r = _mm512_add_ph(a, b);
1440        simd_select_bitmask(k, r, src)
1441    }
1442}
1443
1444/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1445/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1446///
1447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1448#[inline]
1449#[target_feature(enable = "avx512fp16")]
1450#[cfg_attr(test, assert_instr(vaddph))]
1451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1452pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1453    unsafe {
1454        let r = _mm512_add_ph(a, b);
1455        simd_select_bitmask(k, r, _mm512_setzero_ph())
1456    }
1457}
1458
1459/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1460/// Rounding is done according to the rounding parameter, which can be one of:
1461///
1462/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1463/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1464/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1465/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1466/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1467///
1468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
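///
/// An illustrative sketch (not part of the original documentation) showing how the rounding
/// constant is passed as a const generic; it assumes a nightly toolchain with the
/// `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting AVX512-FP16, and `demo` is
/// a hypothetical name.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm512_set1_ph(1.0);
///     let b = _mm512_set1_ph(2.0);
///     // Round to nearest and suppress exceptions; `_MM_FROUND_CUR_DIRECTION` would use `MXCSR.RC` instead.
///     let sum = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
///     let mut out: [f16; 32] = [0.0; 32];
///     unsafe { _mm512_storeu_ph(out.as_mut_ptr(), sum) };
///     assert!(out.iter().all(|&x| x == 3.0));
/// }
/// ```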
1469#[inline]
1470#[target_feature(enable = "avx512fp16")]
1471#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1472#[rustc_legacy_const_generics(2)]
1473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1474pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1475    unsafe {
1476        static_assert_rounding!(ROUNDING);
1477        vaddph(a, b, ROUNDING)
1478    }
1479}
1480
1481/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1482/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1483/// Rounding is done according to the rounding parameter, which can be one of:
1484///
1485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1490///
1491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1492#[inline]
1493#[target_feature(enable = "avx512fp16")]
1494#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1495#[rustc_legacy_const_generics(4)]
1496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1497pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1498    src: __m512h,
1499    k: __mmask32,
1500    a: __m512h,
1501    b: __m512h,
1502) -> __m512h {
1503    unsafe {
1504        static_assert_rounding!(ROUNDING);
1505        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1506        simd_select_bitmask(k, r, src)
1507    }
1508}
1509
1510/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1511/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1512/// Rounding is done according to the rounding parameter, which can be one of:
1513///
1514/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1515/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1516/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1517/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1518///
1519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1520#[inline]
1521#[target_feature(enable = "avx512fp16")]
1522#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1523#[rustc_legacy_const_generics(3)]
1524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1525pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1526    k: __mmask32,
1527    a: __m512h,
1528    b: __m512h,
1529) -> __m512h {
1530    unsafe {
1531        static_assert_rounding!(ROUNDING);
1532        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1533        simd_select_bitmask(k, r, _mm512_setzero_ph())
1534    }
1535}
1536
1537/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1538/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1539/// Rounding is done according to the rounding parameter, which can be one of:
1540///
1541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1546///
1547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
1548#[inline]
1549#[target_feature(enable = "avx512fp16")]
1550#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1551#[rustc_legacy_const_generics(2)]
1552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1553pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1554    static_assert_rounding!(ROUNDING);
1555    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1556}
1557
1558/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1559/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1560/// writemask k (the element is copied from src when mask bit 0 is not set).
1561/// Rounding is done according to the rounding parameter, which can be one of:
1562///
1563/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1564/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1565/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1566/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1567/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1568///
1569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1570#[inline]
1571#[target_feature(enable = "avx512fp16")]
1572#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1573#[rustc_legacy_const_generics(4)]
1574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1575pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1576    src: __m128h,
1577    k: __mmask8,
1578    a: __m128h,
1579    b: __m128h,
1580) -> __m128h {
1581    unsafe {
1582        static_assert_rounding!(ROUNDING);
1583        vaddsh(a, b, src, k, ROUNDING)
1584    }
1585}
1586
1587/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1588/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1589/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1590/// Rounding is done according to the rounding parameter, which can be one of:
1591///
1592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1597///
1598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
1599#[inline]
1600#[target_feature(enable = "avx512fp16")]
1601#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1602#[rustc_legacy_const_generics(3)]
1603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1604pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1605    static_assert_rounding!(ROUNDING);
1606    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1607}
1608
1609/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1610/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1611///
1612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
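///
/// An illustrative sketch (not part of the original documentation) showing the scalar merge
/// behaviour; it assumes a nightly toolchain with the `stdarch_x86_avx512_f16` and `f16`
/// features and a CPU supporting AVX512-FP16 and AVX512-VL, and `demo` is a hypothetical name.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); // lane 0 holds 1.0
///     let b = _mm_set1_ph(10.0);
///     let r = _mm_add_sh(a, b);
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     // Lane 0 is 1.0 + 10.0; the upper seven lanes are copied from `a`.
///     assert_eq!(out, [11.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
/// }
/// ```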
1613#[inline]
1614#[target_feature(enable = "avx512fp16")]
1615#[cfg_attr(test, assert_instr(vaddsh))]
1616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1617pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1618    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1619}
1620
1621/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1622/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1623/// writemask k (the element is copied from src when mask bit 0 is not set).
1624///
1625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1626#[inline]
1627#[target_feature(enable = "avx512fp16")]
1628#[cfg_attr(test, assert_instr(vaddsh))]
1629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1630pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1632}
1633
1634/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1635/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1636/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1637///
1638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1639#[inline]
1640#[target_feature(enable = "avx512fp16")]
1641#[cfg_attr(test, assert_instr(vaddsh))]
1642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1643pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1644    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1645}
1646
1647/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1648///
1649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
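///
/// An illustrative sketch (not part of the original documentation), under the same assumptions
/// as the other sketches in this section (nightly toolchain, `stdarch_x86_avx512_f16` and `f16`
/// features, CPU with AVX512-FP16 and AVX512-VL; `demo` is a hypothetical name):
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(5.0);
///     let b = _mm_set1_ph(1.5);
///     let r = _mm_sub_ph(a, b); // b is subtracted from a, so every lane is 3.5
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     assert!(out.iter().all(|&x| x == 3.5));
/// }
/// ```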
1650#[inline]
1651#[target_feature(enable = "avx512fp16,avx512vl")]
1652#[cfg_attr(test, assert_instr(vsubph))]
1653#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1654pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
1655    unsafe { simd_sub(a, b) }
1656}
1657
1658/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1659/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1660///
1661/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1662#[inline]
1663#[target_feature(enable = "avx512fp16,avx512vl")]
1664#[cfg_attr(test, assert_instr(vsubph))]
1665#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1666pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1667    unsafe {
1668        let r = _mm_sub_ph(a, b);
1669        simd_select_bitmask(k, r, src)
1670    }
1671}
1672
1673/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1674/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1675///
1676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1677#[inline]
1678#[target_feature(enable = "avx512fp16,avx512vl")]
1679#[cfg_attr(test, assert_instr(vsubph))]
1680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1681pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1682    unsafe {
1683        let r = _mm_sub_ph(a, b);
1684        simd_select_bitmask(k, r, _mm_setzero_ph())
1685    }
1686}
1687
1688/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1689///
1690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
1691#[inline]
1692#[target_feature(enable = "avx512fp16,avx512vl")]
1693#[cfg_attr(test, assert_instr(vsubph))]
1694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1695pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
1696    unsafe { simd_sub(a, b) }
1697}
1698
1699/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1700/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1701///
1702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1703#[inline]
1704#[target_feature(enable = "avx512fp16,avx512vl")]
1705#[cfg_attr(test, assert_instr(vsubph))]
1706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1707pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1708    unsafe {
1709        let r = _mm256_sub_ph(a, b);
1710        simd_select_bitmask(k, r, src)
1711    }
1712}
1713
1714/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1715/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1716///
1717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1718#[inline]
1719#[target_feature(enable = "avx512fp16,avx512vl")]
1720#[cfg_attr(test, assert_instr(vsubph))]
1721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1722pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1723    unsafe {
1724        let r = _mm256_sub_ph(a, b);
1725        simd_select_bitmask(k, r, _mm256_setzero_ph())
1726    }
1727}
1728
1729/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1730///
1731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
1732#[inline]
1733#[target_feature(enable = "avx512fp16")]
1734#[cfg_attr(test, assert_instr(vsubph))]
1735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1736pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
1737    unsafe { simd_sub(a, b) }
1738}
1739
1740/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1741/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1742///
1743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1744#[inline]
1745#[target_feature(enable = "avx512fp16")]
1746#[cfg_attr(test, assert_instr(vsubph))]
1747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1748pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1749    unsafe {
1750        let r = _mm512_sub_ph(a, b);
1751        simd_select_bitmask(k, r, src)
1752    }
1753}
1754
1755/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1756/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1757///
1758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1759#[inline]
1760#[target_feature(enable = "avx512fp16")]
1761#[cfg_attr(test, assert_instr(vsubph))]
1762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1763pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1764    unsafe {
1765        let r = _mm512_sub_ph(a, b);
1766        simd_select_bitmask(k, r, _mm512_setzero_ph())
1767    }
1768}
1769
1770/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1771/// Rounding is done according to the rounding parameter, which can be one of:
1772///
1773/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1774/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1775/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1776/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1777/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1778///
1779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1780#[inline]
1781#[target_feature(enable = "avx512fp16")]
1782#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1783#[rustc_legacy_const_generics(2)]
1784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1785pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1786    unsafe {
1787        static_assert_rounding!(ROUNDING);
1788        vsubph(a, b, ROUNDING)
1789    }
1790}
1791
1792/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1793/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1794/// Rounding is done according to the rounding parameter, which can be one of:
1795///
1796/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1797/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1798/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1799/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1801///
1802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1803#[inline]
1804#[target_feature(enable = "avx512fp16")]
1805#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1806#[rustc_legacy_const_generics(4)]
1807#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1808pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1809    src: __m512h,
1810    k: __mmask32,
1811    a: __m512h,
1812    b: __m512h,
1813) -> __m512h {
1814    unsafe {
1815        static_assert_rounding!(ROUNDING);
1816        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1817        simd_select_bitmask(k, r, src)
1818    }
1819}
1820
1821/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1822/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1823/// Rounding is done according to the rounding parameter, which can be one of:
1824///
1825/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1826/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1827/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1828/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1830///
1831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1832#[inline]
1833#[target_feature(enable = "avx512fp16")]
1834#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1835#[rustc_legacy_const_generics(3)]
1836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1837pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1838    k: __mmask32,
1839    a: __m512h,
1840    b: __m512h,
1841) -> __m512h {
1842    unsafe {
1843        static_assert_rounding!(ROUNDING);
1844        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1845        simd_select_bitmask(k, r, _mm512_setzero_ph())
1846    }
1847}
1848
1849/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1850/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1851/// Rounding is done according to the rounding parameter, which can be one of:
1852///
1853/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1854/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1855/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1856/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1858///
1859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
1860#[inline]
1861#[target_feature(enable = "avx512fp16")]
1862#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1863#[rustc_legacy_const_generics(2)]
1864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1865pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1866    static_assert_rounding!(ROUNDING);
1867    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1868}
1869
1870/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1871/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1872/// writemask k (the element is copied from src when mask bit 0 is not set).
1873/// Rounding is done according to the rounding parameter, which can be one of:
1874///
1875/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1876/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1877/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1878/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1879/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1880///
1881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1882#[inline]
1883#[target_feature(enable = "avx512fp16")]
1884#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1885#[rustc_legacy_const_generics(4)]
1886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1887pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1888    src: __m128h,
1889    k: __mmask8,
1890    a: __m128h,
1891    b: __m128h,
1892) -> __m128h {
1893    unsafe {
1894        static_assert_rounding!(ROUNDING);
1895        vsubsh(a, b, src, k, ROUNDING)
1896    }
1897}
1898
1899/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1900/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1901/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1902/// Rounding is done according to the rounding parameter, which can be one of:
1903///
1904/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1905/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1906/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1907/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1908/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1909///
1910/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
1911#[inline]
1912#[target_feature(enable = "avx512fp16")]
1913#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1914#[rustc_legacy_const_generics(3)]
1915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1916pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1917    static_assert_rounding!(ROUNDING);
1918    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1919}
1920
1921/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1922/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1923///
1924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
1925#[inline]
1926#[target_feature(enable = "avx512fp16")]
1927#[cfg_attr(test, assert_instr(vsubsh))]
1928#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1929pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1930    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1931}
1932
1933/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1934/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1935/// writemask k (the element is copied from src when mask bit 0 is not set).
1936///
1937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1938#[inline]
1939#[target_feature(enable = "avx512fp16")]
1940#[cfg_attr(test, assert_instr(vsubsh))]
1941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1942pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1943    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1944}
1945
1946/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1947/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1948/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1949///
1950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1951#[inline]
1952#[target_feature(enable = "avx512fp16")]
1953#[cfg_attr(test, assert_instr(vsubsh))]
1954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1955pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1956    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1957}
1958
1959/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1960///
1961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
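///
/// An illustrative sketch (not part of the original documentation), under the same assumptions
/// as the other sketches in this section (nightly toolchain, `stdarch_x86_avx512_f16` and `f16`
/// features, CPU with AVX512-FP16 and AVX512-VL; `demo` is a hypothetical name):
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let r = _mm_mul_ph(_mm_set1_ph(1.5), _mm_set1_ph(4.0));
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     assert!(out.iter().all(|&x| x == 6.0)); // 1.5 * 4.0, exactly representable in f16
/// }
/// ```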
1962#[inline]
1963#[target_feature(enable = "avx512fp16,avx512vl")]
1964#[cfg_attr(test, assert_instr(vmulph))]
1965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1966pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
1967    unsafe { simd_mul(a, b) }
1968}
1969
1970/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1971/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1972///
1973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
1974#[inline]
1975#[target_feature(enable = "avx512fp16,avx512vl")]
1976#[cfg_attr(test, assert_instr(vmulph))]
1977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1978pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1979    unsafe {
1980        let r = _mm_mul_ph(a, b);
1981        simd_select_bitmask(k, r, src)
1982    }
1983}
1984
1985/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1986/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1987///
1988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
1989#[inline]
1990#[target_feature(enable = "avx512fp16,avx512vl")]
1991#[cfg_attr(test, assert_instr(vmulph))]
1992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1993pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1994    unsafe {
1995        let r = _mm_mul_ph(a, b);
1996        simd_select_bitmask(k, r, _mm_setzero_ph())
1997    }
1998}
1999
2000/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2001///
2002/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
2003#[inline]
2004#[target_feature(enable = "avx512fp16,avx512vl")]
2005#[cfg_attr(test, assert_instr(vmulph))]
2006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2007pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2008    unsafe { simd_mul(a, b) }
2009}
2010
2011/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2012/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2013///
2014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2015#[inline]
2016#[target_feature(enable = "avx512fp16,avx512vl")]
2017#[cfg_attr(test, assert_instr(vmulph))]
2018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2019pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2020    unsafe {
2021        let r = _mm256_mul_ph(a, b);
2022        simd_select_bitmask(k, r, src)
2023    }
2024}
2025
2026/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2027/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2028///
2029/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2030#[inline]
2031#[target_feature(enable = "avx512fp16,avx512vl")]
2032#[cfg_attr(test, assert_instr(vmulph))]
2033#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2034pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2035    unsafe {
2036        let r = _mm256_mul_ph(a, b);
2037        simd_select_bitmask(k, r, _mm256_setzero_ph())
2038    }
2039}
2040
2041/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2042///
2043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
2044#[inline]
2045#[target_feature(enable = "avx512fp16")]
2046#[cfg_attr(test, assert_instr(vmulph))]
2047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2048pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2049    unsafe { simd_mul(a, b) }
2050}
2051
2052/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2053/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2054///
2055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2056#[inline]
2057#[target_feature(enable = "avx512fp16")]
2058#[cfg_attr(test, assert_instr(vmulph))]
2059#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2060pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2061    unsafe {
2062        let r = _mm512_mul_ph(a, b);
2063        simd_select_bitmask(k, r, src)
2064    }
2065}
2066
2067/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2068/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2069///
2070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2071#[inline]
2072#[target_feature(enable = "avx512fp16")]
2073#[cfg_attr(test, assert_instr(vmulph))]
2074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2075pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2076    unsafe {
2077        let r = _mm512_mul_ph(a, b);
2078        simd_select_bitmask(k, r, _mm512_setzero_ph())
2079    }
2080}
2081
2082/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2083/// Rounding is done according to the rounding parameter, which can be one of:
2084///
2085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2090///
2091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2092#[inline]
2093#[target_feature(enable = "avx512fp16")]
2094#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2095#[rustc_legacy_const_generics(2)]
2096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2097pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2098    unsafe {
2099        static_assert_rounding!(ROUNDING);
2100        vmulph(a, b, ROUNDING)
2101    }
2102}
2103
2104/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2105/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2106/// Rounding is done according to the rounding parameter, which can be one of:
2107///
2108/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2109/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2110/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2111/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2112/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2113///
2114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2115#[inline]
2116#[target_feature(enable = "avx512fp16")]
2117#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2118#[rustc_legacy_const_generics(4)]
2119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2120pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2121    src: __m512h,
2122    k: __mmask32,
2123    a: __m512h,
2124    b: __m512h,
2125) -> __m512h {
2126    unsafe {
2127        static_assert_rounding!(ROUNDING);
2128        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2129        simd_select_bitmask(k, r, src)
2130    }
2131}
2132
2133/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2134/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2135/// Rounding is done according to the rounding parameter, which can be one of:
2136///
2137/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2138/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2139/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2140/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2141/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2142///
2143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2144#[inline]
2145#[target_feature(enable = "avx512fp16")]
2146#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2147#[rustc_legacy_const_generics(3)]
2148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2149pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2150    k: __mmask32,
2151    a: __m512h,
2152    b: __m512h,
2153) -> __m512h {
2154    unsafe {
2155        static_assert_rounding!(ROUNDING);
2156        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2157        simd_select_bitmask(k, r, _mm512_setzero_ph())
2158    }
2159}
2160
2161/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2162/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2163/// Rounding is done according to the rounding parameter, which can be one of:
2164///
2165/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2166/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2167/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2168/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2169/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2170///
2171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2172#[inline]
2173#[target_feature(enable = "avx512fp16")]
2174#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2175#[rustc_legacy_const_generics(2)]
2176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2177pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2178    static_assert_rounding!(ROUNDING);
2179    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2180}
2181
2182/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2183/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2184/// writemask k (the element is copied from src when mask bit 0 is not set).
2185/// Rounding is done according to the rounding parameter, which can be one of:
2186///
2187/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2188/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2189/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2190/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2191/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2192///
2193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2194#[inline]
2195#[target_feature(enable = "avx512fp16")]
2196#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2197#[rustc_legacy_const_generics(4)]
2198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2199pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2200    src: __m128h,
2201    k: __mmask8,
2202    a: __m128h,
2203    b: __m128h,
2204) -> __m128h {
2205    unsafe {
2206        static_assert_rounding!(ROUNDING);
2207        vmulsh(a, b, src, k, ROUNDING)
2208    }
2209}
2210
2211/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2212/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2213/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2214/// Rounding is done according to the rounding parameter, which can be one of:
2215///
2216/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2217/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2218/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2219/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2220/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2221///
2222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2223#[inline]
2224#[target_feature(enable = "avx512fp16")]
2225#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2226#[rustc_legacy_const_generics(3)]
2227#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2228pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2229    static_assert_rounding!(ROUNDING);
2230    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2231}
2232
2233/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2234/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2235///
2236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2237#[inline]
2238#[target_feature(enable = "avx512fp16")]
2239#[cfg_attr(test, assert_instr(vmulsh))]
2240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2241pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2242    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2243}
2244
2245/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2246/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2247/// writemask k (the element is copied from src when mask bit 0 is not set).
2248///
2249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2250#[inline]
2251#[target_feature(enable = "avx512fp16")]
2252#[cfg_attr(test, assert_instr(vmulsh))]
2253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2254pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2255    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2256}
2257
2258/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2259/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2260/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2261///
2262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2263#[inline]
2264#[target_feature(enable = "avx512fp16")]
2265#[cfg_attr(test, assert_instr(vmulsh))]
2266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2267pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2268    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2269}
2270
2271/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2272///
2273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
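///
/// An illustrative sketch (not part of the original documentation), under the same assumptions
/// as the other sketches in this section (nightly toolchain, `stdarch_x86_avx512_f16` and `f16`
/// features, CPU with AVX512-FP16 and AVX512-VL; `demo` is a hypothetical name):
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let r = _mm_div_ph(_mm_set1_ph(9.0), _mm_set1_ph(4.0));
///     let mut out: [f16; 8] = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     assert!(out.iter().all(|&x| x == 2.25)); // 9.0 / 4.0, exactly representable in f16
/// }
/// ```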
2274#[inline]
2275#[target_feature(enable = "avx512fp16,avx512vl")]
2276#[cfg_attr(test, assert_instr(vdivph))]
2277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2278pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2279    unsafe { simd_div(a, b) }
2280}
2281
2282/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2283/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2284///
2285/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2286#[inline]
2287#[target_feature(enable = "avx512fp16,avx512vl")]
2288#[cfg_attr(test, assert_instr(vdivph))]
2289#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2290pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2291    unsafe {
2292        let r = _mm_div_ph(a, b);
2293        simd_select_bitmask(k, r, src)
2294    }
2295}
2296
2297/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2298/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2299///
2300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2301#[inline]
2302#[target_feature(enable = "avx512fp16,avx512vl")]
2303#[cfg_attr(test, assert_instr(vdivph))]
2304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2305pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2306    unsafe {
2307        let r = _mm_div_ph(a, b);
2308        simd_select_bitmask(k, r, _mm_setzero_ph())
2309    }
2310}
2311
2312/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2313///
2314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2315#[inline]
2316#[target_feature(enable = "avx512fp16,avx512vl")]
2317#[cfg_attr(test, assert_instr(vdivph))]
2318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2319pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2320    unsafe { simd_div(a, b) }
2321}
2322
2323/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2324/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2325///
2326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2327#[inline]
2328#[target_feature(enable = "avx512fp16,avx512vl")]
2329#[cfg_attr(test, assert_instr(vdivph))]
2330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2331pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2332    unsafe {
2333        let r = _mm256_div_ph(a, b);
2334        simd_select_bitmask(k, r, src)
2335    }
2336}
2337
2338/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2339/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2340///
2341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2342#[inline]
2343#[target_feature(enable = "avx512fp16,avx512vl")]
2344#[cfg_attr(test, assert_instr(vdivph))]
2345#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2346pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2347    unsafe {
2348        let r = _mm256_div_ph(a, b);
2349        simd_select_bitmask(k, r, _mm256_setzero_ph())
2350    }
2351}
2352
2353/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2354///
2355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2356#[inline]
2357#[target_feature(enable = "avx512fp16")]
2358#[cfg_attr(test, assert_instr(vdivph))]
2359#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2360pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2361    unsafe { simd_div(a, b) }
2362}
2363
2364/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2365/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2366///
2367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2368#[inline]
2369#[target_feature(enable = "avx512fp16")]
2370#[cfg_attr(test, assert_instr(vdivph))]
2371#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2372pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2373    unsafe {
2374        let r = _mm512_div_ph(a, b);
2375        simd_select_bitmask(k, r, src)
2376    }
2377}
2378
2379/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2380/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2381///
2382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2383#[inline]
2384#[target_feature(enable = "avx512fp16")]
2385#[cfg_attr(test, assert_instr(vdivph))]
2386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2387pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2388    unsafe {
2389        let r = _mm512_div_ph(a, b);
2390        simd_select_bitmask(k, r, _mm512_setzero_ph())
2391    }
2392}
2393
2394/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2395/// Rounding is done according to the rounding parameter, which can be one of:
2396///
2397/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2398/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2399/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2400/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2401/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2402///
2403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
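///
/// A minimal sketch of passing the rounding mode as a const generic (illustrative only, not
/// compiled as a doctest; it assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and a CPU with AVX512-FP16 support):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn divide_to_nearest(a: __m512h, b: __m512h) -> __m512h {
///     // Round to nearest and suppress exceptions.
///     _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
/// }
/// ```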
2404#[inline]
2405#[target_feature(enable = "avx512fp16")]
2406#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2407#[rustc_legacy_const_generics(2)]
2408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2409pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2410    unsafe {
2411        static_assert_rounding!(ROUNDING);
2412        vdivph(a, b, ROUNDING)
2413    }
2414}
2415
2416/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2417/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2418/// Rounding is done according to the rounding parameter, which can be one of:
2419///
2420/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2421/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2422/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2423/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2424/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2425///
2426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2427#[inline]
2428#[target_feature(enable = "avx512fp16")]
2429#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2430#[rustc_legacy_const_generics(4)]
2431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2432pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2433    src: __m512h,
2434    k: __mmask32,
2435    a: __m512h,
2436    b: __m512h,
2437) -> __m512h {
2438    unsafe {
2439        static_assert_rounding!(ROUNDING);
2440        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2441        simd_select_bitmask(k, r, src)
2442    }
2443}
2444
2445/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2446/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2447/// Rounding is done according to the rounding parameter, which can be one of:
2448///
2449/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2450/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2451/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2452/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2453/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2454///
2455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2456#[inline]
2457#[target_feature(enable = "avx512fp16")]
2458#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2459#[rustc_legacy_const_generics(3)]
2460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2461pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2462    k: __mmask32,
2463    a: __m512h,
2464    b: __m512h,
2465) -> __m512h {
2466    unsafe {
2467        static_assert_rounding!(ROUNDING);
2468        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2469        simd_select_bitmask(k, r, _mm512_setzero_ph())
2470    }
2471}
2472
2473/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2474/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2475/// Rounding is done according to the rounding parameter, which can be one of:
2476///
2477/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2478/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2479/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2480/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2481/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2482///
2483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
2484#[inline]
2485#[target_feature(enable = "avx512fp16")]
2486#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2487#[rustc_legacy_const_generics(2)]
2488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2489pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2490    static_assert_rounding!(ROUNDING);
2491    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2492}
2493
2494/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2495/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2496/// writemask k (the element is copied from src when mask bit 0 is not set).
2497/// Rounding is done according to the rounding parameter, which can be one of:
2498///
2499/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2500/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2501/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2502/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2503/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2504///
2505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2506#[inline]
2507#[target_feature(enable = "avx512fp16")]
2508#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2509#[rustc_legacy_const_generics(4)]
2510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2511pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2512    src: __m128h,
2513    k: __mmask8,
2514    a: __m128h,
2515    b: __m128h,
2516) -> __m128h {
2517    unsafe {
2518        static_assert_rounding!(ROUNDING);
2519        vdivsh(a, b, src, k, ROUNDING)
2520    }
2521}
2522
2523/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2524/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2525/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2526/// Rounding is done according to the rounding parameter, which can be one of:
2527///
2528/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2529/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2530/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2531/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2532/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2533///
2534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2535#[inline]
2536#[target_feature(enable = "avx512fp16")]
2537#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2538#[rustc_legacy_const_generics(3)]
2539#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2540pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2541    static_assert_rounding!(ROUNDING);
2542    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2543}
2544
2545/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2546/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2547///
2548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
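///
/// A minimal sketch of the scalar (lower-element) behaviour (illustrative only, not compiled as a
/// doctest; it assumes a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features
/// and a CPU with AVX512-FP16 support; `demo` is a hypothetical caller):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 9.0);
///     let b = _mm_set1_ph(3.0);
///     let r = _mm_div_sh(a, b);
///     // Element 0 of r is 9.0 / 3.0 = 3.0; elements 1..=7 are copied from `a`.
/// }
/// ```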
2549#[inline]
2550#[target_feature(enable = "avx512fp16")]
2551#[cfg_attr(test, assert_instr(vdivsh))]
2552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2553pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2554    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2555}
2556
2557/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2558/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2559/// writemask k (the element is copied from src when mask bit 0 is not set).
2560///
2561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2562#[inline]
2563#[target_feature(enable = "avx512fp16")]
2564#[cfg_attr(test, assert_instr(vdivsh))]
2565#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2566pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2567    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2568}
2569
2570/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2571/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2572/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2573///
2574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2575#[inline]
2576#[target_feature(enable = "avx512fp16")]
2577#[cfg_attr(test, assert_instr(vdivsh))]
2578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2579pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2580    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2581}
2582
2583/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2584/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2585/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2586///
2587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
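///
/// A minimal sketch of the interleaved complex layout (illustrative only, not compiled as a
/// doctest; it assumes a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features
/// and a CPU with AVX512-FP16 and AVX512VL support; `demo` is a hypothetical caller):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     // A __m128h holds four complex numbers as [re0, im0, re1, im1, re2, im2, re3, im3].
///     // Every pair in `a` encodes 1 + 2i and every pair in `b` encodes 3 + 4i.
///     let a = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
///     let b = _mm_set_ph(4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0);
///     let r = _mm_mul_pch(a, b);
///     // (1 + 2i) * (3 + 4i) = -5 + 10i, so each pair of r is [-5.0, 10.0].
/// }
/// ```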
2588#[inline]
2589#[target_feature(enable = "avx512fp16,avx512vl")]
2590#[cfg_attr(test, assert_instr(vfmulcph))]
2591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2592pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2593    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2594}
2595
2596/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2597/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2598/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2599///
2600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
2601#[inline]
2602#[target_feature(enable = "avx512fp16,avx512vl")]
2603#[cfg_attr(test, assert_instr(vfmulcph))]
2604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2605pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2606    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2607}
2608
2609/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2610/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2611/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2612///
2613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2614#[inline]
2615#[target_feature(enable = "avx512fp16,avx512vl")]
2616#[cfg_attr(test, assert_instr(vfmulcph))]
2617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2618pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2619    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2620}
2621
2622/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2623/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2624/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2625///
2626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2627#[inline]
2628#[target_feature(enable = "avx512fp16,avx512vl")]
2629#[cfg_attr(test, assert_instr(vfmulcph))]
2630#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2631pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2632    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2633}
2634
2635/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2636/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2637/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2638///
2639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2640#[inline]
2641#[target_feature(enable = "avx512fp16,avx512vl")]
2642#[cfg_attr(test, assert_instr(vfmulcph))]
2643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2644pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2645    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2646}
2647
2648/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2649/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2650/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2651///
2652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2653#[inline]
2654#[target_feature(enable = "avx512fp16,avx512vl")]
2655#[cfg_attr(test, assert_instr(vfmulcph))]
2656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2657pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2658    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2659}
2660
2661/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2662/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2663/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2664///
2665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2666#[inline]
2667#[target_feature(enable = "avx512fp16")]
2668#[cfg_attr(test, assert_instr(vfmulcph))]
2669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2670pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2671    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2672}
2673
2674/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2675/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2676/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2677///
2678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2679#[inline]
2680#[target_feature(enable = "avx512fp16")]
2681#[cfg_attr(test, assert_instr(vfmulcph))]
2682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2683pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2684    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2685}
2686
2687/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2688/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2689/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2690///
2691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2692#[inline]
2693#[target_feature(enable = "avx512fp16")]
2694#[cfg_attr(test, assert_instr(vfmulcph))]
2695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2696pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2697    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2698}
2699
2700/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2701/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2702/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2703///
2704/// Rounding is done according to the rounding parameter, which can be one of:
2705///
2706/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2707/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2708/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2709/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2710/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2711///
2712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
2713#[inline]
2714#[target_feature(enable = "avx512fp16")]
2715#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2716#[rustc_legacy_const_generics(2)]
2717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2718pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2719    static_assert_rounding!(ROUNDING);
2720    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2721}
2722
2723/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2724/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2725/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2726///
2727/// Rounding is done according to the rounding parameter, which can be one of:
2728///
2729/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2730/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2731/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2732/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2733/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2734///
2735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2736#[inline]
2737#[target_feature(enable = "avx512fp16")]
2738#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2739#[rustc_legacy_const_generics(4)]
2740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2741pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2742    src: __m512h,
2743    k: __mmask16,
2744    a: __m512h,
2745    b: __m512h,
2746) -> __m512h {
2747    unsafe {
2748        static_assert_rounding!(ROUNDING);
2749        transmute(vfmulcph_512(
2750            transmute(a),
2751            transmute(b),
2752            transmute(src),
2753            k,
2754            ROUNDING,
2755        ))
2756    }
2757}
2758
2759/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2760/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2761/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2762///
2763/// Rounding is done according to the rounding parameter, which can be one of:
2764///
2765/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2766/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2767/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2768/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2769/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2770///
2771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2772#[inline]
2773#[target_feature(enable = "avx512fp16")]
2774#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2775#[rustc_legacy_const_generics(3)]
2776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2777pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2778    k: __mmask16,
2779    a: __m512h,
2780    b: __m512h,
2781) -> __m512h {
2782    static_assert_rounding!(ROUNDING);
2783    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2784}
2785
2786/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2787/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2788/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2789/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2790///
2791/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
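///
/// A minimal sketch of the lower-complex-number behaviour (illustrative only, not compiled as a
/// doctest; it assumes a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features
/// and a CPU with AVX512-FP16 support; `demo` is a hypothetical caller):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     // Elements 0..=1 of `a` hold the complex number 1 + 2i; the rest is arbitrary payload.
///     let a = _mm_set_ph(9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 2.0, 1.0);
///     let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
///     let r = _mm_mul_sch(a, b);
///     // Elements 0..=1 of r hold (1 + 2i) * (3 + 4i) = -5 + 10i;
///     // elements 2..=7 are copied from `a`.
/// }
/// ```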
2792#[inline]
2793#[target_feature(enable = "avx512fp16")]
2794#[cfg_attr(test, assert_instr(vfmulcsh))]
2795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2796pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2797    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
2798}
2799
2800/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2801/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2802/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2803/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2804///
2805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2806#[inline]
2807#[target_feature(enable = "avx512fp16")]
2808#[cfg_attr(test, assert_instr(vfmulcsh))]
2809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2810pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2811    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2812}
2813
2814/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2815/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2816/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2817/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2818///
2819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2820#[inline]
2821#[target_feature(enable = "avx512fp16")]
2822#[cfg_attr(test, assert_instr(vfmulcsh))]
2823#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2824pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2825    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
2826}
2827
2828/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2829/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2830/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2831/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2832///
2833/// Rounding is done according to the rounding parameter, which can be one of:
2834///
2835/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2836/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2837/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2838/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2839/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2840///
2841/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2842#[inline]
2843#[target_feature(enable = "avx512fp16")]
2844#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2845#[rustc_legacy_const_generics(2)]
2846#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2847pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2848    static_assert_rounding!(ROUNDING);
2849    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2850}
2851
2852/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2853/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2854/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2855/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2856///
2857/// Rounding is done according to the rounding parameter, which can be one of:
2858///
2859/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2860/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2861/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2862/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2863/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2864///
2865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2866#[inline]
2867#[target_feature(enable = "avx512fp16")]
2868#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2869#[rustc_legacy_const_generics(4)]
2870#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2871pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2872    src: __m128h,
2873    k: __mmask8,
2874    a: __m128h,
2875    b: __m128h,
2876) -> __m128h {
2877    unsafe {
2878        static_assert_rounding!(ROUNDING);
2879        transmute(vfmulcsh(
2880            transmute(a),
2881            transmute(b),
2882            transmute(src),
2883            k,
2884            ROUNDING,
2885        ))
2886    }
2887}
2888
2889/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2890/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2891/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2892/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2893///
2894/// Rounding is done according to the rounding parameter, which can be one of:
2895///
2896/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2897/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2898/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2899/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2900/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2901///
2902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
2903#[inline]
2904#[target_feature(enable = "avx512fp16")]
2905#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2906#[rustc_legacy_const_generics(3)]
2907#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2908pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
2909    k: __mmask8,
2910    a: __m128h,
2911    b: __m128h,
2912) -> __m128h {
2913    static_assert_rounding!(ROUNDING);
2914    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2915}
2916
2917/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2918/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2919/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2920///
2921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
2922#[inline]
2923#[target_feature(enable = "avx512fp16,avx512vl")]
2924#[cfg_attr(test, assert_instr(vfmulcph))]
2925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2926pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
2927    _mm_mul_pch(a, b)
2928}
2929
2930/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2931/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2932/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2933///
2934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
2935#[inline]
2936#[target_feature(enable = "avx512fp16,avx512vl")]
2937#[cfg_attr(test, assert_instr(vfmulcph))]
2938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2939pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2940    _mm_mask_mul_pch(src, k, a, b)
2941}
2942
2943/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2944/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2945/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2946///
2947/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
2948#[inline]
2949#[target_feature(enable = "avx512fp16,avx512vl")]
2950#[cfg_attr(test, assert_instr(vfmulcph))]
2951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2952pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2953    _mm_maskz_mul_pch(k, a, b)
2954}
2955
2956/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2957/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2958/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2959///
2960/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
2961#[inline]
2962#[target_feature(enable = "avx512fp16,avx512vl")]
2963#[cfg_attr(test, assert_instr(vfmulcph))]
2964#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2965pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
2966    _mm256_mul_pch(a, b)
2967}
2968
2969/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2970/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2971/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2972///
2973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
2974#[inline]
2975#[target_feature(enable = "avx512fp16,avx512vl")]
2976#[cfg_attr(test, assert_instr(vfmulcph))]
2977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2978pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2979    _mm256_mask_mul_pch(src, k, a, b)
2980}
2981
2982/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2983/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2984/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2985///
2986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
2987#[inline]
2988#[target_feature(enable = "avx512fp16,avx512vl")]
2989#[cfg_attr(test, assert_instr(vfmulcph))]
2990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2991pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2992    _mm256_maskz_mul_pch(k, a, b)
2993}
2994
2995/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
2996/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2997///
2998/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
2999#[inline]
3000#[target_feature(enable = "avx512fp16")]
3001#[cfg_attr(test, assert_instr(vfmulcph))]
3002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3003pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
3004    _mm512_mul_pch(a, b)
3005}
3006
3007/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3008/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3009/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3010///
3011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3012#[inline]
3013#[target_feature(enable = "avx512fp16")]
3014#[cfg_attr(test, assert_instr(vfmulcph))]
3015#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3016pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3017    _mm512_mask_mul_pch(src, k, a, b)
3018}
3019
3020/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3021/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3022/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3023///
3024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3025#[inline]
3026#[target_feature(enable = "avx512fp16")]
3027#[cfg_attr(test, assert_instr(vfmulcph))]
3028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3029pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3030    _mm512_maskz_mul_pch(k, a, b)
3031}
3032
3033/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3034/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3035/// Rounding is done according to the rounding parameter, which can be one of:
3036///
3037/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3038/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3039/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3040/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3041/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3042///
3043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
3044#[inline]
3045#[target_feature(enable = "avx512fp16")]
3046#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3047#[rustc_legacy_const_generics(2)]
3048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3049pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3050    static_assert_rounding!(ROUNDING);
3051    _mm512_mul_round_pch::<ROUNDING>(a, b)
3052}
3053
3054/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3055/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3056/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3057/// Rounding is done according to the rounding parameter, which can be one of:
3058///
3059/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3060/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3061/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3062/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3064///
3065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
3066#[inline]
3067#[target_feature(enable = "avx512fp16")]
3068#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3069#[rustc_legacy_const_generics(4)]
3070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3071pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
3072    src: __m512h,
3073    k: __mmask16,
3074    a: __m512h,
3075    b: __m512h,
3076) -> __m512h {
3077    static_assert_rounding!(ROUNDING);
3078    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
3079}
3080
3081/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3082/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3083/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3084/// Rounding is done according to the rounding parameter, which can be one of:
3085///
3086/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3087/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3088/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3089/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3090/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3091///
3092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
3093#[inline]
3094#[target_feature(enable = "avx512fp16")]
3095#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3096#[rustc_legacy_const_generics(3)]
3097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3098pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
3099    k: __mmask16,
3100    a: __m512h,
3101    b: __m512h,
3102) -> __m512h {
3103    static_assert_rounding!(ROUNDING);
3104    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
3105}
3106
3107/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
3108/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3109/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3110///
3111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
3112#[inline]
3113#[target_feature(enable = "avx512fp16")]
3114#[cfg_attr(test, assert_instr(vfmulcsh))]
3115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3116pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
3117    _mm_mul_sch(a, b)
3118}
3119
3120/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3121/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3122/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3123///
3124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
3125#[inline]
3126#[target_feature(enable = "avx512fp16")]
3127#[cfg_attr(test, assert_instr(vfmulcsh))]
3128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3129pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3130    _mm_mask_mul_sch(src, k, a, b)
3131}
3132
3133/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3134/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3135/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3136///
3137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
3138#[inline]
3139#[target_feature(enable = "avx512fp16")]
3140#[cfg_attr(test, assert_instr(vfmulcsh))]
3141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3142pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3143    _mm_maskz_mul_sch(k, a, b)
3144}
3145
3146/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
3147/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3148///
3149/// Rounding is done according to the rounding parameter, which can be one of:
3150///
3151/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3152/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3153/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3154/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3156///
3157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
3158#[inline]
3159#[target_feature(enable = "avx512fp16")]
3160#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3161#[rustc_legacy_const_generics(2)]
3162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3163pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3164    static_assert_rounding!(ROUNDING);
3165    _mm_mul_round_sch::<ROUNDING>(a, b)
3166}
3167
3168/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3169/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3170/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3171///
3172/// Rounding is done according to the rounding parameter, which can be one of:
3173///
3174/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3175/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3176/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3177/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3179///
3180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
3181#[inline]
3182#[target_feature(enable = "avx512fp16")]
3183#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3184#[rustc_legacy_const_generics(4)]
3185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3186pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
3187    src: __m128h,
3188    k: __mmask8,
3189    a: __m128h,
3190    b: __m128h,
3191) -> __m128h {
3192    static_assert_rounding!(ROUNDING);
3193    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
3194}
3195
3196/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3197/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3198/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3199///
3200/// Rounding is done according to the rounding parameter, which can be one of:
3201///
3202/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3203/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3204/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3205/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3206/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3207///
3208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
3209#[inline]
3210#[target_feature(enable = "avx512fp16")]
3211#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3212#[rustc_legacy_const_generics(3)]
3213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3214pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
3215    k: __mmask8,
3216    a: __m128h,
3217    b: __m128h,
3218) -> __m128h {
3219    static_assert_rounding!(ROUNDING);
3220    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
3221}
3222
3223/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3224/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3225/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3226/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3227///
3228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
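///
/// A minimal sketch of the conjugate multiply (illustrative only, not compiled as a doctest; it
/// assumes a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features and a CPU with
/// AVX512-FP16 and AVX512VL support; `demo` is a hypothetical caller):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     // Every complex pair in `a` encodes 1 + 2i and every pair in `b` encodes 3 + 4i.
///     let a = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
///     let b = _mm_set_ph(4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0);
///     let r = _mm_cmul_pch(a, b);
///     // (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i, so each pair of r is [11.0, 2.0].
/// }
/// ```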
3229#[inline]
3230#[target_feature(enable = "avx512fp16,avx512vl")]
3231#[cfg_attr(test, assert_instr(vfcmulcph))]
3232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3233pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3234    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3235}
3236
3237/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3238/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3239/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3240/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3241///
3242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
3243#[inline]
3244#[target_feature(enable = "avx512fp16,avx512vl")]
3245#[cfg_attr(test, assert_instr(vfcmulcph))]
3246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3247pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3248    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
3249}
3250
3251/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3252/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3253/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3254/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3255///
3256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3257#[inline]
3258#[target_feature(enable = "avx512fp16,avx512vl")]
3259#[cfg_attr(test, assert_instr(vfcmulcph))]
3260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3261pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3262    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3263}
3264
3265/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3266/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3267/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3268/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3269///
3270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3271#[inline]
3272#[target_feature(enable = "avx512fp16,avx512vl")]
3273#[cfg_attr(test, assert_instr(vfcmulcph))]
3274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3275pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3276    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3277}
3278
3279/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3280/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3281/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3282/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3283///
3284/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
3285#[inline]
3286#[target_feature(enable = "avx512fp16,avx512vl")]
3287#[cfg_attr(test, assert_instr(vfcmulcph))]
3288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3289pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3290    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
3291}
3292
3293/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3294/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3295/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3296/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3297///
3298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3299#[inline]
3300#[target_feature(enable = "avx512fp16,avx512vl")]
3301#[cfg_attr(test, assert_instr(vfcmulcph))]
3302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3303pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3304    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3305}
3306
3307/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3308/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3310/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3311///
3312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3313#[inline]
3314#[target_feature(enable = "avx512fp16")]
3315#[cfg_attr(test, assert_instr(vfcmulcph))]
3316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3317pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3318    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3319}
3320
3321/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3322/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3323/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3324/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3325///
3326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
3327#[inline]
3328#[target_feature(enable = "avx512fp16")]
3329#[cfg_attr(test, assert_instr(vfcmulcph))]
3330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3331pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3332    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3333}
3334
3335/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3336/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3337/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3338/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3339///
3340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3341#[inline]
3342#[target_feature(enable = "avx512fp16")]
3343#[cfg_attr(test, assert_instr(vfcmulcph))]
3344#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3345pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3346    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3347}
3348
3349/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3350/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3351/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3352/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3353///
3354/// Rounding is done according to the rounding parameter, which can be one of:
3355///
3356/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3357/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3358/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3359/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3360/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3361///
3362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
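///
/// For example, round-to-nearest with exceptions suppressed corresponds to the raw value `8`,
/// matching the `ROUNDING = 8` used by the `assert_instr` check below (a sketch with the
/// numeric constant values):
///
/// ```
/// // _MM_FROUND_TO_NEAREST_INT (0x00) | _MM_FROUND_NO_EXC (0x08)
/// let rounding: i32 = 0x00 | 0x08;
/// assert_eq!(rounding, 8);
/// ```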
3363#[inline]
3364#[target_feature(enable = "avx512fp16")]
3365#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3366#[rustc_legacy_const_generics(2)]
3367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3368pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3369    static_assert_rounding!(ROUNDING);
3370    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3371}
3372
3373/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3374/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3375/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3376/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3377///
3378/// Rounding is done according to the rounding parameter, which can be one of:
3379///
3380/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3381/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3382/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3383/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3384/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3385///
3386/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3387#[inline]
3388#[target_feature(enable = "avx512fp16")]
3389#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3390#[rustc_legacy_const_generics(4)]
3391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3392pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3393    src: __m512h,
3394    k: __mmask16,
3395    a: __m512h,
3396    b: __m512h,
3397) -> __m512h {
3398    unsafe {
3399        static_assert_rounding!(ROUNDING);
3400        transmute(vfcmulcph_512(
3401            transmute(a),
3402            transmute(b),
3403            transmute(src),
3404            k,
3405            ROUNDING,
3406        ))
3407    }
3408}
3409
3410/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3411/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3413/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3414///
3415/// Rounding is done according to the rounding parameter, which can be one of:
3416///
3417/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3418/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3419/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3420/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3421/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3422///
3423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3424#[inline]
3425#[target_feature(enable = "avx512fp16")]
3426#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3427#[rustc_legacy_const_generics(3)]
3428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3429pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3430    k: __mmask16,
3431    a: __m512h,
3432    b: __m512h,
3433) -> __m512h {
3434    static_assert_rounding!(ROUNDING);
3435    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3436}
3437
3438/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3439/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3440/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3441///
3442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3443#[inline]
3444#[target_feature(enable = "avx512fp16")]
3445#[cfg_attr(test, assert_instr(vfcmulcsh))]
3446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3447pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3448    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
3449}
3450
3451/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3452/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3453/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3454/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3455///
3456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
3457#[inline]
3458#[target_feature(enable = "avx512fp16")]
3459#[cfg_attr(test, assert_instr(vfcmulcsh))]
3460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3461pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3462    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3463}
3464
3465/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3466/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3467/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3468/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3469///
3470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3471#[inline]
3472#[target_feature(enable = "avx512fp16")]
3473#[cfg_attr(test, assert_instr(vfcmulcsh))]
3474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3475pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3476    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
3477}
3478
3479/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3480/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3481/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3482///
3483/// Rounding is done according to the rounding parameter, which can be one of:
3484///
3485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3490///
3491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3492#[inline]
3493#[target_feature(enable = "avx512fp16")]
3494#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3495#[rustc_legacy_const_generics(2)]
3496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3497pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3498    static_assert_rounding!(ROUNDING);
3499    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
3500}
3501
3502/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3503/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3504/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3505/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3506///
3507/// Rounding is done according to the rounding parameter, which can be one of:
3508///
3509/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3510/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3511/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3512/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3513/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3514///
3515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3516#[inline]
3517#[target_feature(enable = "avx512fp16")]
3518#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3519#[rustc_legacy_const_generics(4)]
3520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3521pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3522    src: __m128h,
3523    k: __mmask8,
3524    a: __m128h,
3525    b: __m128h,
3526) -> __m128h {
3527    unsafe {
3528        static_assert_rounding!(ROUNDING);
3529        transmute(vfcmulcsh(
3530            transmute(a),
3531            transmute(b),
3532            transmute(src),
3533            k,
3534            ROUNDING,
3535        ))
3536    }
3537}
3538
3539/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3540/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3541/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3543///
3544/// Rounding is done according to the rounding parameter, which can be one of:
3545///
3546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3551///
3552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
3553#[inline]
3554#[target_feature(enable = "avx512fp16")]
3555#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3556#[rustc_legacy_const_generics(3)]
3557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3558pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
3559    k: __mmask8,
3560    a: __m128h,
3561    b: __m128h,
3562) -> __m128h {
3563    static_assert_rounding!(ROUNDING);
3564    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
3565}
3566
3567/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3568/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3569/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3570/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3571///
3572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
3573#[inline]
3574#[target_feature(enable = "avx512fp16,avx512vl")]
3575#[cfg_attr(test, assert_instr(vfcmulcph))]
3576#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3577pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
3578    _mm_cmul_pch(a, b)
3579}
3580
3581/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3582/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3583/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3584/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3585///
3586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
3587#[inline]
3588#[target_feature(enable = "avx512fp16,avx512vl")]
3589#[cfg_attr(test, assert_instr(vfcmulcph))]
3590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3591pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3592    _mm_mask_cmul_pch(src, k, a, b)
3593}
3594
3595/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3596/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3597/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3598/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3599///
3600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
3601#[inline]
3602#[target_feature(enable = "avx512fp16,avx512vl")]
3603#[cfg_attr(test, assert_instr(vfcmulcph))]
3604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3605pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3606    _mm_maskz_cmul_pch(k, a, b)
3607}
3608
3609/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3610/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3611/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3612/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3613///
3614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
3615#[inline]
3616#[target_feature(enable = "avx512fp16,avx512vl")]
3617#[cfg_attr(test, assert_instr(vfcmulcph))]
3618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3619pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
3620    _mm256_cmul_pch(a, b)
3621}
3622
3623/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3624/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3625/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3626/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3627///
3628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
3629#[inline]
3630#[target_feature(enable = "avx512fp16,avx512vl")]
3631#[cfg_attr(test, assert_instr(vfcmulcph))]
3632#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3633pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3634    _mm256_mask_cmul_pch(src, k, a, b)
3635}
3636
3637/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3638/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3639/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3640/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3641///
3642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
3643#[inline]
3644#[target_feature(enable = "avx512fp16,avx512vl")]
3645#[cfg_attr(test, assert_instr(vfcmulcph))]
3646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3647pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3648    _mm256_maskz_cmul_pch(k, a, b)
3649}
3650
3651/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3652/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3653/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3654/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3655///
3656/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
3657#[inline]
3658#[target_feature(enable = "avx512fp16")]
3659#[cfg_attr(test, assert_instr(vfcmulcph))]
3660#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3661pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
3662    _mm512_cmul_pch(a, b)
3663}
3664
3665/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3666/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3667/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3668/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3669///
3670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
3671#[inline]
3672#[target_feature(enable = "avx512fp16")]
3673#[cfg_attr(test, assert_instr(vfcmulcph))]
3674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3675pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3676    _mm512_mask_cmul_pch(src, k, a, b)
3677}
3678
3679/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3680/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3681/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3682/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3683///
3684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
3685#[inline]
3686#[target_feature(enable = "avx512fp16")]
3687#[cfg_attr(test, assert_instr(vfcmulcph))]
3688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3689pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3690    _mm512_maskz_cmul_pch(k, a, b)
3691}
3692
3693/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3694/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3695/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3696///
3697/// Rounding is done according to the rounding parameter, which can be one of:
3698///
3699/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3700/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3701/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3702/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3703/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3704///
3705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
3706#[inline]
3707#[target_feature(enable = "avx512fp16")]
3708#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3709#[rustc_legacy_const_generics(2)]
3710#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3711pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3712    static_assert_rounding!(ROUNDING);
3713    _mm512_cmul_round_pch::<ROUNDING>(a, b)
3714}
3715
3716/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3717/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3718/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3719/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3720///
3721/// Rounding is done according to the rounding parameter, which can be one of:
3722///
3723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3728///
3729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
3730#[inline]
3731#[target_feature(enable = "avx512fp16")]
3732#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3733#[rustc_legacy_const_generics(4)]
3734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3735pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
3736    src: __m512h,
3737    k: __mmask16,
3738    a: __m512h,
3739    b: __m512h,
3740) -> __m512h {
3741    static_assert_rounding!(ROUNDING);
3742    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
3743}
3744
3745/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3746/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3747/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3748/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3749///
3750/// Rounding is done according to the rounding parameter, which can be one of:
3751///
3752/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3753/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3754/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3755/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3756/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3757///
3758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
3759#[inline]
3760#[target_feature(enable = "avx512fp16")]
3761#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3762#[rustc_legacy_const_generics(3)]
3763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3764pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
3765    k: __mmask16,
3766    a: __m512h,
3767    b: __m512h,
3768) -> __m512h {
3769    static_assert_rounding!(ROUNDING);
3770    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
3771}
3772
3773/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3774/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3775/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3776/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3777///
3778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
3779#[inline]
3780#[target_feature(enable = "avx512fp16")]
3781#[cfg_attr(test, assert_instr(vfcmulcsh))]
3782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3783pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
3784    _mm_cmul_sch(a, b)
3785}
3786
3787/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3788/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3789/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3790/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3791///
3792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
3793#[inline]
3794#[target_feature(enable = "avx512fp16")]
3795#[cfg_attr(test, assert_instr(vfcmulcsh))]
3796#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3797pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3798    _mm_mask_cmul_sch(src, k, a, b)
3799}
3800
3801/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3802/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3803/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3804/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3805///
3806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
3807#[inline]
3808#[target_feature(enable = "avx512fp16")]
3809#[cfg_attr(test, assert_instr(vfcmulcsh))]
3810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3811pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3812    _mm_maskz_cmul_sch(k, a, b)
3813}
3814
3815/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3816/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3817/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3818///
3819/// Rounding is done according to the rounding parameter, which can be one of:
3820///
3821/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3822/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3823/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3824/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3825/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3826///
3827/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
3828#[inline]
3829#[target_feature(enable = "avx512fp16")]
3830#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3831#[rustc_legacy_const_generics(2)]
3832#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3833pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3834    static_assert_rounding!(ROUNDING);
3835    _mm_cmul_round_sch::<ROUNDING>(a, b)
3836}
3837
3838/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3839/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3840/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3841/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3842///
3843/// Rounding is done according to the rounding parameter, which can be one of:
3844///
3845/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3846/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3847/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3848/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3849/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3850///
3851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
3852#[inline]
3853#[target_feature(enable = "avx512fp16")]
3854#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3855#[rustc_legacy_const_generics(4)]
3856#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3857pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
3858    src: __m128h,
3859    k: __mmask8,
3860    a: __m128h,
3861    b: __m128h,
3862) -> __m128h {
3863    static_assert_rounding!(ROUNDING);
3864    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
3865}
3866
3867/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3868/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3869/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3870/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3871///
3872/// Rounding is done according to the rounding parameter, which can be one of:
3873///
3874/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3875/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3876/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3877/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3878/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3879///
3880/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
3881#[inline]
3882#[target_feature(enable = "avx512fp16")]
3883#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3884#[rustc_legacy_const_generics(3)]
3885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3886pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
3887    k: __mmask8,
3888    a: __m128h,
3889    b: __m128h,
3890) -> __m128h {
3891    static_assert_rounding!(ROUNDING);
3892    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
3893}
3894
3895/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3896/// the results in dst.
3897///
3898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
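///
/// The same sign-bit-clearing idea, sketched on a single `f32` for illustration (the
/// intrinsic clears the sign bit of every 16-bit element):
///
/// ```
/// let x = -3.5f32;
/// let abs = f32::from_bits(x.to_bits() & 0x7fff_ffff); // clear the sign bit
/// assert_eq!(abs, 3.5);
/// ```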
3899#[inline]
3900#[target_feature(enable = "avx512fp16,avx512vl")]
3901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3902pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
3903    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
3904}
3905
3906/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3907/// the results in dst.
3908///
3909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
3910#[inline]
3911#[target_feature(enable = "avx512fp16,avx512vl")]
3912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3913pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
3914    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
3915}
3916
3917/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3918/// the results in dst.
3919///
3920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
3921#[inline]
3922#[target_feature(enable = "avx512fp16")]
3923#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3924pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
3925    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
3926}
3927
3928/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
3929/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
3930/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
3931/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3932///
3933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
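///
/// Conjugation only flips the sign bit of the imaginary half of each complex pair; the same
/// bit trick, sketched on a single `f32` for illustration:
///
/// ```
/// let im = 2.5f32;
/// let negated = f32::from_bits(im.to_bits() ^ 0x8000_0000); // flip the sign bit
/// assert_eq!(negated, -2.5);
/// ```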
3934#[inline]
3935#[target_feature(enable = "avx512fp16,avx512vl")]
3936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3937pub fn _mm_conj_pch(a: __m128h) -> __m128h {
3938    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
3939}
3940
3941/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3942/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3943/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
3944/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3945///
3946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
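///
/// A scalar sketch of the writemask selection (illustrative only; two lanes with hypothetical
/// values, instead of the four complex lanes of a 128-bit vector):
///
/// ```
/// let conj = [(1.0f32, -2.0f32), (3.0, -4.0)]; // per-lane conj(a) (hypothetical values)
/// let src = [(9.0f32, 9.0f32), (8.0, 8.0)];    // fallback values
/// let k: u8 = 0b10; // writemask: only lane 1 is selected
/// let mut dst = src;
/// for i in 0..2 {
///     if (k >> i) & 1 == 1 {
///         dst[i] = conj[i];
///     }
/// }
/// assert_eq!(dst, [(9.0, 9.0), (3.0, -4.0)]);
/// ```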
3947#[inline]
3948#[target_feature(enable = "avx512fp16,avx512vl")]
3949#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3950pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
3951    unsafe {
3952        let r: __m128 = transmute(_mm_conj_pch(a));
3953        transmute(simd_select_bitmask(k, r, transmute(src)))
3954    }
3955}
3956
3957/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3958/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
3959/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3960/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3961///
3962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
3963#[inline]
3964#[target_feature(enable = "avx512fp16,avx512vl")]
3965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3966pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
3967    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
3968}
3969
3970/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
3971/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3972/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3973///
3974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
3975#[inline]
3976#[target_feature(enable = "avx512fp16,avx512vl")]
3977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3978pub fn _mm256_conj_pch(a: __m256h) -> __m256h {
3979    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
3980}
3981
3982/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3983/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3984/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3985/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3986///
3987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
3988#[inline]
3989#[target_feature(enable = "avx512fp16,avx512vl")]
3990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3991pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
3992    unsafe {
3993        let r: __m256 = transmute(_mm256_conj_pch(a));
3994        transmute(simd_select_bitmask(k, r, transmute(src)))
3995    }
3996}
3997
3998/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3999/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4000/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4001/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4002///
4003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
4004#[inline]
4005#[target_feature(enable = "avx512fp16,avx512vl")]
4006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4007pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
4008    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
4009}
4010
4011/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4012/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4013/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4014///
4015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4016#[inline]
4017#[target_feature(enable = "avx512fp16")]
4018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4019pub fn _mm512_conj_pch(a: __m512h) -> __m512h {
4020    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4021}
4022
4023/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4024/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4025/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4026/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4027///
4028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4029#[inline]
4030#[target_feature(enable = "avx512fp16")]
4031#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4032pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4033    unsafe {
4034        let r: __m512 = transmute(_mm512_conj_pch(a));
4035        transmute(simd_select_bitmask(k, r, transmute(src)))
4036    }
4037}
4038
4039/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4040/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4041/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4042/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4043///
4044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
4045#[inline]
4046#[target_feature(enable = "avx512fp16")]
4047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4048pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
4049    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
4050}
4051
4052/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4053/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4054/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4055///
4056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
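///
/// A minimal scalar sketch (illustrative only, using `f32` in place of `f16`) of the
/// per-lane math `c + a * b`:
///
/// ```
/// let (a_re, a_im) = (1.0f32, 2.0f32);
/// let (b_re, b_im) = (3.0f32, 4.0f32);
/// let (c_re, c_im) = (0.5f32, -0.5f32);
/// // a * b = (a_re*b_re - a_im*b_im) + i*(a_re*b_im + a_im*b_re)
/// let re = c_re + (a_re * b_re - a_im * b_im);
/// let im = c_im + (a_re * b_im + a_im * b_re);
/// assert_eq!((re, im), (-4.5, 9.5));
/// ```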
4057#[inline]
4058#[target_feature(enable = "avx512fp16,avx512vl")]
4059#[cfg_attr(test, assert_instr(vfmaddcph))]
4060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4061pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4062    _mm_mask3_fmadd_pch(a, b, c, 0xff)
4063}
4064
4065/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4066/// and store the results in dst using writemask k (the element is copied from a when the corresponding
4067/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4068/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4069///
4070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
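///
/// A scalar sketch of the selection (illustrative only, two lanes with hypothetical values):
/// unselected lanes are taken from `a`, unlike the `mask3` variant, which takes them from `c`:
///
/// ```
/// let fmadd = [(-4.5f32, 9.5f32), (2.0, 2.0)]; // per-lane c + a*b (hypothetical values)
/// let a = [(1.0f32, 2.0f32), (5.0, 6.0)];
/// let k: u8 = 0b01; // writemask: only lane 0 is selected
/// let mut dst = a;
/// for i in 0..2 {
///     if (k >> i) & 1 == 1 {
///         dst[i] = fmadd[i];
///     }
/// }
/// assert_eq!(dst, [(-4.5, 9.5), (5.0, 6.0)]);
/// ```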
4071#[inline]
4072#[target_feature(enable = "avx512fp16,avx512vl")]
4073#[cfg_attr(test, assert_instr(vfmaddcph))]
4074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4075pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4076    unsafe {
4077        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4078        transmute(simd_select_bitmask(k, r, transmute(a)))
4079    }
4080}
4081
4082/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4083/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4084/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4085/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4086///
4087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
4088#[inline]
4089#[target_feature(enable = "avx512fp16,avx512vl")]
4090#[cfg_attr(test, assert_instr(vfmaddcph))]
4091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4092pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4093    unsafe {
4094        transmute(vfmaddcph_mask3_128(
4095            transmute(a),
4096            transmute(b),
4097            transmute(c),
4098            k,
4099        ))
4100    }
4101}
4102
4103/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4104/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4105/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4106/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4107///
4108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
4109#[inline]
4110#[target_feature(enable = "avx512fp16,avx512vl")]
4111#[cfg_attr(test, assert_instr(vfmaddcph))]
4112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4113pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4114    unsafe {
4115        transmute(vfmaddcph_maskz_128(
4116            transmute(a),
4117            transmute(b),
4118            transmute(c),
4119            k,
4120        ))
4121    }
4122}
4123
4124/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4125/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4126/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4127///
4128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
4129#[inline]
4130#[target_feature(enable = "avx512fp16,avx512vl")]
4131#[cfg_attr(test, assert_instr(vfmaddcph))]
4132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4133pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4134    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
4135}
4136
4137/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4138/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4139/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4140/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4141///
4142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
4143#[inline]
4144#[target_feature(enable = "avx512fp16,avx512vl")]
4145#[cfg_attr(test, assert_instr(vfmaddcph))]
4146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4147pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4148    unsafe {
4149        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4150        transmute(simd_select_bitmask(k, r, transmute(a)))
4151    }
4152}
4153
4154/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4155/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4156/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4157/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4158///
4159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
4160#[inline]
4161#[target_feature(enable = "avx512fp16,avx512vl")]
4162#[cfg_attr(test, assert_instr(vfmaddcph))]
4163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4164pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4165    unsafe {
4166        transmute(vfmaddcph_mask3_256(
4167            transmute(a),
4168            transmute(b),
4169            transmute(c),
4170            k,
4171        ))
4172    }
4173}
4174
4175/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4176/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4177/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4178/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4179///
4180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
4181#[inline]
4182#[target_feature(enable = "avx512fp16,avx512vl")]
4183#[cfg_attr(test, assert_instr(vfmaddcph))]
4184#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4185pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4186    unsafe {
4187        transmute(vfmaddcph_maskz_256(
4188            transmute(a),
4189            transmute(b),
4190            transmute(c),
4191            k,
4192        ))
4193    }
4194}
4195
4196/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4197/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4198/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4199///
4200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
4201#[inline]
4202#[target_feature(enable = "avx512fp16")]
4203#[cfg_attr(test, assert_instr(vfmaddcph))]
4204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4205pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4206    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4207}
4208
4209/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4210/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4211/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4212/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4213///
4214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
4215#[inline]
4216#[target_feature(enable = "avx512fp16")]
4217#[cfg_attr(test, assert_instr(vfmaddcph))]
4218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4219pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4220    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4221}
4222
4223/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4224/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4225/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4226/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4227///
4228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
4229#[inline]
4230#[target_feature(enable = "avx512fp16")]
4231#[cfg_attr(test, assert_instr(vfmaddcph))]
4232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4233pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4234    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4235}
4236
4237/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4238/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4239/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4240/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4241///
4242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
4243#[inline]
4244#[target_feature(enable = "avx512fp16")]
4245#[cfg_attr(test, assert_instr(vfmaddcph))]
4246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4247pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4248    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4249}
4250
4251/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4252/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4253/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4254///
4255/// Rounding is done according to the rounding parameter, which can be one of:
4256///
4257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4262///
4263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
4264#[inline]
4265#[target_feature(enable = "avx512fp16")]
4266#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4267#[rustc_legacy_const_generics(3)]
4268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4269pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4270    static_assert_rounding!(ROUNDING);
4271    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4272}
4273
4274/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4275/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4276/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4277/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4278///
4279/// Rounding is done according to the rounding parameter, which can be one of:
4280///
4281/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4282/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4283/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4284/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4285/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4286///
4287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
4288#[inline]
4289#[target_feature(enable = "avx512fp16")]
4290#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4291#[rustc_legacy_const_generics(4)]
4292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4293pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
4294    a: __m512h,
4295    k: __mmask16,
4296    b: __m512h,
4297    c: __m512h,
4298) -> __m512h {
4299    unsafe {
4300        static_assert_rounding!(ROUNDING);
4301        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4302        transmute(simd_select_bitmask(k, r, transmute(a)))
4303    }
4304}
4305
4306/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4307/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4308/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4310///
4311/// Rounding is done according to the rounding parameter, which can be one of:
4312///
4313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4318///
4319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
4320#[inline]
4321#[target_feature(enable = "avx512fp16")]
4322#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4323#[rustc_legacy_const_generics(4)]
4324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4325pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
4326    a: __m512h,
4327    b: __m512h,
4328    c: __m512h,
4329    k: __mmask16,
4330) -> __m512h {
4331    unsafe {
4332        static_assert_rounding!(ROUNDING);
4333        transmute(vfmaddcph_mask3_512(
4334            transmute(a),
4335            transmute(b),
4336            transmute(c),
4337            k,
4338            ROUNDING,
4339        ))
4340    }
4341}
4342
4343/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4344/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4345/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4346/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4347///
4348/// Rounding is done according to the rounding parameter, which can be one of:
4349///
4350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4355///
4356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
4357#[inline]
4358#[target_feature(enable = "avx512fp16")]
4359#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4360#[rustc_legacy_const_generics(4)]
4361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4362pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
4363    k: __mmask16,
4364    a: __m512h,
4365    b: __m512h,
4366    c: __m512h,
4367) -> __m512h {
4368    unsafe {
4369        static_assert_rounding!(ROUNDING);
4370        transmute(vfmaddcph_maskz_512(
4371            transmute(a),
4372            transmute(b),
4373            transmute(c),
4374            k,
4375            ROUNDING,
4376        ))
4377    }
4378}
4379
4380/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4381/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4382/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
4383/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4384///
4385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
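///
/// A minimal illustrative sketch (not part of Intel's documentation); it assumes the
/// `avx512fp16` target feature is available:
///
/// ```ignore
/// let a = _mm_set_ph(7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 2.0, 1.0); // lower complex lane is 1 + 2i
/// let b = _mm_set_sh(3.0);                                    // lower complex lane is 3 + 0i
/// let c = _mm_set_sh(10.0);                                   // lower complex lane is 10 + 0i
/// // Lower lane: (1 + 2i) * (3 + 0i) + (10 + 0i) = 13 + 6i; elements 2..=7 are copied from `a` (7.0).
/// let dst = _mm_fmadd_sch(a, b, c);
/// ```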
4386#[inline]
4387#[target_feature(enable = "avx512fp16")]
4388#[cfg_attr(test, assert_instr(vfmaddcsh))]
4389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4390pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4391    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4392}
4393
4394/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4395/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4396/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4397/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4398/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4399///
4400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
4401#[inline]
4402#[target_feature(enable = "avx512fp16")]
4403#[cfg_attr(test, assert_instr(vfmaddcsh))]
4404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4405pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4406    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4407}
4408
4409/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4410/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4411/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4413/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4414///
4415/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
4416#[inline]
4417#[target_feature(enable = "avx512fp16")]
4418#[cfg_attr(test, assert_instr(vfmaddcsh))]
4419#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4420pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4421    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4422}
4423
4424/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4425/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4426/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4427/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4428/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4429///
4430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
4431#[inline]
4432#[target_feature(enable = "avx512fp16")]
4433#[cfg_attr(test, assert_instr(vfmaddcsh))]
4434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4435pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4436    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4437}
4438
4439/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4440/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
4441/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4442///
4443/// Rounding is done according to the rounding parameter, which can be one of:
4444///
4445/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4446/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4447/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4448/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4449/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4450///
4451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
4452#[inline]
4453#[target_feature(enable = "avx512fp16")]
4454#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4455#[rustc_legacy_const_generics(3)]
4456#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4457pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4458    unsafe {
4459        static_assert_rounding!(ROUNDING);
4460        transmute(vfmaddcsh_mask(
4461            transmute(a),
4462            transmute(b),
4463            transmute(c),
4464            0xff,
4465            ROUNDING,
4466        ))
4467    }
4468}
4469
4470/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4471/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4472/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4473/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4474/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4475///
4476/// Rounding is done according to the rounding parameter, which can be one of:
4477///
4478/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4479/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4480/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4481/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4482/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4483///
4484/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
4485#[inline]
4486#[target_feature(enable = "avx512fp16")]
4487#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4488#[rustc_legacy_const_generics(4)]
4489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4490pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
4491    a: __m128h,
4492    k: __mmask8,
4493    b: __m128h,
4494    c: __m128h,
4495) -> __m128h {
4496    unsafe {
4497        static_assert_rounding!(ROUNDING);
4498        let a = transmute(a);
4499        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
4500        transmute(_mm_mask_move_ss(a, k, a, r))
4501    }
4502}
4503
4504/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4505/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4506/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4507/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4508/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4509///
4510/// Rounding is done according to the rounding parameter, which can be one of:
4511///
4512/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4513/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4514/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4515/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4516/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4517///
4518/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
4519#[inline]
4520#[target_feature(enable = "avx512fp16")]
4521#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4522#[rustc_legacy_const_generics(4)]
4523#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4524pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
4525    a: __m128h,
4526    b: __m128h,
4527    c: __m128h,
4528    k: __mmask8,
4529) -> __m128h {
4530    unsafe {
4531        static_assert_rounding!(ROUNDING);
4532        let c = transmute(c);
4533        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
4534        transmute(_mm_move_ss(c, r))
4535    }
4536}
4537
4538/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4539/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4540/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4541/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4543///
4544/// Rounding is done according to the rounding parameter, which can be one of:
4545///
4546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4551///
4552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
4553#[inline]
4554#[target_feature(enable = "avx512fp16")]
4555#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4556#[rustc_legacy_const_generics(4)]
4557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4558pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
4559    k: __mmask8,
4560    a: __m128h,
4561    b: __m128h,
4562    c: __m128h,
4563) -> __m128h {
4564    unsafe {
4565        static_assert_rounding!(ROUNDING);
4566        transmute(vfmaddcsh_maskz(
4567            transmute(a),
4568            transmute(b),
4569            transmute(c),
4570            k,
4571            ROUNDING,
4572        ))
4573    }
4574}
4575
4576/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4577/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4578/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4579/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4580///
4581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
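///
/// A minimal illustrative sketch of a single complex lane (not part of Intel's
/// documentation); it assumes the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// // Element 0 is the real part and element 1 the imaginary part of complex lane 0.
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // lane 0 = 1 + 2i
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // lane 0 = 3 + 4i
/// let c = _mm_setzero_ph();
/// // Lane 0: (1 + 2i) * conj(3 + 4i) + 0 = (1 + 2i) * (3 - 4i) = 11 + 2i.
/// let dst = _mm_fcmadd_pch(a, b, c);
/// ```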
4582#[inline]
4583#[target_feature(enable = "avx512fp16,avx512vl")]
4584#[cfg_attr(test, assert_instr(vfcmaddcph))]
4585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4586pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4587    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
4588}
4589
4590/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4591/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4592/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4593/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4594/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4595///
4596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
4597#[inline]
4598#[target_feature(enable = "avx512fp16,avx512vl")]
4599#[cfg_attr(test, assert_instr(vfcmaddcph))]
4600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4601pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4602    unsafe {
4603        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4604        transmute(simd_select_bitmask(k, r, transmute(a)))
4605    }
4606}
4607
4608/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4609/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4610/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4611/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4612/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4613///
4614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
4615#[inline]
4616#[target_feature(enable = "avx512fp16,avx512vl")]
4617#[cfg_attr(test, assert_instr(vfcmaddcph))]
4618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4619pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4620    unsafe {
4621        transmute(vfcmaddcph_mask3_128(
4622            transmute(a),
4623            transmute(b),
4624            transmute(c),
4625            k,
4626        ))
4627    }
4628}
4629
4630/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4631/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4632/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4633/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4634/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4635///
4636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
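///
/// A minimal illustrative sketch of the zeromask (not part of Intel's documentation);
/// each mask bit covers one complex lane, and it assumes the `avx512fp16` and `avx512vl`
/// target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(1.0); // every complex lane is 1 + 1i
/// let b = _mm_set1_ph(2.0); // every complex lane is 2 + 2i
/// let c = _mm_setzero_ph();
/// // Only lane 0 is computed: (1 + 1i) * conj(2 + 2i) = (1 + 1i) * (2 - 2i) = 4 + 0i;
/// // lanes 1..=3 are zeroed because their mask bits are clear.
/// let dst = _mm_maskz_fcmadd_pch(0b0001, a, b, c);
/// ```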
4637#[inline]
4638#[target_feature(enable = "avx512fp16,avx512vl")]
4639#[cfg_attr(test, assert_instr(vfcmaddcph))]
4640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4641pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4642    unsafe {
4643        transmute(vfcmaddcph_maskz_128(
4644            transmute(a),
4645            transmute(b),
4646            transmute(c),
4647            k,
4648        ))
4649    }
4650}
4651
4652/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4653/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4654/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4655/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4656///
4657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
4658#[inline]
4659#[target_feature(enable = "avx512fp16,avx512vl")]
4660#[cfg_attr(test, assert_instr(vfcmaddcph))]
4661#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4662pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4663    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
4664}
4665
4666/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4667/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4668/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4669/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4670/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4671///
4672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4673#[inline]
4674#[target_feature(enable = "avx512fp16,avx512vl")]
4675#[cfg_attr(test, assert_instr(vfcmaddcph))]
4676#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4677pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4678    unsafe {
4679        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4680        transmute(simd_select_bitmask(k, r, transmute(a)))
4681    }
4682}
4683
4684/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4685/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4686/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4687/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4688/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4689///
4690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4691#[inline]
4692#[target_feature(enable = "avx512fp16,avx512vl")]
4693#[cfg_attr(test, assert_instr(vfcmaddcph))]
4694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4695pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4696    unsafe {
4697        transmute(vfcmaddcph_mask3_256(
4698            transmute(a),
4699            transmute(b),
4700            transmute(c),
4701            k,
4702        ))
4703    }
4704}
4705
4706/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4707/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4708/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4709/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4710/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4711///
4712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4713#[inline]
4714#[target_feature(enable = "avx512fp16,avx512vl")]
4715#[cfg_attr(test, assert_instr(vfcmaddcph))]
4716#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4717pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4718    unsafe {
4719        transmute(vfcmaddcph_maskz_256(
4720            transmute(a),
4721            transmute(b),
4722            transmute(c),
4723            k,
4724        ))
4725    }
4726}
4727
4728/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4729/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4730/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4731/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4732///
4733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
4734#[inline]
4735#[target_feature(enable = "avx512fp16")]
4736#[cfg_attr(test, assert_instr(vfcmaddcph))]
4737#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4738pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4739    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4740}
4741
4742/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4743/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4744/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4745/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4746/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4747///
4748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
4749#[inline]
4750#[target_feature(enable = "avx512fp16")]
4751#[cfg_attr(test, assert_instr(vfcmaddcph))]
4752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4753pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4754    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4755}
4756
4757/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4758/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4759/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4760/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4761/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4762///
4763/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
4764#[inline]
4765#[target_feature(enable = "avx512fp16")]
4766#[cfg_attr(test, assert_instr(vfcmaddcph))]
4767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4768pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4769    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4770}
4771
4772/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4773/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4774/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4775/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4776/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4777///
4778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
4779#[inline]
4780#[target_feature(enable = "avx512fp16")]
4781#[cfg_attr(test, assert_instr(vfcmaddcph))]
4782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4783pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4784    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4785}
4786
4787/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4788/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4789/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4790/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4791///
4792/// Rounding is done according to the rounding parameter, which can be one of:
4793///
4794/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4795/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4796/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4797/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4798/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4799///
4800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
4801#[inline]
4802#[target_feature(enable = "avx512fp16")]
4803#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4804#[rustc_legacy_const_generics(3)]
4805#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4806pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4807    static_assert_rounding!(ROUNDING);
4808    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4809}
4810
4811/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4812/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4813/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4814/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4815/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4816///
4817/// Rounding is done according to the rounding parameter, which can be one of:
4818///
4819/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4820/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4821/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4822/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4823/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4824///
4825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4826#[inline]
4827#[target_feature(enable = "avx512fp16")]
4828#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4829#[rustc_legacy_const_generics(4)]
4830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4831pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4832    a: __m512h,
4833    k: __mmask16,
4834    b: __m512h,
4835    c: __m512h,
4836) -> __m512h {
4837    unsafe {
4838        static_assert_rounding!(ROUNDING);
4839        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4840        transmute(simd_select_bitmask(k, r, transmute(a)))
4841    }
4842}
4843
4844/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4845/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
4846/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4847/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4848/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4849///
4850/// Rounding is done according to the rounding parameter, which can be one of:
4851///
4852/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4853/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4854/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4855/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4856/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4857///
4858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4859#[inline]
4860#[target_feature(enable = "avx512fp16")]
4861#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4862#[rustc_legacy_const_generics(4)]
4863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4864pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4865    a: __m512h,
4866    b: __m512h,
4867    c: __m512h,
4868    k: __mmask16,
4869) -> __m512h {
4870    unsafe {
4871        static_assert_rounding!(ROUNDING);
4872        transmute(vfcmaddcph_mask3_512(
4873            transmute(a),
4874            transmute(b),
4875            transmute(c),
4876            k,
4877            ROUNDING,
4878        ))
4879    }
4880}
4881
4882/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4883/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
4884/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4885/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4886/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4887///
4888/// Rounding is done according to the rounding parameter, which can be one of:
4889///
4890/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4891/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4892/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4893/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4894/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4895///
4896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
4897#[inline]
4898#[target_feature(enable = "avx512fp16")]
4899#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4900#[rustc_legacy_const_generics(4)]
4901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4902pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
4903    k: __mmask16,
4904    a: __m512h,
4905    b: __m512h,
4906    c: __m512h,
4907) -> __m512h {
4908    unsafe {
4909        static_assert_rounding!(ROUNDING);
4910        transmute(vfcmaddcph_maskz_512(
4911            transmute(a),
4912            transmute(b),
4913            transmute(c),
4914            k,
4915            ROUNDING,
4916        ))
4917    }
4918}
4919
4920/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4921/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4922/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4923/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4924/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4925///
4926/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
4927#[inline]
4928#[target_feature(enable = "avx512fp16")]
4929#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4930#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4931pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4932    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4933}
4934
4935/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4936/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4937/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
4938/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4939/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4940/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4941///
4942/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
4943#[inline]
4944#[target_feature(enable = "avx512fp16")]
4945#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4946#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4947pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4948    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4949}
4950
4951/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4952/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4953/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
4954/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4955/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4956/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4957///
4958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
4959#[inline]
4960#[target_feature(enable = "avx512fp16")]
4961#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4963pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4964    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4965}
4966
4967/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4968/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4969/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
4970/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4971/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4972/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4973///
4974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
4975#[inline]
4976#[target_feature(enable = "avx512fp16")]
4977#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4978#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4979pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4980    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4981}
4982
4983/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4984/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4985/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4986/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4987/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4988///
4989/// Rounding is done according to the rounding parameter, which can be one of:
4990///
4991/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4992/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4993/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4994/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4995/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4996///
4997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
4998#[inline]
4999#[target_feature(enable = "avx512fp16")]
5000#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5001#[rustc_legacy_const_generics(3)]
5002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5003pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5004    unsafe {
5005        static_assert_rounding!(ROUNDING);
5006        transmute(vfcmaddcsh_mask(
5007            transmute(a),
5008            transmute(b),
5009            transmute(c),
5010            0xff,
5011            ROUNDING,
5012        ))
5013    }
5014}
5015
5016/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5017/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5018/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5019/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5020/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5021/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5022///
5023/// Rounding is done according to the rounding parameter, which can be one of:
5024///
5025/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5026/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5027/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5028/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5029/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5030///
5031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5032#[inline]
5033#[target_feature(enable = "avx512fp16")]
5034#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5035#[rustc_legacy_const_generics(4)]
5036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5037pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5038    a: __m128h,
5039    k: __mmask8,
5040    b: __m128h,
5041    c: __m128h,
5042) -> __m128h {
5043    unsafe {
5044        static_assert_rounding!(ROUNDING);
5045        let a = transmute(a);
5046        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5047        transmute(_mm_mask_move_ss(a, k, a, r))
5048    }
5049}
5050
5051/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5052/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5053/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5054/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5055/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5056/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5057///
5058/// Rounding is done according to the rounding parameter, which can be one of:
5059///
5060/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5061/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5062/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5063/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5064/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5065///
5066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
5067#[inline]
5068#[target_feature(enable = "avx512fp16")]
5069#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5070#[rustc_legacy_const_generics(4)]
5071#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5072pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
5073    a: __m128h,
5074    b: __m128h,
5075    c: __m128h,
5076    k: __mmask8,
5077) -> __m128h {
5078    unsafe {
5079        static_assert_rounding!(ROUNDING);
5080        let c = transmute(c);
5081        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
5082        transmute(_mm_move_ss(c, r))
5083    }
5084}
5085
5086/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5087/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
5088/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
5089/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
5090/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
5091/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5092///
5093/// Rounding is done according to the rounding parameter, which can be one of:
5094///
5095/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5096/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5097/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5098/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5099/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5100///
5101/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5102#[inline]
5103#[target_feature(enable = "avx512fp16")]
5104#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5105#[rustc_legacy_const_generics(4)]
5106#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5107pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5108    k: __mmask8,
5109    a: __m128h,
5110    b: __m128h,
5111    c: __m128h,
5112) -> __m128h {
5113    unsafe {
5114        static_assert_rounding!(ROUNDING);
5115        transmute(vfcmaddcsh_maskz(
5116            transmute(a),
5117            transmute(b),
5118            transmute(c),
5119            k,
5120            ROUNDING,
5121        ))
5122    }
5123}
5124
5125/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5126/// result to packed elements in c, and store the results in dst.
5127///
5128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
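///
/// A minimal illustrative sketch (not part of Intel's documentation); it assumes the
/// `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every element of dst is 2.0 * 3.0 + 1.0 = 7.0, computed with a single rounding step.
/// let dst = _mm_fmadd_ph(a, b, c);
/// ```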
5129#[inline]
5130#[target_feature(enable = "avx512fp16,avx512vl")]
5131#[cfg_attr(test, assert_instr(vfmadd))]
5132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5133pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5134    unsafe { simd_fma(a, b, c) }
5135}
5136
5137/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5138/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5139/// from a when the corresponding mask bit is not set).
5140///
5141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
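///
/// A minimal illustrative sketch of the writemask (not part of Intel's documentation);
/// it assumes the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Elements 0 and 1 become 2.0 * 3.0 + 1.0 = 7.0; elements 2..=7 keep 2.0 from `a`.
/// let dst = _mm_mask_fmadd_ph(a, 0b0000_0011, b, c);
/// ```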
5142#[inline]
5143#[target_feature(enable = "avx512fp16,avx512vl")]
5144#[cfg_attr(test, assert_instr(vfmadd))]
5145#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5146pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5147    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5148}
5149
5150/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5151/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5152/// from c when the corresponding mask bit is not set).
5153///
5154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5155#[inline]
5156#[target_feature(enable = "avx512fp16,avx512vl")]
5157#[cfg_attr(test, assert_instr(vfmadd))]
5158#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5159pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5160    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5161}
5162
5163/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5164/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5165/// out when the corresponding mask bit is not set).
5166///
5167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5168#[inline]
5169#[target_feature(enable = "avx512fp16,avx512vl")]
5170#[cfg_attr(test, assert_instr(vfmadd))]
5171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5172pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5173    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5174}
5175
5176/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5177/// result to packed elements in c, and store the results in dst.
5178///
5179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5180#[inline]
5181#[target_feature(enable = "avx512fp16,avx512vl")]
5182#[cfg_attr(test, assert_instr(vfmadd))]
5183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5184pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5185    unsafe { simd_fma(a, b, c) }
5186}
5187
5188/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5189/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5190/// from a when the corresponding mask bit is not set).
5191///
5192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5193#[inline]
5194#[target_feature(enable = "avx512fp16,avx512vl")]
5195#[cfg_attr(test, assert_instr(vfmadd))]
5196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5197pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5198    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5199}
5200
5201/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5202/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5203/// from c when the corresponding mask bit is not set).
5204///
5205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5206#[inline]
5207#[target_feature(enable = "avx512fp16,avx512vl")]
5208#[cfg_attr(test, assert_instr(vfmadd))]
5209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5210pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5211    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5212}
5213
5214/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5215/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5216/// out when the corresponding mask bit is not set).
5217///
5218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5219#[inline]
5220#[target_feature(enable = "avx512fp16,avx512vl")]
5221#[cfg_attr(test, assert_instr(vfmadd))]
5222#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5223pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5224    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5225}
5226
5227/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5228/// result to packed elements in c, and store the results in dst.
5229///
5230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5231#[inline]
5232#[target_feature(enable = "avx512fp16")]
5233#[cfg_attr(test, assert_instr(vfmadd))]
5234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5235pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5236    unsafe { simd_fma(a, b, c) }
5237}
5238
5239/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5240/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5241/// from a when the corresponding mask bit is not set).
5242///
5243/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5244#[inline]
5245#[target_feature(enable = "avx512fp16")]
5246#[cfg_attr(test, assert_instr(vfmadd))]
5247#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5248pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5249    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5250}
5251
5252/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5253/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5254/// from c when the corresponding mask bit is not set).
5255///
5256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5257#[inline]
5258#[target_feature(enable = "avx512fp16")]
5259#[cfg_attr(test, assert_instr(vfmadd))]
5260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5261pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5262    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5263}
5264
5265/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5266/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5267/// out when the corresponding mask bit is not set).
5268///
5269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5270#[inline]
5271#[target_feature(enable = "avx512fp16")]
5272#[cfg_attr(test, assert_instr(vfmadd))]
5273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5274pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5275    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5276}
5277
5278/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5279/// result to packed elements in c, and store the results in dst.
5280///
5281/// Rounding is done according to the rounding parameter, which can be one of:
5282///
5283/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5284/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5285/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5286/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5287/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5288///
5289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
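///
/// A minimal illustrative sketch (not part of Intel's documentation) of selecting an
/// explicit rounding mode; it assumes the `avx512fp16` target feature is available and
/// that `_mm512_set1_ph` (defined elsewhere in this module) is in scope:
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // Round toward zero and suppress exceptions for the whole fused operation.
/// let dst = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```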
5290#[inline]
5291#[target_feature(enable = "avx512fp16")]
5292#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5293#[rustc_legacy_const_generics(3)]
5294#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5295pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5296    unsafe {
5297        static_assert_rounding!(ROUNDING);
5298        vfmaddph_512(a, b, c, ROUNDING)
5299    }
5300}
5301
5302/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5303/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5304/// from a when the corresponding mask bit is not set).
5305///
5306/// Rounding is done according to the rounding parameter, which can be one of:
5307///
5308/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5309/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5310/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5311/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5312/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5313///
5314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5315#[inline]
5316#[target_feature(enable = "avx512fp16")]
5317#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5318#[rustc_legacy_const_generics(4)]
5319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5320pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5321    a: __m512h,
5322    k: __mmask32,
5323    b: __m512h,
5324    c: __m512h,
5325) -> __m512h {
5326    unsafe {
5327        static_assert_rounding!(ROUNDING);
5328        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5329    }
5330}
5331
5332/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5333/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5334/// from c when the corresponding mask bit is not set).
5335///
5336/// Rounding is done according to the rounding parameter, which can be one of:
5337///
5338/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5339/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5340/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5341/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5342/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5343///
5344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5345#[inline]
5346#[target_feature(enable = "avx512fp16")]
5347#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5348#[rustc_legacy_const_generics(4)]
5349#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5350pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5351    a: __m512h,
5352    b: __m512h,
5353    c: __m512h,
5354    k: __mmask32,
5355) -> __m512h {
5356    unsafe {
5357        static_assert_rounding!(ROUNDING);
5358        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5359    }
5360}
5361
5362/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5363/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5364/// out when the corresponding mask bit is not set).
5365///
5366/// Rounding is done according to the rounding parameter, which can be one of:
5367///
5368/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5369/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5370/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5371/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5372/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5373///
5374/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5375#[inline]
5376#[target_feature(enable = "avx512fp16")]
5377#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5378#[rustc_legacy_const_generics(4)]
5379#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5380pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5381    k: __mmask32,
5382    a: __m512h,
5383    b: __m512h,
5384    c: __m512h,
5385) -> __m512h {
5386    unsafe {
5387        static_assert_rounding!(ROUNDING);
5388        simd_select_bitmask(
5389            k,
5390            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5391            _mm512_setzero_ph(),
5392        )
5393    }
5394}
5395
5396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5397/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5398/// 7 packed elements from a to the upper elements of dst.
5399///
5400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
5401#[inline]
5402#[target_feature(enable = "avx512fp16")]
5403#[cfg_attr(test, assert_instr(vfmadd))]
5404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5405pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5406    unsafe {
5407        let extracta: f16 = simd_extract!(a, 0);
5408        let extractb: f16 = simd_extract!(b, 0);
5409        let extractc: f16 = simd_extract!(c, 0);
5410        let r = fmaf16(extracta, extractb, extractc);
5411        simd_insert!(a, 0, r)
5412    }
5413}
5414
5415/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5416/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5417/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5418/// upper elements of dst.
5419///
5420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
5421#[inline]
5422#[target_feature(enable = "avx512fp16")]
5423#[cfg_attr(test, assert_instr(vfmadd))]
5424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5425pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5426    unsafe {
5427        let mut fmadd: f16 = simd_extract!(a, 0);
5428        if k & 1 != 0 {
5429            let extractb: f16 = simd_extract!(b, 0);
5430            let extractc: f16 = simd_extract!(c, 0);
5431            fmadd = fmaf16(fmadd, extractb, extractc);
5432        }
5433        simd_insert!(a, 0, fmadd)
5434    }
5435}
5436
5437/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5438/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5439/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5440/// upper elements of dst.
5441///
5442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5443#[inline]
5444#[target_feature(enable = "avx512fp16")]
5445#[cfg_attr(test, assert_instr(vfmadd))]
5446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5447pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5448    unsafe {
5449        let mut fmadd: f16 = simd_extract!(c, 0);
5450        if k & 1 != 0 {
5451            let extracta: f16 = simd_extract!(a, 0);
5452            let extractb: f16 = simd_extract!(b, 0);
5453            fmadd = fmaf16(extracta, extractb, fmadd);
5454        }
5455        simd_insert!(c, 0, fmadd)
5456    }
5457}
5458
5459/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5460/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5461/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5462/// upper elements of dst.
5463///
5464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5465#[inline]
5466#[target_feature(enable = "avx512fp16")]
5467#[cfg_attr(test, assert_instr(vfmadd))]
5468#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5469pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5470    unsafe {
5471        let mut fmadd: f16 = 0.0;
5472        if k & 1 != 0 {
5473            let extracta: f16 = simd_extract!(a, 0);
5474            let extractb: f16 = simd_extract!(b, 0);
5475            let extractc: f16 = simd_extract!(c, 0);
5476            fmadd = fmaf16(extracta, extractb, extractc);
5477        }
5478        simd_insert!(a, 0, fmadd)
5479    }
5480}
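
// Usage sketch (illustrative only): the scalar `_sh` forms above only compute element 0;
// the remaining seven elements of the result come from `a` (or from `c` for `mask3`).
// The helper name is an arbitrary example.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn fmadd_lower(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
//         // result[0] = a[0] * b[0] + c[0]; result[1..8] = a[1..8]
//         _mm_fmadd_sh(a, b, c)
//     }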
5481
5482/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5483/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5484/// 7 packed elements from a to the upper elements of dst.
5485///
5486/// Rounding is done according to the rounding parameter, which can be one of:
5487///
5488/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5489/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5490/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5491/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5492/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5493///
5494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5495#[inline]
5496#[target_feature(enable = "avx512fp16")]
5497#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5498#[rustc_legacy_const_generics(3)]
5499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5500pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5501    unsafe {
5502        static_assert_rounding!(ROUNDING);
5503        let extracta: f16 = simd_extract!(a, 0);
5504        let extractb: f16 = simd_extract!(b, 0);
5505        let extractc: f16 = simd_extract!(c, 0);
5506        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5507        simd_insert!(a, 0, r)
5508    }
5509}
5510
5511/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5512/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5513/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5514/// upper elements of dst.
5515///
5516/// Rounding is done according to the rounding parameter, which can be one of:
5517///
5518/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5519/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5520/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5521/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5522/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5523///
5524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5525#[inline]
5526#[target_feature(enable = "avx512fp16")]
5527#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5528#[rustc_legacy_const_generics(4)]
5529#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5530pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5531    a: __m128h,
5532    k: __mmask8,
5533    b: __m128h,
5534    c: __m128h,
5535) -> __m128h {
5536    unsafe {
5537        static_assert_rounding!(ROUNDING);
5538        let mut fmadd: f16 = simd_extract!(a, 0);
5539        if k & 1 != 0 {
5540            let extractb: f16 = simd_extract!(b, 0);
5541            let extractc: f16 = simd_extract!(c, 0);
5542            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5543        }
5544        simd_insert!(a, 0, fmadd)
5545    }
5546}
5547
5548/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5549/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5550/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5551/// upper elements of dst.
5552///
5553/// Rounding is done according to the rounding parameter, which can be one of:
5554///
5555/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5556/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5557/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5558/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5559/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5560///
5561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5562#[inline]
5563#[target_feature(enable = "avx512fp16")]
5564#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5565#[rustc_legacy_const_generics(4)]
5566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5567pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5568    a: __m128h,
5569    b: __m128h,
5570    c: __m128h,
5571    k: __mmask8,
5572) -> __m128h {
5573    unsafe {
5574        static_assert_rounding!(ROUNDING);
5575        let mut fmadd: f16 = simd_extract!(c, 0);
5576        if k & 1 != 0 {
5577            let extracta: f16 = simd_extract!(a, 0);
5578            let extractb: f16 = simd_extract!(b, 0);
5579            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5580        }
5581        simd_insert!(c, 0, fmadd)
5582    }
5583}
5584
5585/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5586/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5587/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5588/// upper elements of dst.
5589///
5590/// Rounding is done according to the rounding parameter, which can be one of:
5591///
5592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5597///
5598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5599#[inline]
5600#[target_feature(enable = "avx512fp16")]
5601#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5602#[rustc_legacy_const_generics(4)]
5603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5604pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5605    k: __mmask8,
5606    a: __m128h,
5607    b: __m128h,
5608    c: __m128h,
5609) -> __m128h {
5610    unsafe {
5611        static_assert_rounding!(ROUNDING);
5612        let mut fmadd: f16 = 0.0;
5613        if k & 1 != 0 {
5614            let extracta: f16 = simd_extract!(a, 0);
5615            let extractb: f16 = simd_extract!(b, 0);
5616            let extractc: f16 = simd_extract!(c, 0);
5617            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5618        }
5619        simd_insert!(a, 0, fmadd)
5620    }
5621}
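
// Usage sketch (illustrative only): the masked scalar rounding forms combine the
// writemask behaviour with an explicit rounding mode. The helper name is an arbitrary
// example; the rounding constant must be one of the combinations listed above.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn fmadd_lower_rounded(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
//         // element 0 is a[0] * b[0] + c[0] (rounded to nearest) when k bit 0 is set, else a[0]
//         _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, k, b, c)
//     }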
5622
5623/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
5627/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
5628#[inline]
5629#[target_feature(enable = "avx512fp16,avx512vl")]
5630#[cfg_attr(test, assert_instr(vfmsub))]
5631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5632pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5633    unsafe { simd_fma(a, b, simd_neg(c)) }
5634}
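
// Usage sketch (illustrative only): `fmsub` negates the addend, so every lane computes
// `a * b - c` with a single rounding step, matching the `simd_fma(a, b, simd_neg(c))`
// body above. The helper name is an arbitrary example.
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     fn fmsub_example(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
//         // every lane: a * b - c
//         _mm_fmsub_ph(a, b, c)
//     }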
5635
5636/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5637/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5638/// from a when the corresponding mask bit is not set).
5639///
5640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5641#[inline]
5642#[target_feature(enable = "avx512fp16,avx512vl")]
5643#[cfg_attr(test, assert_instr(vfmsub))]
5644#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5645pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5646    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5647}
5648
5649/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5650/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5651/// from c when the corresponding mask bit is not set).
5652///
5653/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5654#[inline]
5655#[target_feature(enable = "avx512fp16,avx512vl")]
5656#[cfg_attr(test, assert_instr(vfmsub))]
5657#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5658pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5659    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5660}
5661
5662/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5663/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5664/// out when the corresponding mask bit is not set).
5665///
5666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5667#[inline]
5668#[target_feature(enable = "avx512fp16,avx512vl")]
5669#[cfg_attr(test, assert_instr(vfmsub))]
5670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5671pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5672    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5673}
5674
5675/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5676/// in c from the intermediate result, and store the results in dst.
5677///
5678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5679#[inline]
5680#[target_feature(enable = "avx512fp16,avx512vl")]
5681#[cfg_attr(test, assert_instr(vfmsub))]
5682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5683pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5684    unsafe { simd_fma(a, b, simd_neg(c)) }
5685}
5686
5687/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5688/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5689/// from a when the corresponding mask bit is not set).
5690///
5691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5692#[inline]
5693#[target_feature(enable = "avx512fp16,avx512vl")]
5694#[cfg_attr(test, assert_instr(vfmsub))]
5695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5696pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5697    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5698}
5699
5700/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5701/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5702/// from c when the corresponding mask bit is not set).
5703///
5704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5705#[inline]
5706#[target_feature(enable = "avx512fp16,avx512vl")]
5707#[cfg_attr(test, assert_instr(vfmsub))]
5708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5709pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5710    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5711}
5712
5713/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5714/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5715/// out when the corresponding mask bit is not set).
5716///
5717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
5718#[inline]
5719#[target_feature(enable = "avx512fp16,avx512vl")]
5720#[cfg_attr(test, assert_instr(vfmsub))]
5721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5722pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5723    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
5724}
5725
5726/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5727/// in c from the intermediate result, and store the results in dst.
5728///
5729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
5730#[inline]
5731#[target_feature(enable = "avx512fp16")]
5732#[cfg_attr(test, assert_instr(vfmsub))]
5733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5734pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5735    unsafe { simd_fma(a, b, simd_neg(c)) }
5736}
5737
5738/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5739/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5740/// from a when the corresponding mask bit is not set).
5741///
5742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
5743#[inline]
5744#[target_feature(enable = "avx512fp16")]
5745#[cfg_attr(test, assert_instr(vfmsub))]
5746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5747pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5748    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
5749}
5750
5751/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5752/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5753/// from c when the corresponding mask bit is not set).
5754///
5755/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
5756#[inline]
5757#[target_feature(enable = "avx512fp16")]
5758#[cfg_attr(test, assert_instr(vfmsub))]
5759#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5760pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5761    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
5762}
5763
5764/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5765/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5766/// out when the corresponding mask bit is not set).
5767///
5768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5769#[inline]
5770#[target_feature(enable = "avx512fp16")]
5771#[cfg_attr(test, assert_instr(vfmsub))]
5772#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5773pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5774    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5775}
5776
5777/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5778/// in c from the intermediate result, and store the results in dst.
5779///
5780/// Rounding is done according to the rounding parameter, which can be one of:
5781///
5782/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5783/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5784/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5785/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5786/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5787///
5788/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5789#[inline]
5790#[target_feature(enable = "avx512fp16")]
5791#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5792#[rustc_legacy_const_generics(3)]
5793#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5794pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5795    unsafe {
5796        static_assert_rounding!(ROUNDING);
5797        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5798    }
5799}
5800
5801/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5802/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5803/// from a when the corresponding mask bit is not set).
5804///
5805/// Rounding is done according to the rounding parameter, which can be one of:
5806///
5807/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5808/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5809/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5810/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5811/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5812///
5813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5814#[inline]
5815#[target_feature(enable = "avx512fp16")]
5816#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5817#[rustc_legacy_const_generics(4)]
5818#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5819pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5820    a: __m512h,
5821    k: __mmask32,
5822    b: __m512h,
5823    c: __m512h,
5824) -> __m512h {
5825    unsafe {
5826        static_assert_rounding!(ROUNDING);
5827        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5828    }
5829}
5830
5831/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5832/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5833/// from c when the corresponding mask bit is not set).
5834///
5835/// Rounding is done according to the rounding parameter, which can be one of:
5836///
5837/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5838/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5839/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5840/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5841/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5842///
5843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5844#[inline]
5845#[target_feature(enable = "avx512fp16")]
5846#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5847#[rustc_legacy_const_generics(4)]
5848#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5849pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5850    a: __m512h,
5851    b: __m512h,
5852    c: __m512h,
5853    k: __mmask32,
5854) -> __m512h {
5855    unsafe {
5856        static_assert_rounding!(ROUNDING);
5857        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
5858    }
5859}
5860
5861/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5862/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5863/// out when the corresponding mask bit is not set).
5864///
5865/// Rounding is done according to the rounding parameter, which can be one of:
5866///
5867/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5868/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5869/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5870/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5871/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5872///
5873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
5874#[inline]
5875#[target_feature(enable = "avx512fp16")]
5876#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5877#[rustc_legacy_const_generics(4)]
5878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5879pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
5880    k: __mmask32,
5881    a: __m512h,
5882    b: __m512h,
5883    c: __m512h,
5884) -> __m512h {
5885    unsafe {
5886        static_assert_rounding!(ROUNDING);
5887        simd_select_bitmask(
5888            k,
5889            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
5890            _mm512_setzero_ph(),
5891        )
5892    }
5893}
5894
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5897/// 7 packed elements from a to the upper elements of dst.
5898///
5899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
5900#[inline]
5901#[target_feature(enable = "avx512fp16")]
5902#[cfg_attr(test, assert_instr(vfmsub))]
5903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5904pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5905    unsafe {
5906        let extracta: f16 = simd_extract!(a, 0);
5907        let extractb: f16 = simd_extract!(b, 0);
5908        let extractc: f16 = simd_extract!(c, 0);
5909        let r = fmaf16(extracta, extractb, -extractc);
5910        simd_insert!(a, 0, r)
5911    }
5912}
5913
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5916/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5917/// upper elements of dst.
5918///
5919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
5920#[inline]
5921#[target_feature(enable = "avx512fp16")]
5922#[cfg_attr(test, assert_instr(vfmsub))]
5923#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5924pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5925    unsafe {
5926        let mut fmsub: f16 = simd_extract!(a, 0);
5927        if k & 1 != 0 {
5928            let extractb: f16 = simd_extract!(b, 0);
5929            let extractc: f16 = simd_extract!(c, 0);
5930            fmsub = fmaf16(fmsub, extractb, -extractc);
5931        }
5932        simd_insert!(a, 0, fmsub)
5933    }
5934}
5935
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5938/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5939/// upper elements of dst.
5940///
5941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
5942#[inline]
5943#[target_feature(enable = "avx512fp16")]
5944#[cfg_attr(test, assert_instr(vfmsub))]
5945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5946pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5947    unsafe {
5948        let mut fmsub: f16 = simd_extract!(c, 0);
5949        if k & 1 != 0 {
5950            let extracta: f16 = simd_extract!(a, 0);
5951            let extractb: f16 = simd_extract!(b, 0);
5952            fmsub = fmaf16(extracta, extractb, -fmsub);
5953        }
5954        simd_insert!(c, 0, fmsub)
5955    }
5956}
5957
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
5960/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5961/// upper elements of dst.
5962///
5963/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
5964#[inline]
5965#[target_feature(enable = "avx512fp16")]
5966#[cfg_attr(test, assert_instr(vfmsub))]
5967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5968pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5969    unsafe {
5970        let mut fmsub: f16 = 0.0;
5971        if k & 1 != 0 {
5972            let extracta: f16 = simd_extract!(a, 0);
5973            let extractb: f16 = simd_extract!(b, 0);
5974            let extractc: f16 = simd_extract!(c, 0);
5975            fmsub = fmaf16(extracta, extractb, -extractc);
5976        }
5977        simd_insert!(a, 0, fmsub)
5978    }
5979}
5980
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5983/// 7 packed elements from a to the upper elements of dst.
5984///
5985/// Rounding is done according to the rounding parameter, which can be one of:
5986///
5987/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5988/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5989/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5990/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5991/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5992///
5993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
5994#[inline]
5995#[target_feature(enable = "avx512fp16")]
5996#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5997#[rustc_legacy_const_generics(3)]
5998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5999pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6000    unsafe {
6001        static_assert_rounding!(ROUNDING);
6002        let extracta: f16 = simd_extract!(a, 0);
6003        let extractb: f16 = simd_extract!(b, 0);
6004        let extractc: f16 = simd_extract!(c, 0);
6005        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6006        simd_insert!(a, 0, r)
6007    }
6008}
6009
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6012/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6013/// upper elements of dst.
6014///
6015/// Rounding is done according to the rounding parameter, which can be one of:
6016///
6017/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6018/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6019/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6020/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6021/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6022///
6023/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6024#[inline]
6025#[target_feature(enable = "avx512fp16")]
6026#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6027#[rustc_legacy_const_generics(4)]
6028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6029pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6030    a: __m128h,
6031    k: __mmask8,
6032    b: __m128h,
6033    c: __m128h,
6034) -> __m128h {
6035    unsafe {
6036        static_assert_rounding!(ROUNDING);
6037        let mut fmsub: f16 = simd_extract!(a, 0);
6038        if k & 1 != 0 {
6039            let extractb: f16 = simd_extract!(b, 0);
6040            let extractc: f16 = simd_extract!(c, 0);
6041            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6042        }
6043        simd_insert!(a, 0, fmsub)
6044    }
6045}
6046
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6049/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6050/// upper elements of dst.
6051///
6052/// Rounding is done according to the rounding parameter, which can be one of:
6053///
6054/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6055/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6056/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6057/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6058/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6059///
6060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6061#[inline]
6062#[target_feature(enable = "avx512fp16")]
6063#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6064#[rustc_legacy_const_generics(4)]
6065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6066pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6067    a: __m128h,
6068    b: __m128h,
6069    c: __m128h,
6070    k: __mmask8,
6071) -> __m128h {
6072    unsafe {
6073        static_assert_rounding!(ROUNDING);
6074        let mut fmsub: f16 = simd_extract!(c, 0);
6075        if k & 1 != 0 {
6076            let extracta: f16 = simd_extract!(a, 0);
6077            let extractb: f16 = simd_extract!(b, 0);
6078            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6079        }
6080        simd_insert!(c, 0, fmsub)
6081    }
6082}
6083
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6090#[inline]
6091#[target_feature(enable = "avx512fp16")]
6092#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6093#[rustc_legacy_const_generics(4)]
6094#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6095pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6096    k: __mmask8,
6097    a: __m128h,
6098    b: __m128h,
6099    c: __m128h,
6100) -> __m128h {
6101    unsafe {
6102        static_assert_rounding!(ROUNDING);
6103        let mut fmsub: f16 = 0.0;
6104        if k & 1 != 0 {
6105            let extracta: f16 = simd_extract!(a, 0);
6106            let extractb: f16 = simd_extract!(b, 0);
6107            let extractc: f16 = simd_extract!(c, 0);
6108            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6109        }
6110        simd_insert!(a, 0, fmsub)
6111    }
6112}
6113
6114/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6115/// result from packed elements in c, and store the results in dst.
6116///
6117/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
6118#[inline]
6119#[target_feature(enable = "avx512fp16,avx512vl")]
6120#[cfg_attr(test, assert_instr(vfnmadd))]
6121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6122pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6123    unsafe { simd_fma(simd_neg(a), b, c) }
6124}
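
// Usage sketch (illustrative only): `fnmadd` negates the product, so every lane computes
// `-(a * b) + c` (that is, `c - a * b`) with a single rounding step, matching the
// `simd_fma(simd_neg(a), b, c)` body above. The helper name is an arbitrary example.
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     fn fnmadd_example(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
//         // every lane: c - a * b
//         _mm_fnmadd_ph(a, b, c)
//     }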
6125
6126/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6127/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6128/// from a when the corresponding mask bit is not set).
6129///
6130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6131#[inline]
6132#[target_feature(enable = "avx512fp16,avx512vl")]
6133#[cfg_attr(test, assert_instr(vfnmadd))]
6134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6135pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6136    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6137}
6138
6139/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6140/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6141/// from c when the corresponding mask bit is not set).
6142///
6143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6144#[inline]
6145#[target_feature(enable = "avx512fp16,avx512vl")]
6146#[cfg_attr(test, assert_instr(vfnmadd))]
6147#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6148pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6149    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6150}
6151
6152/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6153/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6154/// out when the corresponding mask bit is not set).
6155///
6156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6157#[inline]
6158#[target_feature(enable = "avx512fp16,avx512vl")]
6159#[cfg_attr(test, assert_instr(vfnmadd))]
6160#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6161pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6162    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6163}
6164
6165/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6166/// result from packed elements in c, and store the results in dst.
6167///
6168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6169#[inline]
6170#[target_feature(enable = "avx512fp16,avx512vl")]
6171#[cfg_attr(test, assert_instr(vfnmadd))]
6172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6173pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6174    unsafe { simd_fma(simd_neg(a), b, c) }
6175}
6176
6177/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6178/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6179/// from a when the corresponding mask bit is not set).
6180///
6181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6182#[inline]
6183#[target_feature(enable = "avx512fp16,avx512vl")]
6184#[cfg_attr(test, assert_instr(vfnmadd))]
6185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6186pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6187    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6188}
6189
6190/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6191/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6192/// from c when the corresponding mask bit is not set).
6193///
6194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6195#[inline]
6196#[target_feature(enable = "avx512fp16,avx512vl")]
6197#[cfg_attr(test, assert_instr(vfnmadd))]
6198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6199pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6200    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6201}
6202
6203/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6204/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6205/// out when the corresponding mask bit is not set).
6206///
6207/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6208#[inline]
6209#[target_feature(enable = "avx512fp16,avx512vl")]
6210#[cfg_attr(test, assert_instr(vfnmadd))]
6211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6212pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6213    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6214}
6215
6216/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6217/// result from packed elements in c, and store the results in dst.
6218///
6219/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6220#[inline]
6221#[target_feature(enable = "avx512fp16")]
6222#[cfg_attr(test, assert_instr(vfnmadd))]
6223#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6224pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6225    unsafe { simd_fma(simd_neg(a), b, c) }
6226}
6227
6228/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6229/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6230/// from a when the corresponding mask bit is not set).
6231///
6232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6233#[inline]
6234#[target_feature(enable = "avx512fp16")]
6235#[cfg_attr(test, assert_instr(vfnmadd))]
6236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6237pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6238    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6239}
6240
6241/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6242/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6243/// from c when the corresponding mask bit is not set).
6244///
6245/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6246#[inline]
6247#[target_feature(enable = "avx512fp16")]
6248#[cfg_attr(test, assert_instr(vfnmadd))]
6249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6250pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6251    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6252}
6253
6254/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6255/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6256/// out when the corresponding mask bit is not set).
6257///
6258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6259#[inline]
6260#[target_feature(enable = "avx512fp16")]
6261#[cfg_attr(test, assert_instr(vfnmadd))]
6262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6263pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6264    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6265}
6266
6267/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6268/// result from packed elements in c, and store the results in dst.
6269///
6270/// Rounding is done according to the rounding parameter, which can be one of:
6271///
6272/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6273/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6274/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6275/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6276/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6277///
6278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6279#[inline]
6280#[target_feature(enable = "avx512fp16")]
6281#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6282#[rustc_legacy_const_generics(3)]
6283#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6284pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6285    unsafe {
6286        static_assert_rounding!(ROUNDING);
6287        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6288    }
6289}
6290
6291/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6292/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6293/// from a when the corresponding mask bit is not set).
6294///
6295/// Rounding is done according to the rounding parameter, which can be one of:
6296///
6297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6302///
6303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6304#[inline]
6305#[target_feature(enable = "avx512fp16")]
6306#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6307#[rustc_legacy_const_generics(4)]
6308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6309pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6310    a: __m512h,
6311    k: __mmask32,
6312    b: __m512h,
6313    c: __m512h,
6314) -> __m512h {
6315    unsafe {
6316        static_assert_rounding!(ROUNDING);
6317        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6318    }
6319}
6320
6321/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6322/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6323/// from c when the corresponding mask bit is not set).
6324///
6325/// Rounding is done according to the rounding parameter, which can be one of:
6326///
6327/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6328/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6329/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6330/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6331/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6332///
6333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6334#[inline]
6335#[target_feature(enable = "avx512fp16")]
6336#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6337#[rustc_legacy_const_generics(4)]
6338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6339pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6340    a: __m512h,
6341    b: __m512h,
6342    c: __m512h,
6343    k: __mmask32,
6344) -> __m512h {
6345    unsafe {
6346        static_assert_rounding!(ROUNDING);
6347        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6348    }
6349}
6350
6351/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6352/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6353/// out when the corresponding mask bit is not set).
6354///
6355/// Rounding is done according to the rounding parameter, which can be one of:
6356///
6357/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6358/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6359/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6360/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6361/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6362///
6363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6364#[inline]
6365#[target_feature(enable = "avx512fp16")]
6366#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6367#[rustc_legacy_const_generics(4)]
6368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6369pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6370    k: __mmask32,
6371    a: __m512h,
6372    b: __m512h,
6373    c: __m512h,
6374) -> __m512h {
6375    unsafe {
6376        static_assert_rounding!(ROUNDING);
6377        simd_select_bitmask(
6378            k,
6379            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6380            _mm512_setzero_ph(),
6381        )
6382    }
6383}
6384
6385/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6386/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6387/// elements from a to the upper elements of dst.
6388///
6389/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
6390#[inline]
6391#[target_feature(enable = "avx512fp16")]
6392#[cfg_attr(test, assert_instr(vfnmadd))]
6393#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6394pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6395    unsafe {
6396        let extracta: f16 = simd_extract!(a, 0);
6397        let extractb: f16 = simd_extract!(b, 0);
6398        let extractc: f16 = simd_extract!(c, 0);
6399        let r = fmaf16(-extracta, extractb, extractc);
6400        simd_insert!(a, 0, r)
6401    }
6402}
6403
6404/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6405/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6406/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6407/// elements of dst.
6408///
6409/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6410#[inline]
6411#[target_feature(enable = "avx512fp16")]
6412#[cfg_attr(test, assert_instr(vfnmadd))]
6413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6414pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6415    unsafe {
6416        let mut fnmadd: f16 = simd_extract!(a, 0);
6417        if k & 1 != 0 {
6418            let extractb: f16 = simd_extract!(b, 0);
6419            let extractc: f16 = simd_extract!(c, 0);
6420            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6421        }
6422        simd_insert!(a, 0, fnmadd)
6423    }
6424}
6425
6426/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6427/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6428/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6429/// elements of dst.
6430///
6431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6432#[inline]
6433#[target_feature(enable = "avx512fp16")]
6434#[cfg_attr(test, assert_instr(vfnmadd))]
6435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6436pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6437    unsafe {
6438        let mut fnmadd: f16 = simd_extract!(c, 0);
6439        if k & 1 != 0 {
6440            let extracta: f16 = simd_extract!(a, 0);
6441            let extractb: f16 = simd_extract!(b, 0);
6442            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6443        }
6444        simd_insert!(c, 0, fnmadd)
6445    }
6446}
6447
6448/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6449/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6450/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6451/// elements of dst.
6452///
6453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6454#[inline]
6455#[target_feature(enable = "avx512fp16")]
6456#[cfg_attr(test, assert_instr(vfnmadd))]
6457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6458pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6459    unsafe {
6460        let mut fnmadd: f16 = 0.0;
6461        if k & 1 != 0 {
6462            let extracta: f16 = simd_extract!(a, 0);
6463            let extractb: f16 = simd_extract!(b, 0);
6464            let extractc: f16 = simd_extract!(c, 0);
6465            fnmadd = fmaf16(-extracta, extractb, extractc);
6466        }
6467        simd_insert!(a, 0, fnmadd)
6468    }
6469}
6470
6471/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6472/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6473/// elements from a to the upper elements of dst.
6474///
6475/// Rounding is done according to the rounding parameter, which can be one of:
6476///
6477/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6478/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6479/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6480/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6481/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6482///
6483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
6484#[inline]
6485#[target_feature(enable = "avx512fp16")]
6486#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6487#[rustc_legacy_const_generics(3)]
6488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6489pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6490    unsafe {
6491        static_assert_rounding!(ROUNDING);
6492        let extracta: f16 = simd_extract!(a, 0);
6493        let extractb: f16 = simd_extract!(b, 0);
6494        let extractc: f16 = simd_extract!(c, 0);
6495        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6496        simd_insert!(a, 0, r)
6497    }
6498}
6499
6500/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6501/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6502/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6503/// elements of dst.
6504///
6505/// Rounding is done according to the rounding parameter, which can be one of:
6506///
6507/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6508/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6509/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6510/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6511/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6512///
6513/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6514#[inline]
6515#[target_feature(enable = "avx512fp16")]
6516#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6517#[rustc_legacy_const_generics(4)]
6518#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6519pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6520    a: __m128h,
6521    k: __mmask8,
6522    b: __m128h,
6523    c: __m128h,
6524) -> __m128h {
6525    unsafe {
6526        static_assert_rounding!(ROUNDING);
6527        let mut fnmadd: f16 = simd_extract!(a, 0);
6528        if k & 1 != 0 {
6529            let extractb: f16 = simd_extract!(b, 0);
6530            let extractc: f16 = simd_extract!(c, 0);
6531            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6532        }
6533        simd_insert!(a, 0, fnmadd)
6534    }
6535}
6536
6537/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6538/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6539/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6540/// elements of dst.
6541///
6542/// Rounding is done according to the rounding parameter, which can be one of:
6543///
6544/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6545/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6546/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6547/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6548/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6549///
6550/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6551#[inline]
6552#[target_feature(enable = "avx512fp16")]
6553#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6554#[rustc_legacy_const_generics(4)]
6555#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6556pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6557    a: __m128h,
6558    b: __m128h,
6559    c: __m128h,
6560    k: __mmask8,
6561) -> __m128h {
6562    unsafe {
6563        static_assert_rounding!(ROUNDING);
6564        let mut fnmadd: f16 = simd_extract!(c, 0);
6565        if k & 1 != 0 {
6566            let extracta: f16 = simd_extract!(a, 0);
6567            let extractb: f16 = simd_extract!(b, 0);
6568            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6569        }
6570        simd_insert!(c, 0, fnmadd)
6571    }
6572}
6573
6574/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6575/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6576/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6577/// elements of dst.
6578///
6579/// Rounding is done according to the rounding parameter, which can be one of:
6580///
6581/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6582/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6583/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6584/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6585/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6586///
6587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6588#[inline]
6589#[target_feature(enable = "avx512fp16")]
6590#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6591#[rustc_legacy_const_generics(4)]
6592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6593pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6594    k: __mmask8,
6595    a: __m128h,
6596    b: __m128h,
6597    c: __m128h,
6598) -> __m128h {
6599    unsafe {
6600        static_assert_rounding!(ROUNDING);
6601        let mut fnmadd: f16 = 0.0;
6602        if k & 1 != 0 {
6603            let extracta: f16 = simd_extract!(a, 0);
6604            let extractb: f16 = simd_extract!(b, 0);
6605            let extractc: f16 = simd_extract!(c, 0);
6606            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6607        }
6608        simd_insert!(a, 0, fnmadd)
6609    }
6610}
6611
6612/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6613/// in c from the negated intermediate result, and store the results in dst.
6614///
6615/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
6616#[inline]
6617#[target_feature(enable = "avx512fp16,avx512vl")]
6618#[cfg_attr(test, assert_instr(vfnmsub))]
6619#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6620pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6621    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6622}
6623
6624/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6625/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6626/// copied from a when the corresponding mask bit is not set).
6627///
6628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6629#[inline]
6630#[target_feature(enable = "avx512fp16,avx512vl")]
6631#[cfg_attr(test, assert_instr(vfnmsub))]
6632#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6633pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6634    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6635}
6636
6637/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6638/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6639/// copied from c when the corresponding mask bit is not set).
6640///
6641/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6642#[inline]
6643#[target_feature(enable = "avx512fp16,avx512vl")]
6644#[cfg_attr(test, assert_instr(vfnmsub))]
6645#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6646pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6647    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6648}
6649
6650/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6651/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6652/// zeroed out when the corresponding mask bit is not set).
6653///
6654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6655#[inline]
6656#[target_feature(enable = "avx512fp16,avx512vl")]
6657#[cfg_attr(test, assert_instr(vfnmsub))]
6658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6659pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6660    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6661}
6662
6663/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6664/// in c from the negated intermediate result, and store the results in dst.
6665///
6666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6667#[inline]
6668#[target_feature(enable = "avx512fp16,avx512vl")]
6669#[cfg_attr(test, assert_instr(vfnmsub))]
6670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6671pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6672    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6673}
6674
6675/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6676/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6677/// copied from a when the corresponding mask bit is not set).
6678///
6679/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6680#[inline]
6681#[target_feature(enable = "avx512fp16,avx512vl")]
6682#[cfg_attr(test, assert_instr(vfnmsub))]
6683#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6684pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6685    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6686}
6687
6688/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6689/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6690/// copied from c when the corresponding mask bit is not set).
6691///
6692/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6693#[inline]
6694#[target_feature(enable = "avx512fp16,avx512vl")]
6695#[cfg_attr(test, assert_instr(vfnmsub))]
6696#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6697pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6698    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6699}
6700
6701/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6702/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6703/// zeroed out when the corresponding mask bit is not set).
6704///
6705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6706#[inline]
6707#[target_feature(enable = "avx512fp16,avx512vl")]
6708#[cfg_attr(test, assert_instr(vfnmsub))]
6709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6710pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6711    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6712}
6713
6714/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6715/// in c from the negated intermediate result, and store the results in dst.
6716///
6717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6718#[inline]
6719#[target_feature(enable = "avx512fp16")]
6720#[cfg_attr(test, assert_instr(vfnmsub))]
6721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6722pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6723    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6724}
6725
6726/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6727/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6728/// copied from a when the corresponding mask bit is not set).
6729///
6730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6731#[inline]
6732#[target_feature(enable = "avx512fp16")]
6733#[cfg_attr(test, assert_instr(vfnmsub))]
6734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6735pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6736    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6737}
6738
6739/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6740/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6741/// copied from c when the corresponding mask bit is not set).
6742///
6743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6744#[inline]
6745#[target_feature(enable = "avx512fp16")]
6746#[cfg_attr(test, assert_instr(vfnmsub))]
6747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6748pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6749    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6750}
6751
6752/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6753/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6754/// zeroed out when the corresponding mask bit is not set).
6755///
6756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6757#[inline]
6758#[target_feature(enable = "avx512fp16")]
6759#[cfg_attr(test, assert_instr(vfnmsub))]
6760#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6761pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6762    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6763}
6764
6765/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6766/// in c from the negated intermediate result, and store the results in dst.
6767///
6768/// Rounding is done according to the rounding parameter, which can be one of:
6769///
6770/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6771/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6772/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6773/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6774/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6775///
6776/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
6777#[inline]
6778#[target_feature(enable = "avx512fp16")]
6779#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6780#[rustc_legacy_const_generics(3)]
6781#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6782pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6783    unsafe {
6784        static_assert_rounding!(ROUNDING);
6785        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
6786    }
6787}
6788
6789/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6790/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6791/// copied from a when the corresponding mask bit is not set).
6792///
6793/// Rounding is done according to the rounding parameter, which can be one of:
6794///
6795/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6796/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6797/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6798/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6799/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6800///
6801/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
6802#[inline]
6803#[target_feature(enable = "avx512fp16")]
6804#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6805#[rustc_legacy_const_generics(4)]
6806#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6807pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
6808    a: __m512h,
6809    k: __mmask32,
6810    b: __m512h,
6811    c: __m512h,
6812) -> __m512h {
6813    unsafe {
6814        static_assert_rounding!(ROUNDING);
6815        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
6816    }
6817}
6818
6819/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6820/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6821/// copied from c when the corresponding mask bit is not set).
6822///
6823/// Rounding is done according to the rounding parameter, which can be one of:
6824///
6825/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6826/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6827/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6828/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6830///
6831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
6832#[inline]
6833#[target_feature(enable = "avx512fp16")]
6834#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6835#[rustc_legacy_const_generics(4)]
6836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6837pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
6838    a: __m512h,
6839    b: __m512h,
6840    c: __m512h,
6841    k: __mmask32,
6842) -> __m512h {
6843    unsafe {
6844        static_assert_rounding!(ROUNDING);
6845        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
6846    }
6847}
6848
6849/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6850/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6851/// zeroed out when the corresponding mask bit is not set).
6852///
6853/// Rounding is done according to the rounding parameter, which can be one of:
6854///
6855/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6856/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6857/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6858/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6859/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6860///
6861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
6862#[inline]
6863#[target_feature(enable = "avx512fp16")]
6864#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6865#[rustc_legacy_const_generics(4)]
6866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6867pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
6868    k: __mmask32,
6869    a: __m512h,
6870    b: __m512h,
6871    c: __m512h,
6872) -> __m512h {
6873    unsafe {
6874        static_assert_rounding!(ROUNDING);
6875        simd_select_bitmask(
6876            k,
6877            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
6878            _mm512_setzero_ph(),
6879        )
6880    }
6881}
6882
6883/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6884/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6885/// elements from a to the upper elements of dst.
6886///
6887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
6888#[inline]
6889#[target_feature(enable = "avx512fp16")]
6890#[cfg_attr(test, assert_instr(vfnmsub))]
6891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6892pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6893    unsafe {
6894        let extracta: f16 = simd_extract!(a, 0);
6895        let extractb: f16 = simd_extract!(b, 0);
6896        let extractc: f16 = simd_extract!(c, 0);
6897        let r = fmaf16(-extracta, extractb, -extractc);
6898        simd_insert!(a, 0, r)
6899    }
6900}
6901
6902/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6903/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6904/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6905/// elements of dst.
6906///
6907/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
6908#[inline]
6909#[target_feature(enable = "avx512fp16")]
6910#[cfg_attr(test, assert_instr(vfnmsub))]
6911#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6912pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6913    unsafe {
6914        let mut fnmsub: f16 = simd_extract!(a, 0);
6915        if k & 1 != 0 {
6916            let extractb: f16 = simd_extract!(b, 0);
6917            let extractc: f16 = simd_extract!(c, 0);
6918            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
6919        }
6920        simd_insert!(a, 0, fnmsub)
6921    }
6922}
6923
6924/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6925/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6926/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6927/// elements of dst.
6928///
6929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
6930#[inline]
6931#[target_feature(enable = "avx512fp16")]
6932#[cfg_attr(test, assert_instr(vfnmsub))]
6933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6934pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6935    unsafe {
6936        let mut fnmsub: f16 = simd_extract!(c, 0);
6937        if k & 1 != 0 {
6938            let extracta: f16 = simd_extract!(a, 0);
6939            let extractb: f16 = simd_extract!(b, 0);
6940            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
6941        }
6942        simd_insert!(c, 0, fnmsub)
6943    }
6944}
6945
6946/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6947/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6948/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6949/// elements of dst.
6950///
6951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
6952#[inline]
6953#[target_feature(enable = "avx512fp16")]
6954#[cfg_attr(test, assert_instr(vfnmsub))]
6955#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6956pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6957    unsafe {
6958        let mut fnmsub: f16 = 0.0;
6959        if k & 1 != 0 {
6960            let extracta: f16 = simd_extract!(a, 0);
6961            let extractb: f16 = simd_extract!(b, 0);
6962            let extractc: f16 = simd_extract!(c, 0);
6963            fnmsub = fmaf16(-extracta, extractb, -extractc);
6964        }
6965        simd_insert!(a, 0, fnmsub)
6966    }
6967}
6968
6969/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6970/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6971/// elements from a to the upper elements of dst.
6972///
6973/// Rounding is done according to the rounding parameter, which can be one of:
6974///
6975/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6976/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6977/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6978/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6979/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6980///
6981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
6982#[inline]
6983#[target_feature(enable = "avx512fp16")]
6984#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6985#[rustc_legacy_const_generics(3)]
6986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6987pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6988    unsafe {
6989        static_assert_rounding!(ROUNDING);
6990        let extracta: f16 = simd_extract!(a, 0);
6991        let extractb: f16 = simd_extract!(b, 0);
6992        let extractc: f16 = simd_extract!(c, 0);
6993        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
6994        simd_insert!(a, 0, r)
6995    }
6996}
6997
6998/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6999/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7000/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7001/// elements of dst.
7002///
7003/// Rounding is done according to the rounding parameter, which can be one of:
7004///
7005/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7006/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7007/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7008/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7009/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7010///
7011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
7012#[inline]
7013#[target_feature(enable = "avx512fp16")]
7014#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7015#[rustc_legacy_const_generics(4)]
7016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7017pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7018    a: __m128h,
7019    k: __mmask8,
7020    b: __m128h,
7021    c: __m128h,
7022) -> __m128h {
7023    unsafe {
7024        static_assert_rounding!(ROUNDING);
7025        let mut fnmsub: f16 = simd_extract!(a, 0);
7026        if k & 1 != 0 {
7027            let extractb: f16 = simd_extract!(b, 0);
7028            let extractc: f16 = simd_extract!(c, 0);
7029            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7030        }
7031        simd_insert!(a, 0, fnmsub)
7032    }
7033}
7034
7035/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7036/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7037/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
7038/// elements of dst.
7039///
7040/// Rounding is done according to the rounding parameter, which can be one of:
7041///
7042/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7043/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7044/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7045/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7046/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7047///
7048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7049#[inline]
7050#[target_feature(enable = "avx512fp16")]
7051#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7052#[rustc_legacy_const_generics(4)]
7053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7054pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7055    a: __m128h,
7056    b: __m128h,
7057    c: __m128h,
7058    k: __mmask8,
7059) -> __m128h {
7060    unsafe {
7061        static_assert_rounding!(ROUNDING);
7062        let mut fnmsub: f16 = simd_extract!(c, 0);
7063        if k & 1 != 0 {
7064            let extracta: f16 = simd_extract!(a, 0);
7065            let extractb: f16 = simd_extract!(b, 0);
7066            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7067        }
7068        simd_insert!(c, 0, fnmsub)
7069    }
7070}
7071
7072/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7073/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
7074/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7075/// elements of dst.
7076///
7077/// Rounding is done according to the rounding parameter, which can be one of:
7078///
7079/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7080/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7081/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7082/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7083/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7084///
7085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7086#[inline]
7087#[target_feature(enable = "avx512fp16")]
7088#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7089#[rustc_legacy_const_generics(4)]
7090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7091pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7092    k: __mmask8,
7093    a: __m128h,
7094    b: __m128h,
7095    c: __m128h,
7096) -> __m128h {
7097    unsafe {
7098        static_assert_rounding!(ROUNDING);
7099        let mut fnmsub: f16 = 0.0;
7100        if k & 1 != 0 {
7101            let extracta: f16 = simd_extract!(a, 0);
7102            let extractb: f16 = simd_extract!(b, 0);
7103            let extractc: f16 = simd_extract!(c, 0);
7104            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7105        }
7106        simd_insert!(a, 0, fnmsub)
7107    }
7108}
7109
7110/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7111/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7112///
7113/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
7114#[inline]
7115#[target_feature(enable = "avx512fp16,avx512vl")]
7116#[cfg_attr(test, assert_instr(vfmaddsub))]
7117#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7118pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7119    unsafe { vfmaddsubph_128(a, b, c) }
7120}
7121
7122/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7123/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7124/// (the element is copied from a when the corresponding mask bit is not set).
7125///
7126/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7127#[inline]
7128#[target_feature(enable = "avx512fp16,avx512vl")]
7129#[cfg_attr(test, assert_instr(vfmaddsub))]
7130#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7131pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7132    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7133}
7134
7135/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7136/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7137/// (the element is copied from c when the corresponding mask bit is not set).
7138///
7139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7140#[inline]
7141#[target_feature(enable = "avx512fp16,avx512vl")]
7142#[cfg_attr(test, assert_instr(vfmaddsub))]
7143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7144pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7145    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7146}
7147
7148/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7149/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7150/// (the element is zeroed out when the corresponding mask bit is not set).
7151///
7152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7153#[inline]
7154#[target_feature(enable = "avx512fp16,avx512vl")]
7155#[cfg_attr(test, assert_instr(vfmaddsub))]
7156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7157pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7158    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7159}
7160
7161/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7162/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7163///
7164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7165#[inline]
7166#[target_feature(enable = "avx512fp16,avx512vl")]
7167#[cfg_attr(test, assert_instr(vfmaddsub))]
7168#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7169pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7170    unsafe { vfmaddsubph_256(a, b, c) }
7171}
7172
7173/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7174/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7175/// (the element is copied from a when the corresponding mask bit is not set).
7176///
7177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7178#[inline]
7179#[target_feature(enable = "avx512fp16,avx512vl")]
7180#[cfg_attr(test, assert_instr(vfmaddsub))]
7181#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7182pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7183    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7184}
7185
7186/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7187/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7188/// (the element is copied from c when the corresponding mask bit is not set).
7189///
7190/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7191#[inline]
7192#[target_feature(enable = "avx512fp16,avx512vl")]
7193#[cfg_attr(test, assert_instr(vfmaddsub))]
7194#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7195pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7196    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7197}
7198
7199/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7200/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7201/// (the element is zeroed out when the corresponding mask bit is not set).
7202///
7203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7204#[inline]
7205#[target_feature(enable = "avx512fp16,avx512vl")]
7206#[cfg_attr(test, assert_instr(vfmaddsub))]
7207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7208pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7209    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7210}
7211
7212/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7213/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7214///
7215/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7216#[inline]
7217#[target_feature(enable = "avx512fp16")]
7218#[cfg_attr(test, assert_instr(vfmaddsub))]
7219#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7220pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7221    _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7222}
7223
7224/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7225/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7226/// (the element is copied from a when the corresponding mask bit is not set).
7227///
7228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7229#[inline]
7230#[target_feature(enable = "avx512fp16")]
7231#[cfg_attr(test, assert_instr(vfmaddsub))]
7232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7233pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7234    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7235}
7236
7237/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7238/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7239/// (the element is copied from c when the corresponding mask bit is not set).
7240///
7241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7242#[inline]
7243#[target_feature(enable = "avx512fp16")]
7244#[cfg_attr(test, assert_instr(vfmaddsub))]
7245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7246pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7247    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7248}
7249
7250/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7251/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7252/// (the element is zeroed out when the corresponding mask bit is not set).
7253///
7254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7255#[inline]
7256#[target_feature(enable = "avx512fp16")]
7257#[cfg_attr(test, assert_instr(vfmaddsub))]
7258#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7259pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7260    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7261}
7262
7263/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7264/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7265///
7266/// Rounding is done according to the rounding parameter, which can be one of:
7267///
7268/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7269/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7270/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7271/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7272/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7273///
7274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
7275#[inline]
7276#[target_feature(enable = "avx512fp16")]
7277#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7278#[rustc_legacy_const_generics(3)]
7279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7280pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
7281    a: __m512h,
7282    b: __m512h,
7283    c: __m512h,
7284) -> __m512h {
7285    unsafe {
7286        static_assert_rounding!(ROUNDING);
7287        vfmaddsubph_512(a, b, c, ROUNDING)
7288    }
7289}
7290
7291/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7292/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7293/// (the element is copied from a when the corresponding mask bit is not set).
7294///
7295/// Rounding is done according to the rounding parameter, which can be one of:
7296///
7297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7302///
7303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7304#[inline]
7305#[target_feature(enable = "avx512fp16")]
7306#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7307#[rustc_legacy_const_generics(4)]
7308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7309pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7310    a: __m512h,
7311    k: __mmask32,
7312    b: __m512h,
7313    c: __m512h,
7314) -> __m512h {
7315    unsafe {
7316        static_assert_rounding!(ROUNDING);
7317        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7318    }
7319}
7320
7321/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7322/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7323/// (the element is copied from c when the corresponding mask bit is not set).
7324///
7325/// Rounding is done according to the rounding parameter, which can be one of:
7326///
7327/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7328/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7329/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7330/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7331/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7332///
7333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7334#[inline]
7335#[target_feature(enable = "avx512fp16")]
7336#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7337#[rustc_legacy_const_generics(4)]
7338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7339pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7340    a: __m512h,
7341    b: __m512h,
7342    c: __m512h,
7343    k: __mmask32,
7344) -> __m512h {
7345    unsafe {
7346        static_assert_rounding!(ROUNDING);
7347        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7348    }
7349}
7350
7351/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7352/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7353/// (the element is zeroed out when the corresponding mask bit is not set).
7354///
7355/// Rounding is done according to the rounding parameter, which can be one of:
7356///
7357/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7358/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7359/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7360/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7361/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7362///
7363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7364#[inline]
7365#[target_feature(enable = "avx512fp16")]
7366#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7367#[rustc_legacy_const_generics(4)]
7368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7369pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7370    k: __mmask32,
7371    a: __m512h,
7372    b: __m512h,
7373    c: __m512h,
7374) -> __m512h {
7375    unsafe {
7376        static_assert_rounding!(ROUNDING);
7377        simd_select_bitmask(
7378            k,
7379            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7380            _mm512_setzero_ph(),
7381        )
7382    }
7383}
7384
7385/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7386/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7387///
7388/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
7389#[inline]
7390#[target_feature(enable = "avx512fp16,avx512vl")]
7391#[cfg_attr(test, assert_instr(vfmsubadd))]
7392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7393pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7394    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
7395}
7396
7397/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7398/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7399/// (the element is copied from a when the corresponding mask bit is not set).
7400///
7401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7402#[inline]
7403#[target_feature(enable = "avx512fp16,avx512vl")]
7404#[cfg_attr(test, assert_instr(vfmsubadd))]
7405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7406pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7407    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7408}
7409
7410/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7411/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7412/// (the element is copied from c when the corresponding mask bit is not set).
7413///
7414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7415#[inline]
7416#[target_feature(enable = "avx512fp16,avx512vl")]
7417#[cfg_attr(test, assert_instr(vfmsubadd))]
7418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7419pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7420    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7421}
7422
7423/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7424/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7425/// (the element is zeroed out when the corresponding mask bit is not set).
7426///
7427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7428#[inline]
7429#[target_feature(enable = "avx512fp16,avx512vl")]
7430#[cfg_attr(test, assert_instr(vfmsubadd))]
7431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7432pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7433    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7434}
7435
7436/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7437/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7438///
7439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7440#[inline]
7441#[target_feature(enable = "avx512fp16,avx512vl")]
7442#[cfg_attr(test, assert_instr(vfmsubadd))]
7443#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7444pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7445    unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
7446}
7447
7448/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7449/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7450/// (the element is copied from a when the corresponding mask bit is not set).
7451///
7452/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7453#[inline]
7454#[target_feature(enable = "avx512fp16,avx512vl")]
7455#[cfg_attr(test, assert_instr(vfmsubadd))]
7456#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7457pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7458    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7459}
7460
7461/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7462/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7463/// (the element is copied from c when the corresponding mask bit is not set).
7464///
7465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7466#[inline]
7467#[target_feature(enable = "avx512fp16,avx512vl")]
7468#[cfg_attr(test, assert_instr(vfmsubadd))]
7469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7470pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7471    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7472}
7473
7474/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7475/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7476/// (the element is zeroed out when the corresponding mask bit is not set).
7477///
7478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7479#[inline]
7480#[target_feature(enable = "avx512fp16,avx512vl")]
7481#[cfg_attr(test, assert_instr(vfmsubadd))]
7482#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7483pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7484    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7485}
7486
7487/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7488/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7489///
7490/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7491#[inline]
7492#[target_feature(enable = "avx512fp16")]
7493#[cfg_attr(test, assert_instr(vfmsubadd))]
7494#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7495pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7496    _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7497}
7498
7499/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7500/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7501/// (the element is copied from a when the corresponding mask bit is not set).
7502///
7503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7504#[inline]
7505#[target_feature(enable = "avx512fp16")]
7506#[cfg_attr(test, assert_instr(vfmsubadd))]
7507#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7508pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7509    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7510}
7511
7512/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7513/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7514/// (the element is copied from c when the corresponding mask bit is not set).
7515///
7516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7517#[inline]
7518#[target_feature(enable = "avx512fp16")]
7519#[cfg_attr(test, assert_instr(vfmsubadd))]
7520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7521pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7522    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7523}
7524
7525/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7526/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7527/// (the element is zeroed out when the corresponding mask bit is not set).
7528///
7529/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7530#[inline]
7531#[target_feature(enable = "avx512fp16")]
7532#[cfg_attr(test, assert_instr(vfmsubadd))]
7533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7534pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7535    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7536}
7537
7538/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7539/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7540///
7541/// Rounding is done according to the rounding parameter, which can be one of:
7542///
7543/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7544/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7545/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7546/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7547/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7548///
7549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
7550#[inline]
7551#[target_feature(enable = "avx512fp16")]
7552#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7553#[rustc_legacy_const_generics(3)]
7554#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7555pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7556    a: __m512h,
7557    b: __m512h,
7558    c: __m512h,
7559) -> __m512h {
7560    unsafe {
7561        static_assert_rounding!(ROUNDING);
7562        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7563    }
7564}
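
// Illustrative sketch, not part of this module's API: the rounding mode is a const
// generic, so a static rounding override is written as a constant expression combining
// one of the _MM_FROUND_TO_* values with _MM_FROUND_NO_EXC. The hypothetical helper
// below rounds toward zero.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fmsubadd_round_toward_zero(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}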
7565
7566/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7567/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7568/// (the element is copied from a when the corresponding mask bit is not set).
7569///
7570/// Rounding is done according to the rounding parameter, which can be one of:
7571///
7572/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7573/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7574/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7575/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7576/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7577///
7578/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7579#[inline]
7580#[target_feature(enable = "avx512fp16")]
7581#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7582#[rustc_legacy_const_generics(4)]
7583#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7584pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7585    a: __m512h,
7586    k: __mmask32,
7587    b: __m512h,
7588    c: __m512h,
7589) -> __m512h {
7590    unsafe {
7591        static_assert_rounding!(ROUNDING);
7592        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7593    }
7594}
7595
7596/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7597/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7598/// (the element is copied from c when the corresponding mask bit is not set).
7599///
7600/// Rounding is done according to the rounding parameter, which can be one of:
7601///
7602/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7603/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7604/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7605/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7606/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7607///
7608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7609#[inline]
7610#[target_feature(enable = "avx512fp16")]
7611#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7612#[rustc_legacy_const_generics(4)]
7613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7614pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7615    a: __m512h,
7616    b: __m512h,
7617    c: __m512h,
7618    k: __mmask32,
7619) -> __m512h {
7620    unsafe {
7621        static_assert_rounding!(ROUNDING);
7622        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7623    }
7624}
7625
7626/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7627/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7628/// (the element is zeroed out when the corresponding mask bit is not set).
7629///
7630/// Rounding is done according to the rounding parameter, which can be one of:
7631///
7632/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7633/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7634/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7635/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7636/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7637///
7638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7639#[inline]
7640#[target_feature(enable = "avx512fp16")]
7641#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7642#[rustc_legacy_const_generics(4)]
7643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7644pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7645    k: __mmask32,
7646    a: __m512h,
7647    b: __m512h,
7648    c: __m512h,
7649) -> __m512h {
7650    unsafe {
7651        static_assert_rounding!(ROUNDING);
7652        simd_select_bitmask(
7653            k,
7654            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7655            _mm512_setzero_ph(),
7656        )
7657    }
7658}
7659
7660/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7661/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7662///
7663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
7664#[inline]
7665#[target_feature(enable = "avx512fp16,avx512vl")]
7666#[cfg_attr(test, assert_instr(vrcpph))]
7667#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7668pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7669    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7670}
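
// Illustrative sketch, not part of this module's API: because the approximation is only
// accurate to about 1.5*2^-12, it is typically used as a fast seed. This hypothetical
// helper sharpens it with one Newton-Raphson step, x' = x * (2 - a * x), using the
// multiply and subtract intrinsics defined earlier in this module.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn rcp_with_one_newton_step(a: __m128h) -> __m128h {
    let x = _mm_rcp_ph(a);
    _mm_mul_ph(x, _mm_sub_ph(_mm_set1_ph(2.0), _mm_mul_ph(a, x)))
}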
7671
7672/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7673/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7674/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7675///
7676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
7677#[inline]
7678#[target_feature(enable = "avx512fp16,avx512vl")]
7679#[cfg_attr(test, assert_instr(vrcpph))]
7680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7681pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7682    unsafe { vrcpph_128(a, src, k) }
7683}
7684
7685/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7686/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7687/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7688///
7689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
7690#[inline]
7691#[target_feature(enable = "avx512fp16,avx512vl")]
7692#[cfg_attr(test, assert_instr(vrcpph))]
7693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7694pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7695    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7696}
7697
7698/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7699/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7700///
7701/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
7702#[inline]
7703#[target_feature(enable = "avx512fp16,avx512vl")]
7704#[cfg_attr(test, assert_instr(vrcpph))]
7705#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7706pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
7707    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
7708}
7709
7710/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7711/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7712/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7713///
7714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
7715#[inline]
7716#[target_feature(enable = "avx512fp16,avx512vl")]
7717#[cfg_attr(test, assert_instr(vrcpph))]
7718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7719pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7720    unsafe { vrcpph_256(a, src, k) }
7721}
7722
7723/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7724/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7725/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7726///
7727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
7728#[inline]
7729#[target_feature(enable = "avx512fp16,avx512vl")]
7730#[cfg_attr(test, assert_instr(vrcpph))]
7731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7732pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
7733    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
7734}
7735
7736/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7737/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7738///
7739/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
7740#[inline]
7741#[target_feature(enable = "avx512fp16")]
7742#[cfg_attr(test, assert_instr(vrcpph))]
7743#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7744pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
7745    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
7746}
7747
7748/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7749/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7750/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7751///
7752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
7753#[inline]
7754#[target_feature(enable = "avx512fp16")]
7755#[cfg_attr(test, assert_instr(vrcpph))]
7756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7757pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7758    unsafe { vrcpph_512(a, src, k) }
7759}
7760
7761/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7762/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7763/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7764///
7765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
7766#[inline]
7767#[target_feature(enable = "avx512fp16")]
7768#[cfg_attr(test, assert_instr(vrcpph))]
7769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7770pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
7771    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
7772}
7773
7774/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7775/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
7776/// upper elements of dst.
7777/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7778///
7779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
7780#[inline]
7781#[target_feature(enable = "avx512fp16")]
7782#[cfg_attr(test, assert_instr(vrcpsh))]
7783#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7784pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
7785    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
7786}
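
// Illustrative sketch, not part of this module's API: a hypothetical helper showing the
// scalar lane layout. Only lane 0 of b is used; the remaining lanes come from a.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn rcp_sh_lane_layout() -> __m128h {
    let a = _mm_set1_ph(9.0);
    let b = _mm_set_sh(4.0);
    // Lane 0 holds an approximation of 1.0 / 4.0 = 0.25 (to within 1.5*2^-12 relative
    // error); lanes 1 through 7 are copied from a, i.e. 9.0.
    _mm_rcp_sh(a, b)
}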
7787
7788/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7789/// store the result in the lower element of dst using writemask k (the element is copied from src when
7790/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7791/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7792///
7793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
7794#[inline]
7795#[target_feature(enable = "avx512fp16")]
7796#[cfg_attr(test, assert_instr(vrcpsh))]
7797#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7798pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7799    unsafe { vrcpsh(a, b, src, k) }
7800}
7801
7802/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7803/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
7804/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7805/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7806///
7807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
7808#[inline]
7809#[target_feature(enable = "avx512fp16")]
7810#[cfg_attr(test, assert_instr(vrcpsh))]
7811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7812pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7813    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
7814}
7815
7816/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7817/// elements in a, and store the results in dst.
7818/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7819///
7820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
7821#[inline]
7822#[target_feature(enable = "avx512fp16,avx512vl")]
7823#[cfg_attr(test, assert_instr(vrsqrtph))]
7824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7825pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
7826    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
7827}
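
// Illustrative sketch, not part of this module's API: as with _mm_rcp_ph, the result is a
// seed-quality approximation. This hypothetical helper refines it with one Newton-Raphson
// step for 1/sqrt(a), x' = 0.5 * x * (3 - a * x * x), using the multiply and subtract
// intrinsics defined earlier in this module.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn rsqrt_with_one_newton_step(a: __m128h) -> __m128h {
    let x = _mm_rsqrt_ph(a);
    let axx = _mm_mul_ph(a, _mm_mul_ph(x, x));
    _mm_mul_ph(_mm_mul_ph(x, _mm_set1_ph(0.5)), _mm_sub_ph(_mm_set1_ph(3.0), axx))
}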
7828
7829/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7830/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7831/// the corresponding mask bit is not set).
7832/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7833///
7834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
7835#[inline]
7836#[target_feature(enable = "avx512fp16,avx512vl")]
7837#[cfg_attr(test, assert_instr(vrsqrtph))]
7838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7839pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7840    unsafe { vrsqrtph_128(a, src, k) }
7841}
7842
7843/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7844/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7845/// corresponding mask bit is not set).
7846/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7847///
7848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
7849#[inline]
7850#[target_feature(enable = "avx512fp16,avx512vl")]
7851#[cfg_attr(test, assert_instr(vrsqrtph))]
7852#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7853pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
7854    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
7855}
7856
7857/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7858/// elements in a, and store the results in dst.
7859/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7860///
7861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
7862#[inline]
7863#[target_feature(enable = "avx512fp16,avx512vl")]
7864#[cfg_attr(test, assert_instr(vrsqrtph))]
7865#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7866pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
7867    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
7868}
7869
7870/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7871/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7872/// the corresponding mask bit is not set).
7873/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7874///
7875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
7876#[inline]
7877#[target_feature(enable = "avx512fp16,avx512vl")]
7878#[cfg_attr(test, assert_instr(vrsqrtph))]
7879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7880pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7881    unsafe { vrsqrtph_256(a, src, k) }
7882}
7883
7884/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7885/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7886/// corresponding mask bit is not set).
7887/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7888///
7889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
7890#[inline]
7891#[target_feature(enable = "avx512fp16,avx512vl")]
7892#[cfg_attr(test, assert_instr(vrsqrtph))]
7893#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7894pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
7895    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
7896}
7897
7898/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7899/// elements in a, and store the results in dst.
7900/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7901///
7902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
7903#[inline]
7904#[target_feature(enable = "avx512fp16")]
7905#[cfg_attr(test, assert_instr(vrsqrtph))]
7906#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7907pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
7908    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
7909}
7910
7911/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7912/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7913/// the corresponding mask bit is not set).
7914/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7915///
7916/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
7917#[inline]
7918#[target_feature(enable = "avx512fp16")]
7919#[cfg_attr(test, assert_instr(vrsqrtph))]
7920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7921pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7922    unsafe { vrsqrtph_512(a, src, k) }
7923}
7924
7925/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7926/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7927/// corresponding mask bit is not set).
7928/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7929///
7930/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
7931#[inline]
7932#[target_feature(enable = "avx512fp16")]
7933#[cfg_attr(test, assert_instr(vrsqrtph))]
7934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7935pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
7936    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
7937}
7938
7939/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7940/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
7941/// to the upper elements of dst.
7942/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7943///
7944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
7945#[inline]
7946#[target_feature(enable = "avx512fp16")]
7947#[cfg_attr(test, assert_instr(vrsqrtsh))]
7948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7949pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
7950    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
7951}
7952
7953/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7954/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
7955/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7956/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7957///
7958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
7959#[inline]
7960#[target_feature(enable = "avx512fp16")]
7961#[cfg_attr(test, assert_instr(vrsqrtsh))]
7962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7963pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7964    unsafe { vrsqrtsh(a, b, src, k) }
7965}
7966
7967/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7968/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
7969/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7970/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7971///
7972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
7973#[inline]
7974#[target_feature(enable = "avx512fp16")]
7975#[cfg_attr(test, assert_instr(vrsqrtsh))]
7976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7977pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7978    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
7979}
7980
7981/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7982/// results in dst.
7983///
7984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
7985#[inline]
7986#[target_feature(enable = "avx512fp16,avx512vl")]
7987#[cfg_attr(test, assert_instr(vsqrtph))]
7988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7989pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
7990    unsafe { simd_fsqrt(a) }
7991}
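
// Illustrative sketch, not part of this module's API: a typical use of the packed square
// root, a hypothetical per-lane Euclidean norm sqrt(x*x + y*y) built from the add and
// multiply intrinsics defined earlier in this module (ignoring the overflow issues of the
// naive formulation).
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn lanewise_hypot(x: __m128h, y: __m128h) -> __m128h {
    _mm_sqrt_ph(_mm_add_ph(_mm_mul_ph(x, x), _mm_mul_ph(y, y)))
}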
7992
7993/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7994/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7995///
7996/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
7997#[inline]
7998#[target_feature(enable = "avx512fp16,avx512vl")]
7999#[cfg_attr(test, assert_instr(vsqrtph))]
8000#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8001pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8002    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
8003}
8004
8005/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8006/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8007///
8008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
8009#[inline]
8010#[target_feature(enable = "avx512fp16,avx512vl")]
8011#[cfg_attr(test, assert_instr(vsqrtph))]
8012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8013pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8014    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8015}
8016
8017/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8018/// results in dst.
8019///
8020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
8021#[inline]
8022#[target_feature(enable = "avx512fp16,avx512vl")]
8023#[cfg_attr(test, assert_instr(vsqrtph))]
8024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8025pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
8026    unsafe { simd_fsqrt(a) }
8027}
8028
8029/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8030/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8031///
8032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8033#[inline]
8034#[target_feature(enable = "avx512fp16,avx512vl")]
8035#[cfg_attr(test, assert_instr(vsqrtph))]
8036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8037pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8038    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8039}
8040
8041/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8042/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8043///
8044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8045#[inline]
8046#[target_feature(enable = "avx512fp16,avx512vl")]
8047#[cfg_attr(test, assert_instr(vsqrtph))]
8048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8049pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8050    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8051}
8052
8053/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8054/// results in dst.
8055///
8056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
8057#[inline]
8058#[target_feature(enable = "avx512fp16")]
8059#[cfg_attr(test, assert_instr(vsqrtph))]
8060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8061pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
8062    unsafe { simd_fsqrt(a) }
8063}
8064
8065/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8066/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8067///
8068/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8069#[inline]
8070#[target_feature(enable = "avx512fp16")]
8071#[cfg_attr(test, assert_instr(vsqrtph))]
8072#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8073pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8074    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8075}
8076
8077/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8078/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8079///
8080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8081#[inline]
8082#[target_feature(enable = "avx512fp16")]
8083#[cfg_attr(test, assert_instr(vsqrtph))]
8084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8085pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8086    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8087}
8088
8089/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8090/// results in dst.
8091/// Rounding is done according to the rounding parameter, which can be one of:
8092///
8093/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8094/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8095/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8096/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8097/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8098///
8099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
8100#[inline]
8101#[target_feature(enable = "avx512fp16")]
8102#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8103#[rustc_legacy_const_generics(1)]
8104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8105pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8106    unsafe {
8107        static_assert_rounding!(ROUNDING);
8108        vsqrtph_512(a, ROUNDING)
8109    }
8110}
8111
8112/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8113/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8114/// Rounding is done according to the rounding parameter, which can be one of:
8115///
8116/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8117/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8118/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8119/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8120/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8121///
8122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8123#[inline]
8124#[target_feature(enable = "avx512fp16")]
8125#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8126#[rustc_legacy_const_generics(3)]
8127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8128pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8129    src: __m512h,
8130    k: __mmask32,
8131    a: __m512h,
8132) -> __m512h {
8133    unsafe {
8134        static_assert_rounding!(ROUNDING);
8135        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8136    }
8137}
8138
8139/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8140/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8141/// Rounding is done according to the rounding parameter, which can be one of:
8142///
8143/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8144/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8145/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8146/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8147/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8148///
8149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8150#[inline]
8151#[target_feature(enable = "avx512fp16")]
8152#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8153#[rustc_legacy_const_generics(2)]
8154#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8155pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8156    unsafe {
8157        static_assert_rounding!(ROUNDING);
8158        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8159    }
8160}
8161
8162/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8163/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8164/// elements of dst.
8165///
8166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
8167#[inline]
8168#[target_feature(enable = "avx512fp16")]
8169#[cfg_attr(test, assert_instr(vsqrtsh))]
8170#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8171pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8172    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8173}
8174
8175/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8176/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8177/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8178///
8179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
8180#[inline]
8181#[target_feature(enable = "avx512fp16")]
8182#[cfg_attr(test, assert_instr(vsqrtsh))]
8183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8184pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8185    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8186}
8187
8188/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8189/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8190/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8191///
8192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8193#[inline]
8194#[target_feature(enable = "avx512fp16")]
8195#[cfg_attr(test, assert_instr(vsqrtsh))]
8196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8197pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8198    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
8199}
8200
8201/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8202/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8203/// elements of dst.
8204/// Rounding is done according to the rounding parameter, which can be one of:
8205///
8206/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8207/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8208/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8209/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8210/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8211///
8212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
8213#[inline]
8214#[target_feature(enable = "avx512fp16")]
8215#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8216#[rustc_legacy_const_generics(2)]
8217#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8218pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8219    static_assert_rounding!(ROUNDING);
8220    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
8221}
8222
8223/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8224/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8225/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8226/// Rounding is done according to the rounding parameter, which can be one of:
8227///
8228/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8229/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8230/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8231/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8232/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8233///
8234/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8235#[inline]
8236#[target_feature(enable = "avx512fp16")]
8237#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8238#[rustc_legacy_const_generics(4)]
8239#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8240pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8241    src: __m128h,
8242    k: __mmask8,
8243    a: __m128h,
8244    b: __m128h,
8245) -> __m128h {
8246    unsafe {
8247        static_assert_rounding!(ROUNDING);
8248        vsqrtsh(a, b, src, k, ROUNDING)
8249    }
8250}
8251
8252/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8253/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8254/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8255/// Rounding is done according to the rounding parameter, which can be one of:
8256///
8257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8262///
8263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8264#[inline]
8265#[target_feature(enable = "avx512fp16")]
8266#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8267#[rustc_legacy_const_generics(3)]
8268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8269pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8270    k: __mmask8,
8271    a: __m128h,
8272    b: __m128h,
8273) -> __m128h {
8274    static_assert_rounding!(ROUNDING);
8275    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
8276}
8277
8278/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8279/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8280/// value when inputs are NaN or signed-zero values.
8281///
8282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
8283#[inline]
8284#[target_feature(enable = "avx512fp16,avx512vl")]
8285#[cfg_attr(test, assert_instr(vmaxph))]
8286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8287pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
8288    unsafe { vmaxph_128(a, b) }
8289}
8290
8291/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8292/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8293/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8294/// NaN or signed-zero values.
8295///
8296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8297#[inline]
8298#[target_feature(enable = "avx512fp16,avx512vl")]
8299#[cfg_attr(test, assert_instr(vmaxph))]
8300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8301pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8302    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8303}
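
// Illustrative sketch, not part of this module's API: because the comparison follows the
// vmaxph convention rather than IEEE 754 maximum, a NaN in either input generally yields
// the corresponding element of b. The hypothetical helper below prefers the non-NaN
// operand instead, assuming the _mm_cmp_ph_mask intrinsic and the _CMP_ORD_Q predicate
// defined elsewhere in this crate.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn max_prefer_non_nan(a: __m128h, b: __m128h) -> __m128h {
    // Where b is NaN (not ordered with itself), keep the element of a; elsewhere take the
    // vmaxph result, which already falls back to b when a is NaN.
    let b_is_ordered = _mm_cmp_ph_mask::<{ _CMP_ORD_Q }>(b, b);
    _mm_mask_max_ph(a, b_is_ordered, a, b)
}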
8304
8305/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8306/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8307/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8308/// NaN or signed-zero values.
8309///
8310/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8311#[inline]
8312#[target_feature(enable = "avx512fp16,avx512vl")]
8313#[cfg_attr(test, assert_instr(vmaxph))]
8314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8315pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8316    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8317}
8318
8319/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8320/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8321/// value when inputs are NaN or signed-zero values.
8322///
8323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
8324#[inline]
8325#[target_feature(enable = "avx512fp16,avx512vl")]
8326#[cfg_attr(test, assert_instr(vmaxph))]
8327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8328pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
8329    unsafe { vmaxph_256(a, b) }
8330}
8331
8332/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8333/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8334/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8335/// NaN or signed-zero values.
8336///
8337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8338#[inline]
8339#[target_feature(enable = "avx512fp16,avx512vl")]
8340#[cfg_attr(test, assert_instr(vmaxph))]
8341#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8342pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8343    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8344}
8345
8346/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8347/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8348/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8349/// NaN or signed-zero values.
8350///
8351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8352#[inline]
8353#[target_feature(enable = "avx512fp16,avx512vl")]
8354#[cfg_attr(test, assert_instr(vmaxph))]
8355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8356pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8357    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8358}
8359
8360/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8361/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8362/// value when inputs are NaN or signed-zero values.
8363///
8364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
8365#[inline]
8366#[target_feature(enable = "avx512fp16")]
8367#[cfg_attr(test, assert_instr(vmaxph))]
8368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8369pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
8370    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8371}
8372
8373/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8374/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8375/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8376/// NaN or signed-zero values.
8377///
8378/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8379#[inline]
8380#[target_feature(enable = "avx512fp16")]
8381#[cfg_attr(test, assert_instr(vmaxph))]
8382#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8383pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8384    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8385}
8386
8387/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8388/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8389/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8390/// NaN or signed-zero values.
8391///
8392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8393#[inline]
8394#[target_feature(enable = "avx512fp16")]
8395#[cfg_attr(test, assert_instr(vmaxph))]
8396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8397pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8398    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8399}
8400
8401/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8402/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8403/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8404/// NaN or signed-zero values.
8405///
8406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
8407#[inline]
8408#[target_feature(enable = "avx512fp16")]
8409#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8410#[rustc_legacy_const_generics(2)]
8411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8412pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8413    unsafe {
8414        static_assert_sae!(SAE);
8415        vmaxph_512(a, b, SAE)
8416    }
8417}
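
// Illustrative sketch, not part of this module's API: the max family takes an SAE
// (suppress-all-exceptions) parameter rather than a full rounding mode, since no rounding
// occurs. The hypothetical helper below suppresses exception reporting; passing
// _MM_FROUND_CUR_DIRECTION instead keeps the default exception behaviour.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn max_without_exceptions(a: __m512h, b: __m512h) -> __m512h {
    _mm512_max_round_ph::<{ _MM_FROUND_NO_EXC }>(a, b)
}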
8418
8419/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8420/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8421/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8422/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8423///
8424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8425#[inline]
8426#[target_feature(enable = "avx512fp16")]
8427#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8428#[rustc_legacy_const_generics(4)]
8429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8430pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8431    src: __m512h,
8432    k: __mmask32,
8433    a: __m512h,
8434    b: __m512h,
8435) -> __m512h {
8436    unsafe {
8437        static_assert_sae!(SAE);
8438        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8439    }
8440}
8441
8442/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8443/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8444/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8445/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8446///
8447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8448#[inline]
8449#[target_feature(enable = "avx512fp16")]
8450#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8451#[rustc_legacy_const_generics(3)]
8452#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8453pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8454    unsafe {
8455        static_assert_sae!(SAE);
8456        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8457    }
8458}
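// Illustrative sketch (not part of the API): the `*_round` variants take the SAE flag as a
// const generic, validated by `static_assert_sae!`. Passing `_MM_FROUND_NO_EXC` suppresses
// floating-point exceptions; `_MM_FROUND_CUR_DIRECTION` keeps the default behaviour (compare
// `_mm512_min_ph` below, which simply forwards to its `_round` form with that value). The
// helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn max_without_exceptions(a: __m512h, b: __m512h) -> __m512h {
//         _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b)
//     }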
8459
8460/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8461/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8462/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8463/// when inputs are NaN or signed-zero values.
8464///
8465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8466#[inline]
8467#[target_feature(enable = "avx512fp16,avx512vl")]
8468#[cfg_attr(test, assert_instr(vmaxsh))]
8469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8470pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8471    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8472}
8473
8474/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8475/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8476/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8477/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8478///
8479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
8480#[inline]
8481#[target_feature(enable = "avx512fp16,avx512vl")]
8482#[cfg_attr(test, assert_instr(vmaxsh))]
8483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8484pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8485    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8486}
8487
8488/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8489/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8490/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8491/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8492///
8493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8494#[inline]
8495#[target_feature(enable = "avx512fp16,avx512vl")]
8496#[cfg_attr(test, assert_instr(vmaxsh))]
8497#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8498pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8499    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
8500}
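// Illustrative sketch (not part of the API): the scalar `_sh` forms only compute in lane 0 and
// pass the upper seven lanes of `a` through unchanged, so for the unmasked case
// dst[0] = max(a[0], b[0]) and dst[1..8] = a[1..8]. The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn scalar_max_demo() -> __m128h {
//         let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, -1.0);
//         let b = _mm_set1_ph(0.5);
//         _mm_max_sh(a, b) // lane 0 becomes 0.5, lanes 1..8 stay 1.0..=7.0
//     }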
8501
8502/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8503/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8504/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8505/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8506///
8507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8508#[inline]
8509#[target_feature(enable = "avx512fp16,avx512vl")]
8510#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8511#[rustc_legacy_const_generics(2)]
8512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8513pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8514    static_assert_sae!(SAE);
8515    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8516}
8517
8518/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8519/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8520/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8521/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8522/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8523///
8524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
8525#[inline]
8526#[target_feature(enable = "avx512fp16,avx512vl")]
8527#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8528#[rustc_legacy_const_generics(4)]
8529#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8530pub fn _mm_mask_max_round_sh<const SAE: i32>(
8531    src: __m128h,
8532    k: __mmask8,
8533    a: __m128h,
8534    b: __m128h,
8535) -> __m128h {
8536    unsafe {
8537        static_assert_sae!(SAE);
8538        vmaxsh(a, b, src, k, SAE)
8539    }
8540}
8541
8542/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8543/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8544/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8545/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8546/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8547///
8548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
8549#[inline]
8550#[target_feature(enable = "avx512fp16,avx512vl")]
8551#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8552#[rustc_legacy_const_generics(3)]
8553#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8554pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8555    static_assert_sae!(SAE);
8556    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8557}
8558
8559/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8560/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8561/// when inputs are NaN or signed-zero values.
8562///
8563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
8564#[inline]
8565#[target_feature(enable = "avx512fp16,avx512vl")]
8566#[cfg_attr(test, assert_instr(vminph))]
8567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8568pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
8569    unsafe { vminph_128(a, b) }
8570}
8571
8572/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8573/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8574/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8575/// NaN or signed-zero values.
8576///
8577/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8578#[inline]
8579#[target_feature(enable = "avx512fp16,avx512vl")]
8580#[cfg_attr(test, assert_instr(vminph))]
8581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8582pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8583    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8584}
8585
8586/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8587/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8588/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8589/// NaN or signed-zero values.
8590///
8591/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
8592#[inline]
8593#[target_feature(enable = "avx512fp16,avx512vl")]
8594#[cfg_attr(test, assert_instr(vminph))]
8595#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8596pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8597    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8598}
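// Note on the non-IEEE behaviour mentioned above (hedged, following the long-standing x86
// MIN/MAX convention): the comparison is effectively `if a < b { a } else { b }`, so when
// either input is NaN, or when -0.0 is compared with +0.0, the second operand `b` is returned
// rather than the IEEE 754 minimum. Illustrative, hypothetical helper:
//
//     #[target_feature(enable = "avx512fp16")]
//     fn min_nan_demo() -> __m128h {
//         let a = _mm_set1_ph(f16::NAN);
//         let b = _mm_set1_ph(1.0);
//         _mm_min_ph(a, b) // expected: every lane is 1.0, the second operand
//     }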
8599
8600/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8601/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8602/// when inputs are NaN or signed-zero values.
8603///
8604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
8605#[inline]
8606#[target_feature(enable = "avx512fp16,avx512vl")]
8607#[cfg_attr(test, assert_instr(vminph))]
8608#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8609pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
8610    unsafe { vminph_256(a, b) }
8611}
8612
8613/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8614/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8615/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8616/// NaN or signed-zero values.
8617///
8618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8619#[inline]
8620#[target_feature(enable = "avx512fp16,avx512vl")]
8621#[cfg_attr(test, assert_instr(vminph))]
8622#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8623pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8624    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8625}
8626
8627/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8628/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8629/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8630/// NaN or signed-zero values.
8631///
8632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8633#[inline]
8634#[target_feature(enable = "avx512fp16,avx512vl")]
8635#[cfg_attr(test, assert_instr(vminph))]
8636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8637pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8638    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8639}
8640
8641/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8642/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8643/// when inputs are NaN or signed-zero values.
8644///
8645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8646#[inline]
8647#[target_feature(enable = "avx512fp16")]
8648#[cfg_attr(test, assert_instr(vminph))]
8649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8650pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8651    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8652}
8653
8654/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8655/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8656/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8657/// NaN or signed-zero values.
8658///
8659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8660#[inline]
8661#[target_feature(enable = "avx512fp16")]
8662#[cfg_attr(test, assert_instr(vminph))]
8663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8664pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8665    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8666}
8667
8668/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8669/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8670/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8671/// NaN or signed-zero values.
8672///
8673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8674#[inline]
8675#[target_feature(enable = "avx512fp16")]
8676#[cfg_attr(test, assert_instr(vminph))]
8677#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8678pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8679    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8680}
8681
8682/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8683/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8684/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8685///
8686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8687#[inline]
8688#[target_feature(enable = "avx512fp16")]
8689#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8690#[rustc_legacy_const_generics(2)]
8691#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8692pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8693    unsafe {
8694        static_assert_sae!(SAE);
8695        vminph_512(a, b, SAE)
8696    }
8697}
8698
8699/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8700/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8701/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8702/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8703///
8704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8705#[inline]
8706#[target_feature(enable = "avx512fp16")]
8707#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8708#[rustc_legacy_const_generics(4)]
8709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8710pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8711    src: __m512h,
8712    k: __mmask32,
8713    a: __m512h,
8714    b: __m512h,
8715) -> __m512h {
8716    unsafe {
8717        static_assert_sae!(SAE);
8718        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8719    }
8720}
8721
8722/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8723/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8724/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8725/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8726///
8727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
8728#[inline]
8729#[target_feature(enable = "avx512fp16")]
8730#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8731#[rustc_legacy_const_generics(3)]
8732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8733pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8734    unsafe {
8735        static_assert_sae!(SAE);
8736        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8737    }
8738}
8739
8740/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8741/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8742/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8743/// inputs are NaN or signed-zero values.
8744///
8745/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8746#[inline]
8747#[target_feature(enable = "avx512fp16,avx512vl")]
8748#[cfg_attr(test, assert_instr(vminsh))]
8749#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8750pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8751    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8752}
8753
8754/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8755/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8756/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8757/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8758///
8759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8760#[inline]
8761#[target_feature(enable = "avx512fp16,avx512vl")]
8762#[cfg_attr(test, assert_instr(vminsh))]
8763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8764pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8765    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8766}
8767
8768/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8769/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8770/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8771/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8772///
8773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
8774#[inline]
8775#[target_feature(enable = "avx512fp16,avx512vl")]
8776#[cfg_attr(test, assert_instr(vminsh))]
8777#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8778pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8779    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
8780}
8781
8782/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8783/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8784/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8785/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8786///
8787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
8788#[inline]
8789#[target_feature(enable = "avx512fp16,avx512vl")]
8790#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8791#[rustc_legacy_const_generics(2)]
8792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8793pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8794    static_assert_sae!(SAE);
8795    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8796}
8797
8798/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8799/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8800/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8801/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8802/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8803///
8804/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
8805#[inline]
8806#[target_feature(enable = "avx512fp16,avx512vl")]
8807#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8808#[rustc_legacy_const_generics(4)]
8809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8810pub fn _mm_mask_min_round_sh<const SAE: i32>(
8811    src: __m128h,
8812    k: __mmask8,
8813    a: __m128h,
8814    b: __m128h,
8815) -> __m128h {
8816    unsafe {
8817        static_assert_sae!(SAE);
8818        vminsh(a, b, src, k, SAE)
8819    }
8820}
8821
8822/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8823/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8824/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8825/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8826/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8827///
8828/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
8829#[inline]
8830#[target_feature(enable = "avx512fp16,avx512vl")]
8831#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8832#[rustc_legacy_const_generics(3)]
8833#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8834pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8835    static_assert_sae!(SAE);
8836    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8837}
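// Illustrative sketch (not part of the API): the scalar rounding variants combine a mask with
// the SAE const generic. Here lane 0 of the result is min(a[0], b[0]) only if bit 0 of `k` is
// set, otherwise 0.0; lanes 1..8 always come from `a`. The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn scalar_min_no_exc(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
//         _mm_maskz_min_round_sh::<_MM_FROUND_NO_EXC>(k, a, b)
//     }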
8838
8839/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8840/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8841/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8842///
8843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
8844#[inline]
8845#[target_feature(enable = "avx512fp16,avx512vl")]
8846#[cfg_attr(test, assert_instr(vgetexpph))]
8847#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8848pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
8849    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
8850}
8851
8852/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8853/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8854/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8855/// `floor(log2(x))` for each element.
8856///
8857/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
8858#[inline]
8859#[target_feature(enable = "avx512fp16,avx512vl")]
8860#[cfg_attr(test, assert_instr(vgetexpph))]
8861#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8862pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8863    unsafe { vgetexpph_128(a, src, k) }
8864}
8865
8866/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8867/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8868/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8869/// `floor(log2(x))` for each element.
8870///
8871/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
8872#[inline]
8873#[target_feature(enable = "avx512fp16,avx512vl")]
8874#[cfg_attr(test, assert_instr(vgetexpph))]
8875#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8876pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
8877    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
8878}
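// Worked example (illustrative): `getexp` returns floor(log2(|x|)) as a half-precision value,
// i.e. the unbiased exponent of each element. For inputs 8.0, 0.5 and 6.0 the results are
// 3.0, -1.0 and 2.0 respectively. The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getexp_demo() -> __m128h {
//         let a = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 6.0, 0.5, 8.0);
//         _mm_getexp_ph(a) // first three lanes: 3.0, -1.0, 2.0; remaining lanes: 0.0
//     }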
8879
8880/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8881/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8882/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8883///
8884/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
8885#[inline]
8886#[target_feature(enable = "avx512fp16,avx512vl")]
8887#[cfg_attr(test, assert_instr(vgetexpph))]
8888#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8889pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
8890    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
8891}
8892
8893/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8894/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8895/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8896/// `floor(log2(x))` for each element.
8897///
8898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
8899#[inline]
8900#[target_feature(enable = "avx512fp16,avx512vl")]
8901#[cfg_attr(test, assert_instr(vgetexpph))]
8902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8903pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8904    unsafe { vgetexpph_256(a, src, k) }
8905}
8906
8907/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8908/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8909/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8910/// `floor(log2(x))` for each element.
8911///
8912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
8913#[inline]
8914#[target_feature(enable = "avx512fp16,avx512vl")]
8915#[cfg_attr(test, assert_instr(vgetexpph))]
8916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8917pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
8918    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
8919}
8920
8921/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8922/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8923/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8924///
8925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
8926#[inline]
8927#[target_feature(enable = "avx512fp16")]
8928#[cfg_attr(test, assert_instr(vgetexpph))]
8929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8930pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
8931    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
8932}
8933
8934/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8935/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8936/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8937/// `floor(log2(x))` for each element.
8938///
8939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
8940#[inline]
8941#[target_feature(enable = "avx512fp16")]
8942#[cfg_attr(test, assert_instr(vgetexpph))]
8943#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8944pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8945    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
8946}
8947
8948/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8949/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8950/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8951/// `floor(log2(x))` for each element.
8952///
8953/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
8954#[inline]
8955#[target_feature(enable = "avx512fp16")]
8956#[cfg_attr(test, assert_instr(vgetexpph))]
8957#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8958pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
8959    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
8960}
8961
8962/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8963/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8964/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
8965/// by passing _MM_FROUND_NO_EXC in the sae parameter.
8966///
8967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
8968#[inline]
8969#[target_feature(enable = "avx512fp16")]
8970#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8971#[rustc_legacy_const_generics(1)]
8972#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8973pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
8974    static_assert_sae!(SAE);
8975    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
8976}
8977
8978/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8979/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8980/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8981/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8982///
8983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
8984#[inline]
8985#[target_feature(enable = "avx512fp16")]
8986#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8987#[rustc_legacy_const_generics(3)]
8988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8989pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
8990    src: __m512h,
8991    k: __mmask32,
8992    a: __m512h,
8993) -> __m512h {
8994    unsafe {
8995        static_assert_sae!(SAE);
8996        vgetexpph_512(a, src, k, SAE)
8997    }
8998}
8999
9000/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9001/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9002/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9003/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9004///
9005/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
9006#[inline]
9007#[target_feature(enable = "avx512fp16")]
9008#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9009#[rustc_legacy_const_generics(2)]
9010#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9011pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
9012    static_assert_sae!(SAE);
9013    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9014}
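// Illustrative sketch (not part of the API): at 512-bit width the exponent extraction can also
// suppress exceptions via the SAE const generic, optionally combined with a zeromask. The
// helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getexp_no_exc(k: __mmask32, a: __m512h) -> __m512h {
//         _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(k, a)
//     }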
9015
9016/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9017/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9018/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9019/// calculates `floor(log2(x))` for the lower element.
9020///
9021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
9022#[inline]
9023#[target_feature(enable = "avx512fp16")]
9024#[cfg_attr(test, assert_instr(vgetexpsh))]
9025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9026pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9027    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
9028}
9029
9030/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9031/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9032/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9033/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9034/// for the lower element.
9035///
9036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9037#[inline]
9038#[target_feature(enable = "avx512fp16")]
9039#[cfg_attr(test, assert_instr(vgetexpsh))]
9040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9041pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9042    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9043}
9044
9045/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9046/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9047/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9048/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9049/// lower element.
9050///
9051/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9052#[inline]
9053#[target_feature(enable = "avx512fp16")]
9054#[cfg_attr(test, assert_instr(vgetexpsh))]
9055#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9056pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9057    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
9058}
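// Illustrative sketch (not part of the API): the scalar form takes the exponent of `b[0]` and
// keeps the upper seven lanes of `a`, so dst[0] = floor(log2(|b[0]|)) and dst[1..8] = a[1..8].
// The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getexp_scalar_demo(a: __m128h) -> __m128h {
//         let b = _mm_set_sh(32.0);
//         _mm_getexp_sh(a, b) // lane 0 becomes 5.0, lanes 1..8 come from `a`
//     }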
9059
9060/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9061/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9062/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9063/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9064/// in the sae parameter.
9065///
9066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9067#[inline]
9068#[target_feature(enable = "avx512fp16")]
9069#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9070#[rustc_legacy_const_generics(2)]
9071#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9072pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9073    static_assert_sae!(SAE);
9074    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9075}
9076
9077/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9078/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9079/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9080/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9081/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9082///
9083/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9084#[inline]
9085#[target_feature(enable = "avx512fp16")]
9086#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9087#[rustc_legacy_const_generics(4)]
9088#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9089pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9090    src: __m128h,
9091    k: __mmask8,
9092    a: __m128h,
9093    b: __m128h,
9094) -> __m128h {
9095    unsafe {
9096        static_assert_sae!(SAE);
9097        vgetexpsh(a, b, src, k, SAE)
9098    }
9099}
9100
9101/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9102/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9103/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9104/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9105/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9106///
9107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9108#[inline]
9109#[target_feature(enable = "avx512fp16")]
9110#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9111#[rustc_legacy_const_generics(3)]
9112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9113pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9114    static_assert_sae!(SAE);
9115    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9116}
9117
9118/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9119/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9120/// on the interval range defined by norm and the sign depends on sign and the source sign.
9121///
9122/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9123///
9124///     _MM_MANT_NORM_1_2     // interval [1, 2)
9125///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9126///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9127///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9128///
9129/// The sign is determined by SIGN, which can take the following values:
9130///
9131///     _MM_MANT_SIGN_src     // sign = sign(src)
9132///     _MM_MANT_SIGN_zero    // sign = 0
9133///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9134///
9135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
9136#[inline]
9137#[target_feature(enable = "avx512fp16,avx512vl")]
9138#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9139#[rustc_legacy_const_generics(1, 2)]
9140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9141pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9142    a: __m128h,
9143) -> __m128h {
9144    static_assert_uimm_bits!(NORM, 4);
9145    static_assert_uimm_bits!(SIGN, 2);
9146    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9147}
9148
9149/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9150/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9151/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9152/// by norm and the sign depends on sign and the source sign.
9153///
9154/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9155///
9156///     _MM_MANT_NORM_1_2     // interval [1, 2)
9157///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9158///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9159///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9160///
9161/// The sign is determined by SIGN, which can take the following values:
9162///
9163///     _MM_MANT_SIGN_src     // sign = sign(src)
9164///     _MM_MANT_SIGN_zero    // sign = 0
9165///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9166///
9167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9168#[inline]
9169#[target_feature(enable = "avx512fp16,avx512vl")]
9170#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9171#[rustc_legacy_const_generics(3, 4)]
9172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9173pub fn _mm_mask_getmant_ph<
9174    const NORM: _MM_MANTISSA_NORM_ENUM,
9175    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9176>(
9177    src: __m128h,
9178    k: __mmask8,
9179    a: __m128h,
9180) -> __m128h {
9181    unsafe {
9182        static_assert_uimm_bits!(NORM, 4);
9183        static_assert_uimm_bits!(SIGN, 2);
9184        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9185    }
9186}
9187
9188/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9189/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9190/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9191/// by norm and the sign depends on sign and the source sign.
9192///
9193/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9194///
9195///     _MM_MANT_NORM_1_2     // interval [1, 2)
9196///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9197///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9198///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9199///
9200/// The sign is determined by SIGN, which can take the following values:
9201///
9202///     _MM_MANT_SIGN_src     // sign = sign(src)
9203///     _MM_MANT_SIGN_zero    // sign = 0
9204///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9205///
9206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9207#[inline]
9208#[target_feature(enable = "avx512fp16,avx512vl")]
9209#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9210#[rustc_legacy_const_generics(2, 3)]
9211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9212pub fn _mm_maskz_getmant_ph<
9213    const NORM: _MM_MANTISSA_NORM_ENUM,
9214    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9215>(
9216    k: __mmask8,
9217    a: __m128h,
9218) -> __m128h {
9219    static_assert_uimm_bits!(NORM, 4);
9220    static_assert_uimm_bits!(SIGN, 2);
9221    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9222}
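// Worked example (illustrative): with NORM = `_MM_MANT_NORM_1_2` the significand is scaled into
// [1, 2), and with SIGN = `_MM_MANT_SIGN_zero` the sign bit is cleared. Since
// -12.0 = -1.5 * 2^3, the result for that element is 1.5 (it would be -1.5 with
// `_MM_MANT_SIGN_src`). The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getmant_demo() -> __m128h {
//         let a = _mm_set1_ph(-12.0);
//         _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero>(a) // every lane: 1.5
//     }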
9223
9224/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9225/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9226/// on the interval range defined by norm and the sign depends on sign and the source sign.
9227///
9228/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9229///
9230///     _MM_MANT_NORM_1_2     // interval [1, 2)
9231///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9232///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9233///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9234///
9235/// The sign is determined by SIGN, which can take the following values:
9236///
9237///     _MM_MANT_SIGN_src     // sign = sign(src)
9238///     _MM_MANT_SIGN_zero    // sign = 0
9239///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9240///
9241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9242#[inline]
9243#[target_feature(enable = "avx512fp16,avx512vl")]
9244#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9245#[rustc_legacy_const_generics(1, 2)]
9246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9247pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9248    a: __m256h,
9249) -> __m256h {
9250    static_assert_uimm_bits!(NORM, 4);
9251    static_assert_uimm_bits!(SIGN, 2);
9252    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9253}
9254
9255/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9256/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9257/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9258/// by norm and the sign depends on sign and the source sign.
9259///
9260/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9261///
9262///     _MM_MANT_NORM_1_2     // interval [1, 2)
9263///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9264///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9265///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9266///
9267/// The sign is determined by SIGN, which can take the following values:
9268///
9269///     _MM_MANT_SIGN_src     // sign = sign(src)
9270///     _MM_MANT_SIGN_zero    // sign = 0
9271///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9272///
9273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9274#[inline]
9275#[target_feature(enable = "avx512fp16,avx512vl")]
9276#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9277#[rustc_legacy_const_generics(3, 4)]
9278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9279pub fn _mm256_mask_getmant_ph<
9280    const NORM: _MM_MANTISSA_NORM_ENUM,
9281    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9282>(
9283    src: __m256h,
9284    k: __mmask16,
9285    a: __m256h,
9286) -> __m256h {
9287    unsafe {
9288        static_assert_uimm_bits!(NORM, 4);
9289        static_assert_uimm_bits!(SIGN, 2);
9290        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9291    }
9292}
9293
9294/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9295/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9296/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9297/// by norm and the sign depends on sign and the source sign.
9298///
9299/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9300///
9301///     _MM_MANT_NORM_1_2     // interval [1, 2)
9302///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9303///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9304///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9305///
9306/// The sign is determined by SIGN, which can take the following values:
9307///
9308///     _MM_MANT_SIGN_src     // sign = sign(src)
9309///     _MM_MANT_SIGN_zero    // sign = 0
9310///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9311///
9312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9313#[inline]
9314#[target_feature(enable = "avx512fp16,avx512vl")]
9315#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9316#[rustc_legacy_const_generics(2, 3)]
9317#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9318pub fn _mm256_maskz_getmant_ph<
9319    const NORM: _MM_MANTISSA_NORM_ENUM,
9320    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9321>(
9322    k: __mmask16,
9323    a: __m256h,
9324) -> __m256h {
9325    static_assert_uimm_bits!(NORM, 4);
9326    static_assert_uimm_bits!(SIGN, 2);
9327    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9328}
9329
9330/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9331/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9332/// on the interval range defined by norm and the sign depends on sign and the source sign.
9333///
9334/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9335///
9336///     _MM_MANT_NORM_1_2     // interval [1, 2)
9337///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9338///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9339///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9340///
9341/// The sign is determined by SIGN, which can take the following values:
9342///
9343///     _MM_MANT_SIGN_src     // sign = sign(src)
9344///     _MM_MANT_SIGN_zero    // sign = 0
9345///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9346///
9347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9348#[inline]
9349#[target_feature(enable = "avx512fp16")]
9350#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9351#[rustc_legacy_const_generics(1, 2)]
9352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9353pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9354    a: __m512h,
9355) -> __m512h {
9356    static_assert_uimm_bits!(NORM, 4);
9357    static_assert_uimm_bits!(SIGN, 2);
9358    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9359}
9360
9361/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9362/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9363/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9364/// by norm and the sign depends on sign and the source sign.
9365///
9366/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9367///
9368///     _MM_MANT_NORM_1_2     // interval [1, 2)
9369///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9370///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9371///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9372///
9373/// The sign is determined by SIGN, which can take the following values:
9374///
9375///     _MM_MANT_SIGN_src     // sign = sign(src)
9376///     _MM_MANT_SIGN_zero    // sign = 0
9377///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9378///
9379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9380#[inline]
9381#[target_feature(enable = "avx512fp16")]
9382#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9383#[rustc_legacy_const_generics(3, 4)]
9384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9385pub fn _mm512_mask_getmant_ph<
9386    const NORM: _MM_MANTISSA_NORM_ENUM,
9387    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9388>(
9389    src: __m512h,
9390    k: __mmask32,
9391    a: __m512h,
9392) -> __m512h {
9393    static_assert_uimm_bits!(NORM, 4);
9394    static_assert_uimm_bits!(SIGN, 2);
9395    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9396}
9397
9398/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9399/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9400/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9401/// by norm and the sign depends on sign and the source sign.
9402///
9403/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9404///
9405///     _MM_MANT_NORM_1_2     // interval [1, 2)
9406///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9407///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9408///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9409///
9410/// The sign is determined by SIGN, which can take the following values:
9411///
9412///     _MM_MANT_SIGN_src     // sign = sign(src)
9413///     _MM_MANT_SIGN_zero    // sign = 0
9414///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9415///
9416/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9417#[inline]
9418#[target_feature(enable = "avx512fp16")]
9419#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9420#[rustc_legacy_const_generics(2, 3)]
9421#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9422pub fn _mm512_maskz_getmant_ph<
9423    const NORM: _MM_MANTISSA_NORM_ENUM,
9424    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9425>(
9426    k: __mmask32,
9427    a: __m512h,
9428) -> __m512h {
9429    static_assert_uimm_bits!(NORM, 4);
9430    static_assert_uimm_bits!(SIGN, 2);
9431    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9432}
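// Illustrative sketch (not part of the API): choosing a different normalization interval changes
// the scaling. With `_MM_MANT_NORM_p5_1` the significand lands in [0.5, 1), so an element
// holding 12.0 (= 0.75 * 2^4) becomes 0.75. The helper name is hypothetical.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getmant_p5_1_demo(a: __m512h) -> __m512h {
//         _mm512_getmant_ph::<_MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src>(a)
//     }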
9433
9434/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9435/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9436/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9437/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9438///
9439/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9440///
9441///     _MM_MANT_NORM_1_2     // interval [1, 2)
9442///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9443///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9444///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9445///
9446/// The sign is determined by SIGN, which can take the following values:
9447///
9448///     _MM_MANT_SIGN_src     // sign = sign(src)
9449///     _MM_MANT_SIGN_zero    // sign = 0
9450///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9451///
9454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
9455#[inline]
9456#[target_feature(enable = "avx512fp16")]
9457#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9458#[rustc_legacy_const_generics(1, 2, 3)]
9459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9460pub fn _mm512_getmant_round_ph<
9461    const NORM: _MM_MANTISSA_NORM_ENUM,
9462    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9463    const SAE: i32,
9464>(
9465    a: __m512h,
9466) -> __m512h {
9467    static_assert_uimm_bits!(NORM, 4);
9468    static_assert_uimm_bits!(SIGN, 2);
9469    static_assert_sae!(SAE);
9470    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9471}
9472
9473/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9474/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9475/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9476/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9477/// in the sae parameter.
9478///
9479/// The mantissa is normalized to the interval specified by NORM, which can take the following values:
9480///
9481///     _MM_MANT_NORM_1_2     // interval [1, 2)
9482///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9483///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9484///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9485///
9486/// The sign is determined by sc which can take the following values:
9487///
9488///     _MM_MANT_SIGN_src     // sign = sign(src)
9489///     _MM_MANT_SIGN_zero    // sign = 0
9490///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9491///
9492/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9493///
9494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9495#[inline]
9496#[target_feature(enable = "avx512fp16")]
9497#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9498#[rustc_legacy_const_generics(3, 4, 5)]
9499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9500pub fn _mm512_mask_getmant_round_ph<
9501    const NORM: _MM_MANTISSA_NORM_ENUM,
9502    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9503    const SAE: i32,
9504>(
9505    src: __m512h,
9506    k: __mmask32,
9507    a: __m512h,
9508) -> __m512h {
9509    unsafe {
9510        static_assert_uimm_bits!(NORM, 4);
9511        static_assert_uimm_bits!(SIGN, 2);
9512        static_assert_sae!(SAE);
9513        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9514    }
9515}
9516
9517/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9518/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9519/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9520/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9521/// in the sae parameter
9522///
9523/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9524///
9525///     _MM_MANT_NORM_1_2     // interval [1, 2)
9526///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9527///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9528///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9529///
9530/// The sign is determined by sc which can take the following values:
9531///
9532///     _MM_MANT_SIGN_src     // sign = sign(src)
9533///     _MM_MANT_SIGN_zero    // sign = 0
9534///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9535///
9536/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9537///
9538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9539#[inline]
9540#[target_feature(enable = "avx512fp16")]
9541#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9542#[rustc_legacy_const_generics(2, 3, 4)]
9543#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9544pub fn _mm512_maskz_getmant_round_ph<
9545    const NORM: _MM_MANTISSA_NORM_ENUM,
9546    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9547    const SAE: i32,
9548>(
9549    k: __mmask32,
9550    a: __m512h,
9551) -> __m512h {
9552    static_assert_uimm_bits!(NORM, 4);
9553    static_assert_uimm_bits!(SIGN, 2);
9554    static_assert_sae!(SAE);
9555    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9556}
9557
9558/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9559/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9560/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9561/// on the interval range defined by norm and the sign depends on sign and the source sign.
9562///
9563/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9564///
9565///     _MM_MANT_NORM_1_2     // interval [1, 2)
9566///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9567///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9568///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9569///
9570/// The sign is determined by sc which can take the following values:
9571///
9572///     _MM_MANT_SIGN_src     // sign = sign(src)
9573///     _MM_MANT_SIGN_zero    // sign = 0
9574///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9575///
9576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
9577#[inline]
9578#[target_feature(enable = "avx512fp16")]
9579#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9580#[rustc_legacy_const_generics(2, 3)]
9581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9582pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9583    a: __m128h,
9584    b: __m128h,
9585) -> __m128h {
9586    static_assert_uimm_bits!(NORM, 4);
9587    static_assert_uimm_bits!(SIGN, 2);
9588    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9589}
9590
9591/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9592/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9593/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9594/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9595/// the source sign.
9596///
9597/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9598///
9599///     _MM_MANT_NORM_1_2     // interval [1, 2)
9600///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9601///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9602///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9603///
9604/// The sign is determined by sc which can take the following values:
9605///
9606///     _MM_MANT_SIGN_src     // sign = sign(src)
9607///     _MM_MANT_SIGN_zero    // sign = 0
9608///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9609///
9610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9611#[inline]
9612#[target_feature(enable = "avx512fp16")]
9613#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9614#[rustc_legacy_const_generics(4, 5)]
9615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9616pub fn _mm_mask_getmant_sh<
9617    const NORM: _MM_MANTISSA_NORM_ENUM,
9618    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9619>(
9620    src: __m128h,
9621    k: __mmask8,
9622    a: __m128h,
9623    b: __m128h,
9624) -> __m128h {
9625    static_assert_uimm_bits!(NORM, 4);
9626    static_assert_uimm_bits!(SIGN, 2);
9627    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9628}
9629
9630/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9631/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9632/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9633/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9634/// the source sign.
9635///
9636/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9637///
9638///     _MM_MANT_NORM_1_2     // interval [1, 2)
9639///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9640///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9641///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9642///
9643/// The sign is determined by sc which can take the following values:
9644///
9645///     _MM_MANT_SIGN_src     // sign = sign(src)
9646///     _MM_MANT_SIGN_zero    // sign = 0
9647///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9648///
9649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9650#[inline]
9651#[target_feature(enable = "avx512fp16")]
9652#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9653#[rustc_legacy_const_generics(3, 4)]
9654#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9655pub fn _mm_maskz_getmant_sh<
9656    const NORM: _MM_MANTISSA_NORM_ENUM,
9657    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9658>(
9659    k: __mmask8,
9660    a: __m128h,
9661    b: __m128h,
9662) -> __m128h {
9663    static_assert_uimm_bits!(NORM, 4);
9664    static_assert_uimm_bits!(SIGN, 2);
9665    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
9666}
9667
9668/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9669/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9670/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9671/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9672/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9673///
9674/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9675///
9676///     _MM_MANT_NORM_1_2     // interval [1, 2)
9677///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9678///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9679///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9680///
9681/// The sign is determined by sc which can take the following values:
9682///
9683///     _MM_MANT_SIGN_src     // sign = sign(src)
9684///     _MM_MANT_SIGN_zero    // sign = 0
9685///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9686///
9687/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9688///
9689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9690#[inline]
9691#[target_feature(enable = "avx512fp16")]
9692#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9693#[rustc_legacy_const_generics(2, 3, 4)]
9694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9695pub fn _mm_getmant_round_sh<
9696    const NORM: _MM_MANTISSA_NORM_ENUM,
9697    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9698    const SAE: i32,
9699>(
9700    a: __m128h,
9701    b: __m128h,
9702) -> __m128h {
9703    static_assert_uimm_bits!(NORM, 4);
9704    static_assert_uimm_bits!(SIGN, 2);
9705    static_assert_sae!(SAE);
9706    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9707}
9708
9709/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9710/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9711/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9712/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9713/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9714///
9715/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9716///
9717///     _MM_MANT_NORM_1_2     // interval [1, 2)
9718///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9719///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9720///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9721///
9722/// The sign is determined by sc which can take the following values:
9723///
9724///     _MM_MANT_SIGN_src     // sign = sign(src)
9725///     _MM_MANT_SIGN_zero    // sign = 0
9726///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9727///
9728/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9729///
9730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9731#[inline]
9732#[target_feature(enable = "avx512fp16")]
9733#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9734#[rustc_legacy_const_generics(4, 5, 6)]
9735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9736pub fn _mm_mask_getmant_round_sh<
9737    const NORM: _MM_MANTISSA_NORM_ENUM,
9738    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9739    const SAE: i32,
9740>(
9741    src: __m128h,
9742    k: __mmask8,
9743    a: __m128h,
9744    b: __m128h,
9745) -> __m128h {
9746    unsafe {
9747        static_assert_uimm_bits!(NORM, 4);
9748        static_assert_uimm_bits!(SIGN, 2);
9749        static_assert_sae!(SAE);
9750        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9751    }
9752}
9753
9754/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9755/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9756/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9757/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9758/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9759///
9760/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9761///
9762///     _MM_MANT_NORM_1_2     // interval [1, 2)
9763///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9764///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9765///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9766///
9767/// The sign is determined by sc which can take the following values:
9768///
9769///     _MM_MANT_SIGN_src     // sign = sign(src)
9770///     _MM_MANT_SIGN_zero    // sign = 0
9771///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9772///
9773/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9774///
9775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9776#[inline]
9777#[target_feature(enable = "avx512fp16")]
9778#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9779#[rustc_legacy_const_generics(3, 4, 5)]
9780#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9781pub fn _mm_maskz_getmant_round_sh<
9782    const NORM: _MM_MANTISSA_NORM_ENUM,
9783    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9784    const SAE: i32,
9785>(
9786    k: __mmask8,
9787    a: __m128h,
9788    b: __m128h,
9789) -> __m128h {
9790    static_assert_uimm_bits!(NORM, 4);
9791    static_assert_uimm_bits!(SIGN, 2);
9792    static_assert_sae!(SAE);
9793    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9794}
9795
9796/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9797/// specified by imm8, and store the results in dst.
9798///
9799/// Rounding is done according to the imm8 parameter, which can be one of:
9800///
9801/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9802/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9803/// * [`_MM_FROUND_TO_POS_INF`] : round up
9804/// * [`_MM_FROUND_TO_ZERO`] : truncate
9805/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9806///
9807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
9808#[inline]
9809#[target_feature(enable = "avx512fp16,avx512vl")]
9810#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9811#[rustc_legacy_const_generics(1)]
9812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9813pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9814    static_assert_uimm_bits!(IMM8, 8);
9815    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9816}
9817
9818/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9819/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9820/// the corresponding mask bit is not set).
9821///
9822/// Rounding is done according to the imm8 parameter, which can be one of:
9823///
9824/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9825/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9826/// * [`_MM_FROUND_TO_POS_INF`] : round up
9827/// * [`_MM_FROUND_TO_ZERO`] : truncate
9828/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9829///
9830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
9831#[inline]
9832#[target_feature(enable = "avx512fp16,avx512vl")]
9833#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9834#[rustc_legacy_const_generics(3)]
9835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9836pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
9837    unsafe {
9838        static_assert_uimm_bits!(IMM8, 8);
9839        vrndscaleph_128(a, IMM8, src, k)
9840    }
9841}
9842
9843/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9844/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9845/// mask bit is not set).
9846///
9847/// Rounding is done according to the imm8 parameter, which can be one of:
9848///
9849/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9850/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9851/// * [`_MM_FROUND_TO_POS_INF`] : round up
9852/// * [`_MM_FROUND_TO_ZERO`] : truncate
9853/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9854///
9855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9856#[inline]
9857#[target_feature(enable = "avx512fp16,avx512vl")]
9858#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9859#[rustc_legacy_const_generics(2)]
9860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9861pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9862    static_assert_uimm_bits!(IMM8, 8);
9863    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9864}
9865
9866/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9867/// specified by imm8, and store the results in dst.
9868///
9869/// Rounding is done according to the imm8 parameter, which can be one of:
9870///
9871/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9872/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9873/// * [`_MM_FROUND_TO_POS_INF`] : round up
9874/// * [`_MM_FROUND_TO_ZERO`] : truncate
9875/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9876///
9877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9878#[inline]
9879#[target_feature(enable = "avx512fp16,avx512vl")]
9880#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9881#[rustc_legacy_const_generics(1)]
9882#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9883pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9884    static_assert_uimm_bits!(IMM8, 8);
9885    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9886}
9887
9888/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9889/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9890/// the corresponding mask bit is not set).
9891///
9892/// Rounding is done according to the imm8 parameter, which can be one of:
9893///
9894/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9895/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9896/// * [`_MM_FROUND_TO_POS_INF`] : round up
9897/// * [`_MM_FROUND_TO_ZERO`] : truncate
9898/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9899///
9900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9901#[inline]
9902#[target_feature(enable = "avx512fp16,avx512vl")]
9903#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9904#[rustc_legacy_const_generics(3)]
9905#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9906pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9907    src: __m256h,
9908    k: __mmask16,
9909    a: __m256h,
9910) -> __m256h {
9911    unsafe {
9912        static_assert_uimm_bits!(IMM8, 8);
9913        vrndscaleph_256(a, IMM8, src, k)
9914    }
9915}
9916
9917/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9918/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9919/// mask bit is not set).
9920///
9921/// Rounding is done according to the imm8 parameter, which can be one of:
9922///
9923/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9924/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9925/// * [`_MM_FROUND_TO_POS_INF`] : round up
9926/// * [`_MM_FROUND_TO_ZERO`] : truncate
9927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9928///
9929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
9930#[inline]
9931#[target_feature(enable = "avx512fp16,avx512vl")]
9932#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9933#[rustc_legacy_const_generics(2)]
9934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9935pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
9936    static_assert_uimm_bits!(IMM8, 8);
9937    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
9938}
9939
9940/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9941/// specified by imm8, and store the results in dst.
9942///
9943/// Rounding is done according to the imm8 parameter, which can be one of:
9944///
9945/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9946/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9947/// * [`_MM_FROUND_TO_POS_INF`] : round up
9948/// * [`_MM_FROUND_TO_ZERO`] : truncate
9949/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9950///
9951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
9952#[inline]
9953#[target_feature(enable = "avx512fp16")]
9954#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9955#[rustc_legacy_const_generics(1)]
9956#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9957pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
9958    static_assert_uimm_bits!(IMM8, 8);
9959    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
9960}
9961
9962/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9963/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9964/// the corresponding mask bit is not set).
9965///
9966/// Rounding is done according to the imm8 parameter, which can be one of:
9967///
9968/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9969/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9970/// * [`_MM_FROUND_TO_POS_INF`] : round up
9971/// * [`_MM_FROUND_TO_ZERO`] : truncate
9972/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9973///
9974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
9975#[inline]
9976#[target_feature(enable = "avx512fp16")]
9977#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9978#[rustc_legacy_const_generics(3)]
9979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9980pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
9981    src: __m512h,
9982    k: __mmask32,
9983    a: __m512h,
9984) -> __m512h {
9985    static_assert_uimm_bits!(IMM8, 8);
9986    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9987}
9988
9989/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9990/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9991/// mask bit is not set).
9992///
9993/// Rounding is done according to the imm8 parameter, which can be one of:
9994///
9995/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9996/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9997/// * [`_MM_FROUND_TO_POS_INF`] : round up
9998/// * [`_MM_FROUND_TO_ZERO`] : truncate
9999/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10000///
10001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
10002#[inline]
10003#[target_feature(enable = "avx512fp16")]
10004#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10005#[rustc_legacy_const_generics(2)]
10006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10007pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10008    static_assert_uimm_bits!(IMM8, 8);
10009    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10010}
10011
10012/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10013/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10014/// in the sae parameter
10015///
10016/// Rounding is done according to the imm8 parameter, which can be one of:
10017///
10018/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10019/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10020/// * [`_MM_FROUND_TO_POS_INF`] : round up
10021/// * [`_MM_FROUND_TO_ZERO`] : truncate
10022/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10023///
10024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
10025#[inline]
10026#[target_feature(enable = "avx512fp16")]
10027#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10028#[rustc_legacy_const_generics(1, 2)]
10029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10030pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10031    static_assert_uimm_bits!(IMM8, 8);
10032    static_assert_sae!(SAE);
10033    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10034}
10035
10036/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10037/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10038/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10039/// in the sae parameter
10040///
10041/// Rounding is done according to the imm8 parameter, which can be one of:
10042///
10043/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10044/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10045/// * [`_MM_FROUND_TO_POS_INF`] : round up
10046/// * [`_MM_FROUND_TO_ZERO`] : truncate
10047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10048///
10049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10050#[inline]
10051#[target_feature(enable = "avx512fp16")]
10052#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10053#[rustc_legacy_const_generics(3, 4)]
10054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10055pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10056    src: __m512h,
10057    k: __mmask32,
10058    a: __m512h,
10059) -> __m512h {
10060    unsafe {
10061        static_assert_uimm_bits!(IMM8, 8);
10062        static_assert_sae!(SAE);
10063        vrndscaleph_512(a, IMM8, src, k, SAE)
10064    }
10065}
10066
10067/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10068/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10069/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10070///
10071/// Rounding is done according to the imm8 parameter, which can be one of:
10072///
10073/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10074/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10075/// * [`_MM_FROUND_TO_POS_INF`] : round up
10076/// * [`_MM_FROUND_TO_ZERO`] : truncate
10077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10078///
10079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
10080#[inline]
10081#[target_feature(enable = "avx512fp16")]
10082#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10083#[rustc_legacy_const_generics(2, 3)]
10084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10085pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10086    k: __mmask32,
10087    a: __m512h,
10088) -> __m512h {
10089    static_assert_uimm_bits!(IMM8, 8);
10090    static_assert_sae!(SAE);
10091    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10092}
10093
10094/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10095/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10096/// from a to the upper elements of dst.
10097///
10098/// Rounding is done according to the imm8 parameter, which can be one of:
10099///
10100/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10101/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10102/// * [`_MM_FROUND_TO_POS_INF`] : round up
10103/// * [`_MM_FROUND_TO_ZERO`] : truncate
10104/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10105///
10106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
10107#[inline]
10108#[target_feature(enable = "avx512fp16")]
10109#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10110#[rustc_legacy_const_generics(2)]
10111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10112pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10113    static_assert_uimm_bits!(IMM8, 8);
10114    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10115}
10116
10117/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10118/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10119/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10120///
10121/// Rounding is done according to the imm8 parameter, which can be one of:
10122///
10123/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10124/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10125/// * [`_MM_FROUND_TO_POS_INF`] : round up
10126/// * [`_MM_FROUND_TO_ZERO`] : truncate
10127/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10128///
10129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
10130#[inline]
10131#[target_feature(enable = "avx512fp16")]
10132#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10133#[rustc_legacy_const_generics(4)]
10134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10135pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
10136    src: __m128h,
10137    k: __mmask8,
10138    a: __m128h,
10139    b: __m128h,
10140) -> __m128h {
10141    static_assert_uimm_bits!(IMM8, 8);
10142    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10143}
10144
10145/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10146/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10147/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10148///
10149/// Rounding is done according to the imm8 parameter, which can be one of:
10150///
10151/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10152/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10153/// * [`_MM_FROUND_TO_POS_INF`] : round up
10154/// * [`_MM_FROUND_TO_ZERO`] : truncate
10155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10156///
10157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
10158#[inline]
10159#[target_feature(enable = "avx512fp16")]
10160#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10161#[rustc_legacy_const_generics(3)]
10162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10163pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10164    static_assert_uimm_bits!(IMM8, 8);
10165    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
10166}
10167
10168/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10169/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10170/// from a to the upper elements of dst.
10171///
10172/// Rounding is done according to the imm8 parameter, which can be one of:
10173///
10174/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10175/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10176/// * [`_MM_FROUND_TO_POS_INF`] : round up
10177/// * [`_MM_FROUND_TO_ZERO`] : truncate
10178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10179///
10180/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10181///
10182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
10183#[inline]
10184#[target_feature(enable = "avx512fp16")]
10185#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10186#[rustc_legacy_const_generics(2, 3)]
10187#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10188pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10189    static_assert_uimm_bits!(IMM8, 8);
10190    static_assert_sae!(SAE);
10191    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10192}
10193
10194/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10195/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10196/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10197///
10198/// Rounding is done according to the imm8 parameter, which can be one of:
10199///
10200/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10201/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10202/// * [`_MM_FROUND_TO_POS_INF`] : round up
10203/// * [`_MM_FROUND_TO_ZERO`] : truncate
10204/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10205///
10206/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10207///
10208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10209#[inline]
10210#[target_feature(enable = "avx512fp16")]
10211#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10212#[rustc_legacy_const_generics(4, 5)]
10213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10214pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10215    src: __m128h,
10216    k: __mmask8,
10217    a: __m128h,
10218    b: __m128h,
10219) -> __m128h {
10220    unsafe {
10221        static_assert_uimm_bits!(IMM8, 8);
10222        static_assert_sae!(SAE);
10223        vrndscalesh(a, b, src, k, IMM8, SAE)
10224    }
10225}
10226
10227/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10228/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10229/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10230///
10231/// Rounding is done according to the imm8 parameter, which can be one of:
10232///
10233/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10234/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10235/// * [`_MM_FROUND_TO_POS_INF`] : round up
10236/// * [`_MM_FROUND_TO_ZERO`] : truncate
10237/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10238///
10239/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10240///
10241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10242#[inline]
10243#[target_feature(enable = "avx512fp16")]
10244#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10245#[rustc_legacy_const_generics(3, 4)]
10246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10247pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10248    k: __mmask8,
10249    a: __m128h,
10250    b: __m128h,
10251) -> __m128h {
10252    static_assert_uimm_bits!(IMM8, 8);
10253    static_assert_sae!(SAE);
10254    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
10255}
10256
10257/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10258/// the results in dst.
10259///
10260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
10261#[inline]
10262#[target_feature(enable = "avx512fp16,avx512vl")]
10263#[cfg_attr(test, assert_instr(vscalefph))]
10264#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10265pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10266    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10267}
10268
10269/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10270/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10271///
10272/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10273#[inline]
10274#[target_feature(enable = "avx512fp16,avx512vl")]
10275#[cfg_attr(test, assert_instr(vscalefph))]
10276#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10277pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10278    unsafe { vscalefph_128(a, b, src, k) }
10279}
10280
10281/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10282/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10283///
10284/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
10285#[inline]
10286#[target_feature(enable = "avx512fp16,avx512vl")]
10287#[cfg_attr(test, assert_instr(vscalefph))]
10288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10289pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10290    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10291}
10292
10293/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10294/// the results in dst.
10295///
10296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10297#[inline]
10298#[target_feature(enable = "avx512fp16,avx512vl")]
10299#[cfg_attr(test, assert_instr(vscalefph))]
10300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10301pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10302    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10303}
10304
10305/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10306/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10307///
10308/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10309#[inline]
10310#[target_feature(enable = "avx512fp16,avx512vl")]
10311#[cfg_attr(test, assert_instr(vscalefph))]
10312#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10313pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10314    unsafe { vscalefph_256(a, b, src, k) }
10315}
10316
10317/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10318/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10319///
10320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10321#[inline]
10322#[target_feature(enable = "avx512fp16,avx512vl")]
10323#[cfg_attr(test, assert_instr(vscalefph))]
10324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10325pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10326    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10327}
10328
10329/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10330/// the results in dst.
10331///
10332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10333#[inline]
10334#[target_feature(enable = "avx512fp16")]
10335#[cfg_attr(test, assert_instr(vscalefph))]
10336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10337pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10338    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10339}
10340
10341/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10342/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10343///
10344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10345#[inline]
10346#[target_feature(enable = "avx512fp16")]
10347#[cfg_attr(test, assert_instr(vscalefph))]
10348#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10349pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10350    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10351}
10352
10353/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10354/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10355///
10356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10357#[inline]
10358#[target_feature(enable = "avx512fp16")]
10359#[cfg_attr(test, assert_instr(vscalefph))]
10360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10361pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10362    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10363}
10364
10365/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10366/// the results in dst.
10367///
10368/// Rounding is done according to the rounding parameter, which can be one of:
10369///
10370/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10371/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10372/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10373/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10374/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10375///
10376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
10377#[inline]
10378#[target_feature(enable = "avx512fp16")]
10379#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10380#[rustc_legacy_const_generics(2)]
10381#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10382pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10383    static_assert_rounding!(ROUNDING);
10384    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10385}
10386
10387/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10388/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10389///
10390/// Rounding is done according to the rounding parameter, which can be one of:
10391///
10392/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10393/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10394/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10395/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10396/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10397///
10398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10399#[inline]
10400#[target_feature(enable = "avx512fp16")]
10401#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10402#[rustc_legacy_const_generics(4)]
10403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10404pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10405    src: __m512h,
10406    k: __mmask32,
10407    a: __m512h,
10408    b: __m512h,
10409) -> __m512h {
10410    unsafe {
10411        static_assert_rounding!(ROUNDING);
10412        vscalefph_512(a, b, src, k, ROUNDING)
10413    }
10414}
10415
10416/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10417/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10418///
10419/// Rounding is done according to the rounding parameter, which can be one of:
10420///
10421/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10422/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10423/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10424/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10425/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10426///
10427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10428#[inline]
10429#[target_feature(enable = "avx512fp16")]
10430#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10431#[rustc_legacy_const_generics(3)]
10432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10433pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10434    k: __mmask32,
10435    a: __m512h,
10436    b: __m512h,
10437) -> __m512h {
10438    static_assert_rounding!(ROUNDING);
10439    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10440}
10441
10442/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10443/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10444/// elements of dst.
10445///
10446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
10447#[inline]
10448#[target_feature(enable = "avx512fp16")]
10449#[cfg_attr(test, assert_instr(vscalefsh))]
10450#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10451pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10452    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
10453}
10454
10455/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10456/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10457/// and copy the upper 7 packed elements from a to the upper elements of dst.
10458///
10459/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10460#[inline]
10461#[target_feature(enable = "avx512fp16")]
10462#[cfg_attr(test, assert_instr(vscalefsh))]
10463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10464pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10465    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10466}
10467
10468/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10469/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10470/// and copy the upper 7 packed elements from a to the upper elements of dst.
10471///
10472/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10473#[inline]
10474#[target_feature(enable = "avx512fp16")]
10475#[cfg_attr(test, assert_instr(vscalefsh))]
10476#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10477pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10478    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
10479}
10480
10481/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10482/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10483/// elements of dst.
10484///
10485/// Rounding is done according to the rounding parameter, which can be one of:
10486///
10487/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10488/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10489/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10490/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10491/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10492///
10493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10494#[inline]
10495#[target_feature(enable = "avx512fp16")]
10496#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10497#[rustc_legacy_const_generics(2)]
10498#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10499pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10500    static_assert_rounding!(ROUNDING);
10501    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10502}
10503
10504/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10505/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10506/// and copy the upper 7 packed elements from a to the upper elements of dst.
10507///
10508/// Rounding is done according to the rounding parameter, which can be one of:
10509///
10510/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10511/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10512/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10513/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10514/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10515///
10516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10517#[inline]
10518#[target_feature(enable = "avx512fp16")]
10519#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10520#[rustc_legacy_const_generics(4)]
10521#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10522pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10523    src: __m128h,
10524    k: __mmask8,
10525    a: __m128h,
10526    b: __m128h,
10527) -> __m128h {
10528    unsafe {
10529        static_assert_rounding!(ROUNDING);
10530        vscalefsh(a, b, src, k, ROUNDING)
10531    }
10532}
10533
10534/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store
10535/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10536/// and copy the upper 7 packed elements from a to the upper elements of dst.
10537///
10538/// Rounding is done according to the rounding parameter, which can be one of:
10539///
10540/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10541/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10542/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10543/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10544/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10545///
10546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10547#[inline]
10548#[target_feature(enable = "avx512fp16")]
10549#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10550#[rustc_legacy_const_generics(3)]
10551#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10552pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10553    k: __mmask8,
10554    a: __m128h,
10555    b: __m128h,
10556) -> __m128h {
10557    static_assert_rounding!(ROUNDING);
10558    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
10559}
10560
10561/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10562/// number of bits specified by imm8, and store the results in dst.
10563///
10564/// Rounding is done according to the imm8 parameter, which can be one of:
10565///
10566/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10567/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10568/// * [`_MM_FROUND_TO_POS_INF`] : round up
10569/// * [`_MM_FROUND_TO_ZERO`] : truncate
10570/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10571///
10572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
10573#[inline]
10574#[target_feature(enable = "avx512fp16,avx512vl")]
10575#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10576#[rustc_legacy_const_generics(1)]
10577#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10578pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10579    static_assert_uimm_bits!(IMM8, 8);
10580    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10581}
10582
10583/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10584/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10585/// from src when the corresponding mask bit is not set).
10586///
10587/// Rounding is done according to the imm8 parameter, which can be one of:
10588///
10589/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10590/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10591/// * [`_MM_FROUND_TO_POS_INF`] : round up
10592/// * [`_MM_FROUND_TO_ZERO`] : truncate
10593/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10594///
10595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10596#[inline]
10597#[target_feature(enable = "avx512fp16,avx512vl")]
10598#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10599#[rustc_legacy_const_generics(3)]
10600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10601pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10602    unsafe {
10603        static_assert_uimm_bits!(IMM8, 8);
10604        vreduceph_128(a, IMM8, src, k)
10605    }
10606}
10607
10608/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10609/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10610/// out when the corresponding mask bit is not set).
10611///
10612/// Rounding is done according to the imm8 parameter, which can be one of:
10613///
10614/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10615/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10616/// * [`_MM_FROUND_TO_POS_INF`] : round up
10617/// * [`_MM_FROUND_TO_ZERO`] : truncate
10618/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10619///
10620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10621#[inline]
10622#[target_feature(enable = "avx512fp16,avx512vl")]
10623#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10624#[rustc_legacy_const_generics(2)]
10625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10626pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10627    static_assert_uimm_bits!(IMM8, 8);
10628    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10629}
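
// Editorial note: an illustrative sketch, not upstream code. `vreduceph` returns
// the "reduced argument" `a - round(a * 2^M) / 2^M`, where M comes from the upper
// four bits of IMM8 and the rounding behaviour from its low four bits. With
// IMM8 = 0 (zero fraction bits, round to nearest) each lane keeps only its
// fractional part, e.g. 1.25 -> 0.25.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_reduce_ph_fraction(a: __m128h) -> __m128h {
    // Keep only the four even lanes; odd lanes are zeroed by the mask.
    _mm_maskz_reduce_ph::<0>(0b0101_0101, a)
}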
10630
10631/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10632/// number of bits specified by imm8, and store the results in dst.
10633///
10634/// Rounding is done according to the imm8 parameter, which can be one of:
10635///
10636/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10637/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10638/// * [`_MM_FROUND_TO_POS_INF`] : round up
10639/// * [`_MM_FROUND_TO_ZERO`] : truncate
10640/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10641///
10642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10643#[inline]
10644#[target_feature(enable = "avx512fp16,avx512vl")]
10645#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10646#[rustc_legacy_const_generics(1)]
10647#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10648pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10649    static_assert_uimm_bits!(IMM8, 8);
10650    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10651}
10652
10653/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10654/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10655/// from src when the corresponding mask bit is not set).
10656///
10657/// Rounding is done according to the imm8 parameter, which can be one of:
10658///
10659/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10660/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10661/// * [`_MM_FROUND_TO_POS_INF`] : round up
10662/// * [`_MM_FROUND_TO_ZERO`] : truncate
10663/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10664///
10665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10666#[inline]
10667#[target_feature(enable = "avx512fp16,avx512vl")]
10668#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10669#[rustc_legacy_const_generics(3)]
10670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10671pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10672    unsafe {
10673        static_assert_uimm_bits!(IMM8, 8);
10674        vreduceph_256(a, IMM8, src, k)
10675    }
10676}
10677
10678/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10679/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10680/// out when the corresponding mask bit is not set).
10681///
10682/// Rounding is done according to the imm8 parameter, which can be one of:
10683///
10684/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10685/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10686/// * [`_MM_FROUND_TO_POS_INF`] : round up
10687/// * [`_MM_FROUND_TO_ZERO`] : truncate
10688/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10689///
10690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10691#[inline]
10692#[target_feature(enable = "avx512fp16,avx512vl")]
10693#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10694#[rustc_legacy_const_generics(2)]
10695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10696pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10697    static_assert_uimm_bits!(IMM8, 8);
10698    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10699}
10700
10701/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10702/// number of bits specified by imm8, and store the results in dst.
10703///
10704/// Rounding is done according to the imm8 parameter, which can be one of:
10705///
10706/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10707/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10708/// * [`_MM_FROUND_TO_POS_INF`] : round up
10709/// * [`_MM_FROUND_TO_ZERO`] : truncate
10710/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10711///
10712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10713#[inline]
10714#[target_feature(enable = "avx512fp16")]
10715#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10716#[rustc_legacy_const_generics(1)]
10717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10718pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10719    static_assert_uimm_bits!(IMM8, 8);
10720    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10721}
10722
10723/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10724/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10725/// from src when the corresponding mask bit is not set).
10726///
10727/// Rounding is done according to the imm8 parameter, which can be one of:
10728///
10729/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10730/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10731/// * [`_MM_FROUND_TO_POS_INF`] : round up
10732/// * [`_MM_FROUND_TO_ZERO`] : truncate
10733/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10734///
10735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10736#[inline]
10737#[target_feature(enable = "avx512fp16")]
10738#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10739#[rustc_legacy_const_generics(3)]
10740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10741pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
10742    static_assert_uimm_bits!(IMM8, 8);
10743    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10744}
10745
10746/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10747/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10748/// out when the corresponding mask bit is not set).
10749///
10750/// Rounding is done according to the imm8 parameter, which can be one of:
10751///
10752/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10753/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10754/// * [`_MM_FROUND_TO_POS_INF`] : round up
10755/// * [`_MM_FROUND_TO_ZERO`] : truncate
10756/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10757///
10758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10759#[inline]
10760#[target_feature(enable = "avx512fp16")]
10761#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10762#[rustc_legacy_const_generics(2)]
10763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10764pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10765    static_assert_uimm_bits!(IMM8, 8);
10766    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10767}
10768
10769/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10770/// number of bits specified by imm8, and store the results in dst.
10771///
10772/// Rounding is done according to the imm8 parameter, which can be one of:
10773///
10774/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10775/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10776/// * [`_MM_FROUND_TO_POS_INF`] : round up
10777/// * [`_MM_FROUND_TO_ZERO`] : truncate
10778/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10779///
10780/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10781///
10782/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
10783#[inline]
10784#[target_feature(enable = "avx512fp16")]
10785#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10786#[rustc_legacy_const_generics(1, 2)]
10787#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10788pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10789    static_assert_uimm_bits!(IMM8, 8);
10790    static_assert_sae!(SAE);
10791    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10792}
10793
10794/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10795/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10796/// from src when the corresponding mask bit is not set).
10797///
10798/// Rounding is done according to the imm8 parameter, which can be one of:
10799///
10800/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10801/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10802/// * [`_MM_FROUND_TO_POS_INF`] : round up
10803/// * [`_MM_FROUND_TO_ZERO`] : truncate
10804/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10805///
10806/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10807///
10808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10809#[inline]
10810#[target_feature(enable = "avx512fp16")]
10811#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10812#[rustc_legacy_const_generics(3, 4)]
10813#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10814pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10815    src: __m512h,
10816    k: __mmask32,
10817    a: __m512h,
10818) -> __m512h {
10819    unsafe {
10820        static_assert_uimm_bits!(IMM8, 8);
10821        static_assert_sae!(SAE);
10822        vreduceph_512(a, IMM8, src, k, SAE)
10823    }
10824}
10825
10826/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10827/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10828/// out when the corresponding mask bit is not set).
10829///
10830/// Rounding is done according to the imm8 parameter, which can be one of:
10831///
10832/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10833/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10834/// * [`_MM_FROUND_TO_POS_INF`] : round up
10835/// * [`_MM_FROUND_TO_ZERO`] : truncate
10836/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10837///
10838/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10839///
10840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10841#[inline]
10842#[target_feature(enable = "avx512fp16")]
10843#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10844#[rustc_legacy_const_generics(2, 3)]
10845#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10846pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10847    k: __mmask32,
10848    a: __m512h,
10849) -> __m512h {
10850    static_assert_uimm_bits!(IMM8, 8);
10851    static_assert_sae!(SAE);
10852    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10853}
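
// Editorial note: an illustrative sketch, not upstream code, showing how the two
// const generics of the `_round` variants are supplied: IMM8 selects the reduction
// parameters, while SAE is either `_MM_FROUND_NO_EXC` (suppress exceptions) or
// `_MM_FROUND_CUR_DIRECTION`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_reduce_round_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    _mm512_mask_reduce_round_ph::<0, { _MM_FROUND_NO_EXC }>(src, k, a)
}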
10854
10855/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10856/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10857/// upper 7 packed elements from a to the upper elements of dst.
10858///
10859/// Rounding is done according to the imm8 parameter, which can be one of:
10860///
10861/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10862/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10863/// * [`_MM_FROUND_TO_POS_INF`] : round up
10864/// * [`_MM_FROUND_TO_ZERO`] : truncate
10865/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10866///
10867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10868#[inline]
10869#[target_feature(enable = "avx512fp16")]
10870#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10871#[rustc_legacy_const_generics(2)]
10872#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10873pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10874    static_assert_uimm_bits!(IMM8, 8);
10875    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10876}
10877
10878/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10879/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10880/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10881/// a to the upper elements of dst.
10882///
10883/// Rounding is done according to the imm8 parameter, which can be one of:
10884///
10885/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10886/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10887/// * [`_MM_FROUND_TO_POS_INF`] : round up
10888/// * [`_MM_FROUND_TO_ZERO`] : truncate
10889/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10890///
10891/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10892#[inline]
10893#[target_feature(enable = "avx512fp16")]
10894#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10895#[rustc_legacy_const_generics(4)]
10896#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10897pub fn _mm_mask_reduce_sh<const IMM8: i32>(
10898    src: __m128h,
10899    k: __mmask8,
10900    a: __m128h,
10901    b: __m128h,
10902) -> __m128h {
10903    static_assert_uimm_bits!(IMM8, 8);
10904    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10905}
10906
10907/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10908/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10909/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10910/// to the upper elements of dst.
10911///
10912/// Rounding is done according to the imm8 parameter, which can be one of:
10913///
10914/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10915/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10916/// * [`_MM_FROUND_TO_POS_INF`] : round up
10917/// * [`_MM_FROUND_TO_ZERO`] : truncate
10918/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10919///
10920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
10921#[inline]
10922#[target_feature(enable = "avx512fp16")]
10923#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10924#[rustc_legacy_const_generics(3)]
10925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10926pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10927    static_assert_uimm_bits!(IMM8, 8);
10928    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
10929}
10930
10931/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10932/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
10933/// 7 packed elements from a to the upper elements of dst.
10934///
10935/// Rounding is done according to the imm8 parameter, which can be one of:
10936///
10937/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10938/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10939/// * [`_MM_FROUND_TO_POS_INF`] : round up
10940/// * [`_MM_FROUND_TO_ZERO`] : truncate
10941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10942///
10943/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10944///
10945/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
10946#[inline]
10947#[target_feature(enable = "avx512fp16")]
10948#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10949#[rustc_legacy_const_generics(2, 3)]
10950#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10951pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10952    static_assert_uimm_bits!(IMM8, 8);
10953    static_assert_sae!(SAE);
10954    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10955}
10956
10957/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10958/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10959/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
10960/// to the upper elements of dst.
10961///
10962/// Rounding is done according to the imm8 parameter, which can be one of:
10963///
10964/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10965/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10966/// * [`_MM_FROUND_TO_POS_INF`] : round up
10967/// * [`_MM_FROUND_TO_ZERO`] : truncate
10968/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10969///
10970/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10971///
10972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
10973#[inline]
10974#[target_feature(enable = "avx512fp16")]
10975#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10976#[rustc_legacy_const_generics(4, 5)]
10977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10978pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10979    src: __m128h,
10980    k: __mmask8,
10981    a: __m128h,
10982    b: __m128h,
10983) -> __m128h {
10984    unsafe {
10985        static_assert_uimm_bits!(IMM8, 8);
10986        static_assert_sae!(SAE);
10987        vreducesh(a, b, src, k, IMM8, SAE)
10988    }
10989}
10990
10991/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10992/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10993/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10994/// to the upper elements of dst.
10995///
10996/// Rounding is done according to the imm8 parameter, which can be one of:
10997///
10998/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10999/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11000/// * [`_MM_FROUND_TO_POS_INF`] : round up
11001/// * [`_MM_FROUND_TO_ZERO`] : truncate
11002/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11003///
11004/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11005///
11006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
11007#[inline]
11008#[target_feature(enable = "avx512fp16")]
11009#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11010#[rustc_legacy_const_generics(3, 4)]
11011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11012pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
11013    k: __mmask8,
11014    a: __m128h,
11015    b: __m128h,
11016) -> __m128h {
11017    static_assert_uimm_bits!(IMM8, 8);
11018    static_assert_sae!(SAE);
11019    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
11020}
11021
11022/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11023/// sum of all elements in a.
11024///
11025/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
11026#[inline]
11027#[target_feature(enable = "avx512fp16,avx512vl")]
11028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11029pub fn _mm_reduce_add_ph(a: __m128h) -> f16 {
11030    unsafe {
11031        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11032        let a = _mm_add_ph(a, b);
11033        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11034        let a = _mm_add_ph(a, b);
11035        simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1)
11036    }
11037}
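
// Editorial note: an illustrative sketch, not upstream code. The horizontal add
// above folds the vector in halves (8 -> 4 -> 2 lanes) and finishes with a scalar
// add, so the whole reduction costs three adds; note that this association order
// differs from a strict left-to-right sum, which can change rounding.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_reduce_add_ph() -> f16 {
    // 1 + 2 + ... + 8 = 36.0
    let v = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
    _mm_reduce_add_ph(v)
}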
11038
11039/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11040/// sum of all elements in a.
11041///
11042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
11043#[inline]
11044#[target_feature(enable = "avx512fp16,avx512vl")]
11045#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11046pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
11047    unsafe {
11048        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11049        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11050        _mm_reduce_add_ph(_mm_add_ph(p, q))
11051    }
11052}
11053
11054/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11055/// sum of all elements in a.
11056///
11057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
11058#[inline]
11059#[target_feature(enable = "avx512fp16")]
11060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11061pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
11062    unsafe {
11063        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11064        let q = simd_shuffle!(
11065            a,
11066            a,
11067            [
11068                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11069            ]
11070        );
11071        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
11072    }
11073}
11074
11075/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11076/// the product of all elements in a.
11077///
11078/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
11079#[inline]
11080#[target_feature(enable = "avx512fp16,avx512vl")]
11081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11082pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
11083    unsafe {
11084        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11085        let a = _mm_mul_ph(a, b);
11086        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11087        let a = _mm_mul_ph(a, b);
11088        simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1)
11089    }
11090}
11091
11092/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11093/// the product of all elements in a.
11094///
11095/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
11096#[inline]
11097#[target_feature(enable = "avx512fp16,avx512vl")]
11098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11099pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
11100    unsafe {
11101        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11102        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11103        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
11104    }
11105}
11106
11107/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11108/// the product of all elements in a.
11109///
11110/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
11111#[inline]
11112#[target_feature(enable = "avx512fp16")]
11113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11114pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
11115    unsafe {
11116        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11117        let q = simd_shuffle!(
11118            a,
11119            a,
11120            [
11121                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11122            ]
11123        );
11124        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
11125    }
11126}
11127
11128/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11129/// minimum of all elements in a.
11130///
11131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
11132#[inline]
11133#[target_feature(enable = "avx512fp16,avx512vl")]
11134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11135pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
11136    unsafe {
11137        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11138        let a = _mm_min_ph(a, b);
11139        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11140        let a = _mm_min_ph(a, b);
11141        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11142        simd_extract!(_mm_min_sh(a, b), 0)
11143    }
11144}
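
// Editorial note: an illustrative sketch, not upstream code. Unlike the add/mul
// reductions, the final step above goes through `_mm_min_sh` rather than a plain
// Rust comparison, presumably so the last step keeps the same NaN and signed-zero
// semantics as the packed `vminph` steps.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_reduce_min_ph() -> f16 {
    let v = _mm_set_ph(4.0, -1.5, 9.0, 0.5, 2.0, 7.0, 3.0, 6.0);
    // Smallest of the eight lanes: -1.5
    _mm_reduce_min_ph(v)
}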
11145
11146/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11147/// minimum of all elements in a.
11148///
11149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
11150#[inline]
11151#[target_feature(enable = "avx512fp16,avx512vl")]
11152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11153pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
11154    unsafe {
11155        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11156        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11157        _mm_reduce_min_ph(_mm_min_ph(p, q))
11158    }
11159}
11160
11161/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11162/// minimum of all elements in a.
11163///
11164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
11165#[inline]
11166#[target_feature(enable = "avx512fp16")]
11167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11168pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
11169    unsafe {
11170        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11171        let q = simd_shuffle!(
11172            a,
11173            a,
11174            [
11175                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11176            ]
11177        );
11178        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
11179    }
11180}
11181
11182/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11183/// maximum of all elements in a.
11184///
11185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
11186#[inline]
11187#[target_feature(enable = "avx512fp16,avx512vl")]
11188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11189pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
11190    unsafe {
11191        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11192        let a = _mm_max_ph(a, b);
11193        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11194        let a = _mm_max_ph(a, b);
11195        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11196        simd_extract!(_mm_max_sh(a, b), 0)
11197    }
11198}
11199
11200/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11201/// maximum of all elements in a.
11202///
11203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
11204#[inline]
11205#[target_feature(enable = "avx512fp16,avx512vl")]
11206#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11207pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
11208    unsafe {
11209        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11210        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11211        _mm_reduce_max_ph(_mm_max_ph(p, q))
11212    }
11213}
11214
11215/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11216/// maximum of all elements in a.
11217///
11218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
11219#[inline]
11220#[target_feature(enable = "avx512fp16")]
11221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11222pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11223    unsafe {
11224        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11225        let q = simd_shuffle!(
11226            a,
11227            a,
11228            [
11229                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11230            ]
11231        );
11232        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11233    }
11234}
11235
11236macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
11237    ($mask_type: ty, $reg: ident, $a: expr) => {{
11238        let dst: $mask_type;
11239        asm!(
11240            "vfpclassph {k}, {src}, {imm8}",
11241            k = lateout(kreg) dst,
11242            src = in($reg) $a,
11243            imm8 = const IMM8,
11244            options(pure, nomem, nostack)
11245        );
11246        dst
11247    }};
11248    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11249        let dst: $mask_type;
11250        asm!(
11251            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11252            k = lateout(kreg) dst,
11253            mask = in(kreg) $mask,
11254            src = in($reg) $a,
11255            imm8 = const IMM8,
11256            options(pure, nomem, nostack)
11257        );
11258        dst
11259    }};
11260}
11261
11262/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11263/// by imm8, and store the results in mask vector k.
11264/// imm can be a combination of:
11265///
11266///     0x01 // QNaN
11267///     0x02 // Positive Zero
11268///     0x04 // Negative Zero
11269///     0x08 // Positive Infinity
11270///     0x10 // Negative Infinity
11271///     0x20 // Denormal
11272///     0x40 // Negative
11273///     0x80 // SNaN
11274///
11275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
11276#[inline]
11277#[target_feature(enable = "avx512fp16,avx512vl")]
11278#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11279#[rustc_legacy_const_generics(1)]
11280#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11281pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11282    unsafe {
11283        static_assert_uimm_bits!(IMM8, 8);
11284        fpclass_asm!(__mmask8, xmm_reg, a)
11285    }
11286}
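
// Editorial note: an illustrative sketch, not upstream code. The category bits
// listed above can be OR-ed together; 0x01 | 0x80 selects both quiet and
// signaling NaNs, so the returned mask has a bit set for every NaN lane.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_nan_mask(a: __m128h) -> __mmask8 {
    _mm_fpclass_ph_mask::<{ 0x01 | 0x80 }>(a)
}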
11287
11288/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11289/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11290/// corresponding mask bit is not set).
11291/// imm can be a combination of:
11292///
11293///     0x01 // QNaN
11294///     0x02 // Positive Zero
11295///     0x04 // Negative Zero
11296///     0x08 // Positive Infinity
11297///     0x10 // Negative Infinity
11298///     0x20 // Denormal
11299///     0x40 // Negative
11300///     0x80 // SNaN
11301///
11302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11303#[inline]
11304#[target_feature(enable = "avx512fp16,avx512vl")]
11305#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11306#[rustc_legacy_const_generics(2)]
11307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11308pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11309    unsafe {
11310        static_assert_uimm_bits!(IMM8, 8);
11311        fpclass_asm!(__mmask8, k1, xmm_reg, a)
11312    }
11313}
11314
11315/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11316/// by imm8, and store the results in mask vector k.
11317/// imm can be a combination of:
11318///
11319///     0x01 // QNaN
11320///     0x02 // Positive Zero
11321///     0x04 // Negative Zero
11322///     0x08 // Positive Infinity
11323///     0x10 // Negative Infinity
11324///     0x20 // Denormal
11325///     0x40 // Negative
11326///     0x80 // SNaN
11327///
11328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11329#[inline]
11330#[target_feature(enable = "avx512fp16,avx512vl")]
11331#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11332#[rustc_legacy_const_generics(1)]
11333#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11334pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11335    unsafe {
11336        static_assert_uimm_bits!(IMM8, 8);
11337        fpclass_asm!(__mmask16, ymm_reg, a)
11338    }
11339}
11340
11341/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11342/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11343/// corresponding mask bit is not set).
11344/// imm can be a combination of:
11345///
11346///     0x01 // QNaN
11347///     0x02 // Positive Zero
11348///     0x04 // Negative Zero
11349///     0x08 // Positive Infinity
11350///     0x10 // Negative Infinity
11351///     0x20 // Denormal
11352///     0x40 // Negative
11353///     0x80 // SNaN
11354///
11355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11356#[inline]
11357#[target_feature(enable = "avx512fp16,avx512vl")]
11358#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11359#[rustc_legacy_const_generics(2)]
11360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11361pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11362    unsafe {
11363        static_assert_uimm_bits!(IMM8, 8);
11364        fpclass_asm!(__mmask16, k1, ymm_reg, a)
11365    }
11366}
11367
11368/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11369/// by imm8, and store the results in mask vector k.
11370/// imm can be a combination of:
11371///
11372///     0x01 // QNaN
11373///     0x02 // Positive Zero
11374///     0x04 // Negative Zero
11375///     0x08 // Positive Infinity
11376///     0x10 // Negative Infinity
11377///     0x20 // Denormal
11378///     0x40 // Negative
11379///     0x80 // SNaN
11380///
11381/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11382#[inline]
11383#[target_feature(enable = "avx512fp16")]
11384#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11385#[rustc_legacy_const_generics(1)]
11386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11387pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11388    unsafe {
11389        static_assert_uimm_bits!(IMM8, 8);
11390        fpclass_asm!(__mmask32, zmm_reg, a)
11391    }
11392}
11393
11394/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11395/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11396/// corresponding mask bit is not set).
11397/// imm can be a combination of:
11398///
11399///     0x01 // QNaN
11400///     0x02 // Positive Zero
11401///     0x04 // Negative Zero
11402///     0x08 // Positive Infinity
11403///     0x10 // Negative Infinity
11404///     0x20 // Denormal
11405///     0x40 // Negative
11406///     0x80 // SNaN
11407///
11408/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11409#[inline]
11410#[target_feature(enable = "avx512fp16")]
11411#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11412#[rustc_legacy_const_generics(2)]
11413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11414pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11415    unsafe {
11416        static_assert_uimm_bits!(IMM8, 8);
11417        fpclass_asm!(__mmask32, k1, zmm_reg, a)
11418    }
11419}
11420
11421/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11422/// by imm8, and store the result in mask vector k.
11423/// imm can be a combination of:
11424///
11425///     0x01 // QNaN
11426///     0x02 // Positive Zero
11427///     0x04 // Negative Zero
11428///     0x08 // Positive Infinity
11429///     0x10 // Negative Infinity
11430///     0x20 // Denormal
11431///     0x40 // Negative
11432///     0x80 // SNaN
11433///
11434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11435#[inline]
11436#[target_feature(enable = "avx512fp16")]
11437#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11438#[rustc_legacy_const_generics(1)]
11439#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11440pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11441    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11442}
11443
11444/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11445/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
11446/// corresponding mask bit is not set).
11447/// imm can be a combination of:
11448///
11449///     0x01 // QNaN
11450///     0x02 // Positive Zero
11451///     0x04 // Negative Zero
11452///     0x08 // Positive Infinity
11453///     0x10 // Negative Infinity
11454///     0x20 // Denormal
11455///     0x40 // Negative
11456///     0x80 // SNaN
11457///
11458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11459#[inline]
11460#[target_feature(enable = "avx512fp16")]
11461#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11462#[rustc_legacy_const_generics(2)]
11463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11464pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11465    unsafe {
11466        static_assert_uimm_bits!(IMM8, 8);
11467        vfpclasssh(a, IMM8, k1)
11468    }
11469}
11470
11471/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11472/// and store the results in dst.
11473///
11474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
11475#[inline]
11476#[target_feature(enable = "avx512fp16,avx512vl")]
11477#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11478pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11479    unsafe { simd_select_bitmask(k, b, a) }
11480}
11481
11482/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11483/// and store the results in dst.
11484///
11485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11486#[inline]
11487#[target_feature(enable = "avx512fp16,avx512vl")]
11488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11489pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11490    unsafe { simd_select_bitmask(k, b, a) }
11491}
11492
11493/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11494/// and store the results in dst.
11495///
11496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11497#[inline]
11498#[target_feature(enable = "avx512fp16")]
11499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11500pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11501    unsafe { simd_select_bitmask(k, b, a) }
11502}
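
// Editorial note: an illustrative sketch, not upstream code. For each lane the
// blend takes the element from `b` when the corresponding mask bit is set and
// from `a` otherwise; it is the same per-lane select that the masked intrinsics
// in this file perform via `simd_select_bitmask`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_blend(a: __m128h, b: __m128h) -> __m128h {
    // Lanes 0 and 1 come from `b`, lanes 2..=7 from `a`.
    _mm_mask_blend_ph(0b0000_0011, a, b)
}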
11503
11504/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11505/// and index in idx, and store the results in dst.
11506///
11507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
11508#[inline]
11509#[target_feature(enable = "avx512fp16,avx512vl")]
11510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11511pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
11512    _mm_castsi128_ph(_mm_permutex2var_epi16(
11513        _mm_castph_si128(a),
11514        idx,
11515        _mm_castph_si128(b),
11516    ))
11517}
11518
11519/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11520/// and index in idx, and store the results in dst.
11521///
11522/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11523#[inline]
11524#[target_feature(enable = "avx512fp16,avx512vl")]
11525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11526pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11527    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11528        _mm256_castph_si256(a),
11529        idx,
11530        _mm256_castph_si256(b),
11531    ))
11532}
11533
11534/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11535/// and index in idx, and store the results in dst.
11536///
11537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11538#[inline]
11539#[target_feature(enable = "avx512fp16")]
11540#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11541pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11542    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11543        _mm512_castph_si512(a),
11544        idx,
11545        _mm512_castph_si512(b),
11546    ))
11547}
11548
11549/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11550/// and store the results in dst.
11551///
11552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
11553#[inline]
11554#[target_feature(enable = "avx512fp16,avx512vl")]
11555#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11556pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11557    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11558}
11559
11560/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11561/// and store the results in dst.
11562///
11563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11564#[inline]
11565#[target_feature(enable = "avx512fp16,avx512vl")]
11566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11567pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11568    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11569}
11570
11571/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11572/// and store the results in dst.
11573///
11574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11575#[inline]
11576#[target_feature(enable = "avx512fp16")]
11577#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11578pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11579    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11580}
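
// Editorial note: an illustrative sketch, not upstream code. Each output lane i
// takes a[idx[i]], so an index vector of 7, 6, ..., 0 reverses the lane order.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_reverse_lanes(a: __m128h) -> __m128h {
    // idx[0] = 7, idx[1] = 6, ..., idx[7] = 0
    let idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    _mm_permutexvar_ph(idx, a)
}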
11581
11582/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11583/// and store the results in dst.
11584///
11585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11586#[inline]
11587#[target_feature(enable = "avx512fp16,avx512vl")]
11588#[cfg_attr(test, assert_instr(vcvtw2ph))]
11589#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11590pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11591    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
11592}
11593
11594/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11595/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11596/// mask bit is not set).
11597///
11598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11599#[inline]
11600#[target_feature(enable = "avx512fp16,avx512vl")]
11601#[cfg_attr(test, assert_instr(vcvtw2ph))]
11602#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11603pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11604    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
11605}
11606
11607/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11608/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11609///
11610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11611#[inline]
11612#[target_feature(enable = "avx512fp16,avx512vl")]
11613#[cfg_attr(test, assert_instr(vcvtw2ph))]
11614#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11615pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11616    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11617}
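
// Editorial note: an illustrative sketch, not upstream code. Conversion from i16
// is exact for magnitudes up to 2^11 = 2048, because f16 carries an 11-bit
// significand; larger magnitudes are rounded.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvtepi16(a: __m128i, k: __mmask8) -> __m128h {
    // Lanes whose mask bit is clear are zeroed by the maskz variant.
    _mm_maskz_cvtepi16_ph(k, a)
}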
11618
11619/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11620/// and store the results in dst.
11621///
11622/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11623#[inline]
11624#[target_feature(enable = "avx512fp16,avx512vl")]
11625#[cfg_attr(test, assert_instr(vcvtw2ph))]
11626#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11627pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11628    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
11629}
11630
11631/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11632/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11633/// mask bit is not set).
11634///
11635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11636#[inline]
11637#[target_feature(enable = "avx512fp16,avx512vl")]
11638#[cfg_attr(test, assert_instr(vcvtw2ph))]
11639#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11640pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11641    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
11642}
11643
11644/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11645/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11646///
11647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11648#[inline]
11649#[target_feature(enable = "avx512fp16,avx512vl")]
11650#[cfg_attr(test, assert_instr(vcvtw2ph))]
11651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11652pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11653    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11654}
11655
11656/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11657/// and store the results in dst.
11658///
11659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11660#[inline]
11661#[target_feature(enable = "avx512fp16")]
11662#[cfg_attr(test, assert_instr(vcvtw2ph))]
11663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11664pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11665    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
11666}
11667
11668/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11669/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11670/// mask bit is not set).
11671///
11672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11673#[inline]
11674#[target_feature(enable = "avx512fp16")]
11675#[cfg_attr(test, assert_instr(vcvtw2ph))]
11676#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11677pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11678    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
11679}
11680
11681/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11682/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11683///
11684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11685#[inline]
11686#[target_feature(enable = "avx512fp16")]
11687#[cfg_attr(test, assert_instr(vcvtw2ph))]
11688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11689pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11690    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11691}
11692
11693/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11694/// and store the results in dst.
11695///
11696/// Rounding is done according to the rounding parameter, which can be one of:
11697///
11698/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11699/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11700/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11701/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11702/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11703///
11704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
11705#[inline]
11706#[target_feature(enable = "avx512fp16")]
11707#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11708#[rustc_legacy_const_generics(1)]
11709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11710pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11711    unsafe {
11712        static_assert_rounding!(ROUNDING);
11713        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11714    }
11715}
11716
11717/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11718/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11719/// mask bit is not set).
11720///
11721/// Rounding is done according to the rounding parameter, which can be one of:
11722///
11723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11728///
11729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11730#[inline]
11731#[target_feature(enable = "avx512fp16")]
11732#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11733#[rustc_legacy_const_generics(3)]
11734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11735pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11736    src: __m512h,
11737    k: __mmask32,
11738    a: __m512i,
11739) -> __m512h {
11740    unsafe {
11741        static_assert_rounding!(ROUNDING);
11742        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11743    }
11744}
11745
11746/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11747/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11748///
11749/// Rounding is done according to the rounding parameter, which can be one of:
11750///
11751/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11752/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11753/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11754/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11755/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11756///
11757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11758#[inline]
11759#[target_feature(enable = "avx512fp16")]
11760#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11761#[rustc_legacy_const_generics(2)]
11762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11763pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11764    static_assert_rounding!(ROUNDING);
11765    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11766}
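
// Illustrative sketch of the difference between the write-masked and zero-masked forms
// above (assumes `avx512fp16`; the helper and local names are hypothetical).
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_masked_cvt_roundepi16_ph(
    src: __m512h,
    k: __mmask32,
    words: __m512i,
) -> (__m512h, __m512h) {
    // Lanes whose mask bit is clear keep the corresponding value from `src`...
    let merged =
        _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, k, words,
        );
    // ...while the zero-masked form writes 0.0 into those lanes instead.
    let zeroed =
        _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            k, words,
        );
    (merged, zeroed)
}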
11767
11768/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11769/// and store the results in dst.
11770///
11771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
11772#[inline]
11773#[target_feature(enable = "avx512fp16,avx512vl")]
11774#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11775#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11776pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11777    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
11778}
11779
11780/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11781/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11782/// mask bit is not set).
11783///
11784/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
11785#[inline]
11786#[target_feature(enable = "avx512fp16,avx512vl")]
11787#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11788#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11789pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11790    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
11791}
11792
11793/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11794/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11795///
11796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
11797#[inline]
11798#[target_feature(enable = "avx512fp16,avx512vl")]
11799#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11801pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11802    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11803}
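
// A small sketch of the 128-bit unsigned conversion with a writemask (assumes both
// `avx512fp16` and `avx512vl`; the helper name is hypothetical).
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_mask_cvtepu16_ph(fallback: __m128h, a: __m128i) -> __m128h {
    // Convert only the even-indexed u16 lanes (mask 0b0101_0101); the odd lanes keep
    // the f16 values already present in `fallback`.
    _mm_mask_cvtepu16_ph(fallback, 0b0101_0101, a)
}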
11804
11805/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11806/// and store the results in dst.
11807///
11808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11809#[inline]
11810#[target_feature(enable = "avx512fp16,avx512vl")]
11811#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11813pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11814    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
11815}
11816
11817/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11818/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11819/// mask bit is not set).
11820///
11821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11822#[inline]
11823#[target_feature(enable = "avx512fp16,avx512vl")]
11824#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11826pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11827    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
11828}
11829
11830/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11831/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11832///
11833/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11834#[inline]
11835#[target_feature(enable = "avx512fp16,avx512vl")]
11836#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11838pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11839    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11840}
11841
11842/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11843/// and store the results in dst.
11844///
11845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
11846#[inline]
11847#[target_feature(enable = "avx512fp16")]
11848#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11850pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11851    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
11852}
11853
11854/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11855/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11856/// mask bit is not set).
11857///
11858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
11859#[inline]
11860#[target_feature(enable = "avx512fp16")]
11861#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11863pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11864    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
11865}
11866
11867/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11868/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11869///
11870/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
11871#[inline]
11872#[target_feature(enable = "avx512fp16")]
11873#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11875pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
11876    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
11877}
11878
11879/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11880/// and store the results in dst.
11881///
11882/// Rounding is done according to the rounding parameter, which can be one of:
11883///
11884/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11885/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11886/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11887/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11888/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11889///
11890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
11891#[inline]
11892#[target_feature(enable = "avx512fp16")]
11893#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11894#[rustc_legacy_const_generics(1)]
11895#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11896pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11897    unsafe {
11898        static_assert_rounding!(ROUNDING);
11899        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
11900    }
11901}
11902
11903/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11904/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11905/// mask bit is not set).
11906///
11907/// Rounding is done according to the rounding parameter, which can be one of:
11908///
11909/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11910/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11911/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11912/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11913/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11914///
11915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
11916#[inline]
11917#[target_feature(enable = "avx512fp16")]
11918#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11919#[rustc_legacy_const_generics(3)]
11920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11921pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
11922    src: __m512h,
11923    k: __mmask32,
11924    a: __m512i,
11925) -> __m512h {
11926    unsafe {
11927        static_assert_rounding!(ROUNDING);
11928        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
11929    }
11930}
11931
11932/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11933/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11934///
11935/// Rounding is done according to the rounding parameter, which can be one of:
11936///
11937/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11938/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11939/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11940/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11942///
11943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
11944#[inline]
11945#[target_feature(enable = "avx512fp16")]
11946#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11947#[rustc_legacy_const_generics(2)]
11948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11949pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11950    static_assert_rounding!(ROUNDING);
11951    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11952}
11953
11954/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11955/// and store the results in dst. The upper 64 bits of dst are zeroed out.
11956///
11957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
11958#[inline]
11959#[target_feature(enable = "avx512fp16,avx512vl")]
11960#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11961#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11962pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
11963    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
11964}
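
// Sketch of the narrowing 128-bit conversion (assumes `avx512fp16` and `avx512vl`; the
// helper name is hypothetical): four i32 lanes become four f16 values in the low
// 64 bits of the result, and the upper 64 bits are zeroed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvtepi32_ph(a: __m128i) -> __m128h {
    // Result layout: [f16(a0), f16(a1), f16(a2), f16(a3), 0.0, 0.0, 0.0, 0.0]
    _mm_cvtepi32_ph(a)
}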
11965
11966/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11967/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11968/// mask bit is not set). The upper 64 bits of dst are zeroed out.
11969///
11970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
11971#[inline]
11972#[target_feature(enable = "avx512fp16,avx512vl")]
11973#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11975pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11976    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
11977}
11978
11979/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11980/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11981/// The upper 64 bits of dst are zeroed out.
11982///
11983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
11984#[inline]
11985#[target_feature(enable = "avx512fp16,avx512vl")]
11986#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11988pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
11989    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
11990}
11991
11992/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11993/// and store the results in dst.
11994///
11995/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
11996#[inline]
11997#[target_feature(enable = "avx512fp16,avx512vl")]
11998#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11999#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12000pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
12001    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
12002}
12003
12004/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12005/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12006/// mask bit is not set).
12007///
12008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
12009#[inline]
12010#[target_feature(enable = "avx512fp16,avx512vl")]
12011#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12013pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12014    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
12015}
12016
12017/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12018/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12019///
12020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
12021#[inline]
12022#[target_feature(enable = "avx512fp16,avx512vl")]
12023#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12025pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
12026    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12027}
12028
12029/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12030/// and store the results in dst.
12031///
12032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
12033#[inline]
12034#[target_feature(enable = "avx512fp16")]
12035#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12037pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
12038    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
12039}
12040
12041/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12042/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12043/// mask bit is not set).
12044///
12045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
12046#[inline]
12047#[target_feature(enable = "avx512fp16")]
12048#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12049#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12050pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12051    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
12052}
12053
12054/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12055/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12056///
12057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
12058#[inline]
12059#[target_feature(enable = "avx512fp16")]
12060#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12061#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12062pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
12063    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
12064}
12065
12066/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12067/// and store the results in dst.
12068///
12069/// Rounding is done according to the rounding parameter, which can be one of:
12070///
12071/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12072/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12073/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12074/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12075/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12076///
12077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
12078#[inline]
12079#[target_feature(enable = "avx512fp16")]
12080#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12081#[rustc_legacy_const_generics(1)]
12082#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12083pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12084    unsafe {
12085        static_assert_rounding!(ROUNDING);
12086        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
12087    }
12088}
12089
12090/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12091/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12092/// mask bit is not set).
12093///
12094/// Rounding is done according to the rounding parameter, which can be one of:
12095///
12096/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12097/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12098/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12099/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12100/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12101///
12102/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
12103#[inline]
12104#[target_feature(enable = "avx512fp16")]
12105#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12106#[rustc_legacy_const_generics(3)]
12107#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12108pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
12109    src: __m256h,
12110    k: __mmask16,
12111    a: __m512i,
12112) -> __m256h {
12113    unsafe {
12114        static_assert_rounding!(ROUNDING);
12115        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
12116    }
12117}
12118
12119/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12120/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12121///
12122/// Rounding is done according to the rounding parameter, which can be one of:
12123///
12124/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12125/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12126/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12127/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12128/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12129///
12130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
12131#[inline]
12132#[target_feature(enable = "avx512fp16")]
12133#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12134#[rustc_legacy_const_generics(2)]
12135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12136pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12137    static_assert_rounding!(ROUNDING);
12138    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12139}
12140
12141/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12142/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12143/// of dst.
12144///
12145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
12146#[inline]
12147#[target_feature(enable = "avx512fp16")]
12148#[cfg_attr(test, assert_instr(vcvtsi2sh))]
12149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12150pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
12151    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12152}
12153
12154/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12155/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12156/// of dst.
12157///
12158/// Rounding is done according to the rounding parameter, which can be one of:
12159///
12160/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12161/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12162/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12163/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12164/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12165///
12166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
12167#[inline]
12168#[target_feature(enable = "avx512fp16")]
12169#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
12170#[rustc_legacy_const_generics(2)]
12171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12172pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
12173    unsafe {
12174        static_assert_rounding!(ROUNDING);
12175        vcvtsi2sh(a, b, ROUNDING)
12176    }
12177}
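
// Scalar sketch (assumes `avx512fp16`; the helper name is hypothetical): replace only the
// lowest f16 lane of `acc` with the converted integer, keeping the upper seven lanes.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundi32_sh(acc: __m128h, value: i32) -> __m128h {
    // Round the int-to-f16 conversion toward negative infinity and suppress exceptions.
    _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(acc, value)
}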
12178
12179/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12180/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12181///
12182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
12183#[inline]
12184#[target_feature(enable = "avx512fp16,avx512vl")]
12185#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12187pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
12188    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
12189}
12190
12191/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12192/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12193/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12194///
12195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
12196#[inline]
12197#[target_feature(enable = "avx512fp16,avx512vl")]
12198#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12199#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12200pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12201    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
12202}
12203
12204/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12205/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12206/// The upper 64 bits of dst are zeroed out.
12207///
12208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
12209#[inline]
12210#[target_feature(enable = "avx512fp16,avx512vl")]
12211#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12212#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12213pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
12214    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12215}
12216
12217/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12218/// and store the results in dst.
12219///
12220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12221#[inline]
12222#[target_feature(enable = "avx512fp16,avx512vl")]
12223#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12225pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12226    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12227}
12228
12229/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12230/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12231/// mask bit is not set).
12232///
12233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12234#[inline]
12235#[target_feature(enable = "avx512fp16,avx512vl")]
12236#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12237#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12238pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12239    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12240}
12241
12242/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12243/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12244///
12245/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12246#[inline]
12247#[target_feature(enable = "avx512fp16,avx512vl")]
12248#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12250pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12251    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12252}
12253
12254/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12255/// and store the results in dst.
12256///
12257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
12258#[inline]
12259#[target_feature(enable = "avx512fp16")]
12260#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12262pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12263    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12264}
12265
12266/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12267/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12268/// mask bit is not set).
12269///
12270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12271#[inline]
12272#[target_feature(enable = "avx512fp16")]
12273#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12275pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12276    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12277}
12278
12279/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12280/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12281///
12282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12283#[inline]
12284#[target_feature(enable = "avx512fp16")]
12285#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12287pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12288    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
12289}
12290
12291/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12292/// and store the results in dst.
12293///
12294/// Rounding is done according to the rounding parameter, which can be one of:
12295///
12296/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12297/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12298/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12299/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12300/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12301///
12302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12303#[inline]
12304#[target_feature(enable = "avx512fp16")]
12305#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12306#[rustc_legacy_const_generics(1)]
12307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12308pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12309    unsafe {
12310        static_assert_rounding!(ROUNDING);
12311        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12312    }
12313}
12314
12315/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12316/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12317/// mask bit is not set).
12318///
12319/// Rounding is done according to the rounding parameter, which can be one of:
12320///
12321/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12322/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12323/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12324/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12325/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12326///
12327/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12328#[inline]
12329#[target_feature(enable = "avx512fp16")]
12330#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12331#[rustc_legacy_const_generics(3)]
12332#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12333pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12334    src: __m256h,
12335    k: __mmask16,
12336    a: __m512i,
12337) -> __m256h {
12338    unsafe {
12339        static_assert_rounding!(ROUNDING);
12340        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12341    }
12342}
12343
12344/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12345/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12346///
12347/// Rounding is done according to the rounding parameter, which can be one of:
12348///
12349/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12350/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12351/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12352/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12353/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12354///
12355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12356#[inline]
12357#[target_feature(enable = "avx512fp16")]
12358#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12359#[rustc_legacy_const_generics(2)]
12360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12361pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12362    static_assert_rounding!(ROUNDING);
12363    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12364}
12365
12366/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12367/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12368/// of dst.
12369///
12370/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
12371#[inline]
12372#[target_feature(enable = "avx512fp16")]
12373#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12374#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12375pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12376    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12377}
12378
12379/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12380/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12381/// of dst.
12382///
12383/// Rounding is done according to the rounding parameter, which can be one of:
12384///
12385/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12386/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12387/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12388/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12389/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12390///
12391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
12392#[inline]
12393#[target_feature(enable = "avx512fp16")]
12394#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12395#[rustc_legacy_const_generics(2)]
12396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12397pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12398    unsafe {
12399        static_assert_rounding!(ROUNDING);
12400        vcvtusi2sh(a, b, ROUNDING)
12401    }
12402}
12403
12404/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12405/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12406///
12407/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
12408#[inline]
12409#[target_feature(enable = "avx512fp16,avx512vl")]
12410#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12412pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12413    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12414}
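
// Sketch for the 64-bit-integer source (assumes `avx512fp16` and `avx512vl`; the helper
// name is hypothetical): two i64 lanes yield two f16 values in the low 32 bits of dst,
// with the remaining 96 bits zeroed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvtepi64_ph(a: __m128i) -> __m128h {
    // Result layout: [f16(a0), f16(a1), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    _mm_cvtepi64_ph(a)
}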
12415
12416/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12417/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12418/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12419///
12420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12421#[inline]
12422#[target_feature(enable = "avx512fp16,avx512vl")]
12423#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12425pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12426    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12427}
12428
12429/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12430/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12431/// The upper 96 bits of dst are zeroed out.
12432///
12433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12434#[inline]
12435#[target_feature(enable = "avx512fp16,avx512vl")]
12436#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12438pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12439    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12440}
12441
12442/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12443/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12444///
12445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12446#[inline]
12447#[target_feature(enable = "avx512fp16,avx512vl")]
12448#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12450pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12451    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12452}
12453
12454/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12455/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12456/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12457///
12458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12459#[inline]
12460#[target_feature(enable = "avx512fp16,avx512vl")]
12461#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12462#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12463pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12464    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12465}
12466
12467/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12468/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12469/// The upper 64 bits of dst are zeroed out.
12470///
12471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12472#[inline]
12473#[target_feature(enable = "avx512fp16,avx512vl")]
12474#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12476pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12477    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12478}
12479
12480/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12481/// and store the results in dst.
12482///
12483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
12484#[inline]
12485#[target_feature(enable = "avx512fp16")]
12486#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12487#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12488pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12489    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12490}
12491
12492/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12493/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12494/// mask bit is not set).
12495///
12496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12497#[inline]
12498#[target_feature(enable = "avx512fp16")]
12499#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12500#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12501pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12502    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12503}
12504
12505/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12506/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12507///
12508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12509#[inline]
12510#[target_feature(enable = "avx512fp16")]
12511#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12513pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12514    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
12515}
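
// Sketch of the 512-bit form above (assumes `avx512fp16`; the helper name is
// hypothetical): eight i64 lanes narrow into a full __m128h of eight f16 values, with
// write-masking merging from `src` wherever a mask bit is clear.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_mask_cvtepi64_ph_512(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    _mm512_mask_cvtepi64_ph(src, k, a)
}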
12516
12517/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12518/// and store the results in dst.
12519///
12520/// Rounding is done according to the rounding parameter, which can be one of:
12521///
12522/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12523/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12524/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12525/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12526/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12527///
12528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12529#[inline]
12530#[target_feature(enable = "avx512fp16")]
12531#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12532#[rustc_legacy_const_generics(1)]
12533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12534pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12535    unsafe {
12536        static_assert_rounding!(ROUNDING);
12537        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12538    }
12539}
12540
12541/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12542/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12543/// mask bit is not set).
12544///
12545/// Rounding is done according to the rounding parameter, which can be one of:
12546///
12547/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12548/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12549/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12550/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12551/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12552///
12553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12554#[inline]
12555#[target_feature(enable = "avx512fp16")]
12556#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12557#[rustc_legacy_const_generics(3)]
12558#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12559pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12560    src: __m128h,
12561    k: __mmask8,
12562    a: __m512i,
12563) -> __m128h {
12564    unsafe {
12565        static_assert_rounding!(ROUNDING);
12566        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12567    }
12568}
12569
12570/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12571/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12572///
12573/// Rounding is done according to the rounding parameter, which can be one of:
12574///
12575/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12576/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12577/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12578/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12579/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12580///
12581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12582#[inline]
12583#[target_feature(enable = "avx512fp16")]
12584#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12585#[rustc_legacy_const_generics(2)]
12586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12587pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12588    static_assert_rounding!(ROUNDING);
12589    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12590}
12591
12592/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12593/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12594///
12595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12596#[inline]
12597#[target_feature(enable = "avx512fp16,avx512vl")]
12598#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12599#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12600pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12601    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12602}
12603
12604/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12605/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12606/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12607///
12608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12609#[inline]
12610#[target_feature(enable = "avx512fp16,avx512vl")]
12611#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12612#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12613pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12614    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12615}
12616
12617/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12618/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12619/// The upper 96 bits of dst are zeroed out.
12620///
12621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12622#[inline]
12623#[target_feature(enable = "avx512fp16,avx512vl")]
12624#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12626pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12627    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12628}
12629
12630/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12631/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12632///
12633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12634#[inline]
12635#[target_feature(enable = "avx512fp16,avx512vl")]
12636#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12638pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12639    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12640}
12641
12642/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12643/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12644/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12645///
12646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12647#[inline]
12648#[target_feature(enable = "avx512fp16,avx512vl")]
12649#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12651pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12652    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12653}
12654
12655/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12656/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12657/// The upper 64 bits of dst are zeroed out.
12658///
12659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12660#[inline]
12661#[target_feature(enable = "avx512fp16,avx512vl")]
12662#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12664pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12665    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12666}
12667
12668/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12669/// and store the results in dst.
12670///
12671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
12672#[inline]
12673#[target_feature(enable = "avx512fp16")]
12674#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12675#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12676pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12677    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12678}
12679
12680/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12681/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12682/// mask bit is not set).
12683///
12684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12685#[inline]
12686#[target_feature(enable = "avx512fp16")]
12687#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12689pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12690    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12691}
12692
12693/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12694/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12695///
12696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
12697#[inline]
12698#[target_feature(enable = "avx512fp16")]
12699#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12701pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
12702    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
12703}
12704
12705/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12706/// and store the results in dst.
12707///
12708/// Rounding is done according to the rounding parameter, which can be one of:
12709///
12710/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12711/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12712/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12713/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12714/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12715///
12716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
12717#[inline]
12718#[target_feature(enable = "avx512fp16")]
12719#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12720#[rustc_legacy_const_generics(1)]
12721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12722pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12723    unsafe {
12724        static_assert_rounding!(ROUNDING);
12725        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
12726    }
12727}
12728
12729/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12730/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12731/// mask bit is not set).
12732///
12733/// Rounding is done according to the rounding parameter, which can be one of:
12734///
12735/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12736/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12737/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12738/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12739/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12740///
12741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
12742#[inline]
12743#[target_feature(enable = "avx512fp16")]
12744#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12745#[rustc_legacy_const_generics(3)]
12746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12747pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
12748    src: __m128h,
12749    k: __mmask8,
12750    a: __m512i,
12751) -> __m128h {
12752    unsafe {
12753        static_assert_rounding!(ROUNDING);
12754        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
12755    }
12756}
12757
12758/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12759/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12760///
12761/// Rounding is done according to the rounding parameter, which can be one of:
12762///
12763/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12764/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12765/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12766/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12767/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12768///
12769/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
12770#[inline]
12771#[target_feature(enable = "avx512fp16")]
12772#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12773#[rustc_legacy_const_generics(2)]
12774#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12775pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12776    static_assert_rounding!(ROUNDING);
12777    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12778}
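// Illustrative sketch (not part of the crate): a u64 value above 2048 is not always exactly
// representable as an f16, so the ROUNDING parameter picks which neighbour wins. Assuming
// `avx512fp16` is available:
//
//     let a = _mm512_set1_epi64(4099);
//     let near = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
//     let zero = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
//     // `near` holds 4100.0 in its eight f16 lanes, `zero` holds 4096.0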
12779
12780/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12781/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
12782///
12783/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
12784#[inline]
12785#[target_feature(enable = "avx512fp16,avx512vl")]
12786#[cfg_attr(test, assert_instr(vcvtps2phx))]
12787#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12788pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
12789    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12790}
12791
12792/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12793/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12794/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12795///
12796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
12797#[inline]
12798#[target_feature(enable = "avx512fp16,avx512vl")]
12799#[cfg_attr(test, assert_instr(vcvtps2phx))]
12800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12801pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
12802    unsafe { vcvtps2phx_128(a, src, k) }
12803}
12804
12805/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12806/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12807/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12808///
12809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
12810#[inline]
12811#[target_feature(enable = "avx512fp16,avx512vl")]
12812#[cfg_attr(test, assert_instr(vcvtps2phx))]
12813#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12814pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
12815    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12816}
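// Illustrative sketch (not part of the crate): the 128-bit form only produces four f16 results,
// so the low four lanes of dst follow the mask and the high four are always zero. Assuming
// `avx512fp16` and `avx512vl` are available, with an illustrative mask of 0b0101:
//
//     let src = _mm_set1_ph(1.0);
//     let a = _mm_set_ps(8.0, 4.0, 2.0, 1.0);      // lane 0 = 1.0 ... lane 3 = 8.0
//     let r = _mm_mask_cvtxps_ph(src, 0b0101, a);  // lanes 0 and 2 are converted (1.0, 4.0),
//                                                  // lanes 1 and 3 keep 1.0 from `src`,
//                                                  // lanes 4..=7 are zero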
12817
12818/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12819/// floating-point elements, and store the results in dst.
12820///
12821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
12822#[inline]
12823#[target_feature(enable = "avx512fp16,avx512vl")]
12824#[cfg_attr(test, assert_instr(vcvtps2phx))]
12825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12826pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
12827    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12828}
12829
12830/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12831/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12832/// when the corresponding mask bit is not set).
12833///
12834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
12835#[inline]
12836#[target_feature(enable = "avx512fp16,avx512vl")]
12837#[cfg_attr(test, assert_instr(vcvtps2phx))]
12838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12839pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
12840    unsafe { vcvtps2phx_256(a, src, k) }
12841}
12842
12843/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12844/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12845/// corresponding mask bit is not set).
12846///
12847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
12848#[inline]
12849#[target_feature(enable = "avx512fp16,avx512vl")]
12850#[cfg_attr(test, assert_instr(vcvtps2phx))]
12851#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12852pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
12853    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12854}
12855
12856/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12857/// floating-point elements, and store the results in dst.
12858///
12859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
12860#[inline]
12861#[target_feature(enable = "avx512fp16")]
12862#[cfg_attr(test, assert_instr(vcvtps2phx))]
12863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12864pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12865    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
12866}
12867
12868/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12869/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12870/// when the corresponding mask bit is not set).
12871///
12872/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
12873#[inline]
12874#[target_feature(enable = "avx512fp16")]
12875#[cfg_attr(test, assert_instr(vcvtps2phx))]
12876#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12877pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
12878    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
12879}
12880
12881/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12882/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12883/// corresponding mask bit is not set).
12884///
12885/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12886#[inline]
12887#[target_feature(enable = "avx512fp16")]
12888#[cfg_attr(test, assert_instr(vcvtps2phx))]
12889#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12890pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12891    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
12892}
12893
12894/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12895/// floating-point elements, and store the results in dst.
12896///
12897/// Rounding is done according to the rounding parameter, which can be one of:
12898///
12899/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12900/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12901/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12902/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12903/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12904///
12905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12906#[inline]
12907#[target_feature(enable = "avx512fp16")]
12908#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12909#[rustc_legacy_const_generics(1)]
12910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12911pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
12912    static_assert_rounding!(ROUNDING);
12913    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
12914}
12915
12916/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12917/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12918/// when the corresponding mask bit is not set).
12919///
12920/// Rounding is done according to the rounding parameter, which can be one of:
12921///
12922/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12923/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12924/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12925/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12926/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12927///
12928/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
12929#[inline]
12930#[target_feature(enable = "avx512fp16")]
12931#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12932#[rustc_legacy_const_generics(3)]
12933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12934pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
12935    src: __m256h,
12936    k: __mmask16,
12937    a: __m512,
12938) -> __m256h {
12939    unsafe {
12940        static_assert_rounding!(ROUNDING);
12941        vcvtps2phx_512(a, src, k, ROUNDING)
12942    }
12943}
12944
12945/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12946/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12947/// corresponding mask bit is not set).
12948///
12949/// Rounding is done according to the rounding parameter, which can be one of:
12950///
12951/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12952/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12953/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12954/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12955/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12956///
12957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
12958#[inline]
12959#[target_feature(enable = "avx512fp16")]
12960#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12961#[rustc_legacy_const_generics(2)]
12962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12963pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
12964    static_assert_rounding!(ROUNDING);
12965    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12966}
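// Illustrative sketch (not part of the crate): with an explicit rounding mode the narrowing is
// no longer tied to MXCSR. 1.00048828125f32 sits exactly halfway between two adjacent f16
// values, so the chosen mode decides the result. Assuming `avx512fp16` is available:
//
//     let a = _mm512_set1_ps(1.00048828125);
//     let near = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
//     let up = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
//     // `near` holds 1.0 in every lane (ties to even), `up` holds 1.0009765625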
12967
12968/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12969/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
12970/// elements from a to the upper elements of dst.
12971///
12972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
12973#[inline]
12974#[target_feature(enable = "avx512fp16")]
12975#[cfg_attr(test, assert_instr(vcvtss2sh))]
12976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12977pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
12978    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
12979}
12980
12981/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12982/// floating-point element, store the result in the lower element of dst using writemask k (the element
12983/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
12984/// upper elements of dst.
12985///
12986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
12987#[inline]
12988#[target_feature(enable = "avx512fp16")]
12989#[cfg_attr(test, assert_instr(vcvtss2sh))]
12990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12991pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12992    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
12993}
12994
12995/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12996/// floating-point element, store the result in the lower element of dst using zeromask k (the element
12997/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
12998/// elements of dst.
12999///
13000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
13001#[inline]
13002#[target_feature(enable = "avx512fp16")]
13003#[cfg_attr(test, assert_instr(vcvtss2sh))]
13004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13005pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
13006    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
13007}
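// Illustrative sketch (not part of the crate): only lane 0 is produced by the scalar conversion;
// the remaining seven f16 lanes always come from `a`, and the mask only governs lane 0. Assuming
// `avx512fp16` is available:
//
//     let a = _mm_set1_ph(2.0);
//     let b = _mm_set_ss(1.5);               // 1.5 is exactly representable as f16
//     let r = _mm_cvtss_sh(a, b);            // lane 0 = 1.5, lanes 1..=7 = 2.0
//     let z = _mm_maskz_cvtss_sh(0, a, b);   // mask bit 0 clear: lane 0 becomes 0.0,
//                                            // lanes 1..=7 are still 2.0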
13008
13009/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13010/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13011/// elements from a to the upper elements of dst.
13012///
13013/// Rounding is done according to the rounding parameter, which can be one of:
13014///
13015/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13016/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13017/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13018/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13019/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13020///
13021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13022#[inline]
13023#[target_feature(enable = "avx512fp16")]
13024#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13025#[rustc_legacy_const_generics(2)]
13026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13027pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13028    static_assert_rounding!(ROUNDING);
13029    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13030}
13031
13032/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13033/// floating-point element, store the result in the lower element of dst using writemask k (the element
13034/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13035/// upper elements of dst.
13036///
13037/// Rounding is done according to the rounding parameter, which can be one of:
13038///
13039/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13040/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13041/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13042/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13043/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13044///
13045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13046#[inline]
13047#[target_feature(enable = "avx512fp16")]
13048#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13049#[rustc_legacy_const_generics(4)]
13050#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13051pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13052    src: __m128h,
13053    k: __mmask8,
13054    a: __m128h,
13055    b: __m128,
13056) -> __m128h {
13057    unsafe {
13058        static_assert_rounding!(ROUNDING);
13059        vcvtss2sh(a, b, src, k, ROUNDING)
13060    }
13061}
13062
13063/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13064/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13065/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13066/// elements of dst.
13067///
13068/// Rounding is done according to the rounding parameter, which can be one of:
13069///
13070/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13071/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13072/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13073/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13074/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13075///
13076/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13077#[inline]
13078#[target_feature(enable = "avx512fp16")]
13079#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13080#[rustc_legacy_const_generics(3)]
13081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13082pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13083    k: __mmask8,
13084    a: __m128h,
13085    b: __m128,
13086) -> __m128h {
13087    static_assert_rounding!(ROUNDING);
13088    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13089}
13090
13091/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13092/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13093///
13094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
13095#[inline]
13096#[target_feature(enable = "avx512fp16,avx512vl")]
13097#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13099pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13100    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13101}
13102
13103/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13104/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13105/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13106///
13107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
13108#[inline]
13109#[target_feature(enable = "avx512fp16,avx512vl")]
13110#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13112pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
13113    unsafe { vcvtpd2ph_128(a, src, k) }
13114}
13115
13116/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13117/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13118/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13119///
13120/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13121#[inline]
13122#[target_feature(enable = "avx512fp16,avx512vl")]
13123#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13124#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13125pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13126    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13127}
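// Illustrative sketch (not part of the crate): two f64 inputs only yield two f16 results, so
// lanes 2..=7 of dst are always zero and the mask applies to lanes 0 and 1. Assuming
// `avx512fp16` and `avx512vl` are available:
//
//     let a = _mm_set_pd(2.5, 0.5);          // lane 0 = 0.5, lane 1 = 2.5
//     let r = _mm_cvtpd_ph(a);               // f16 lanes [0.5, 2.5, 0, 0, 0, 0, 0, 0]
//     let z = _mm_maskz_cvtpd_ph(0b01, a);   // lane 0 converted, lane 1 zeroed out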
13128
13129/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13130/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13131///
13132/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13133#[inline]
13134#[target_feature(enable = "avx512fp16,avx512vl")]
13135#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13136#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13137pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13138    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13139}
13140
13141/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13142/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13143/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13144///
13145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
13146#[inline]
13147#[target_feature(enable = "avx512fp16,avx512vl")]
13148#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13150pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
13151    unsafe { vcvtpd2ph_256(a, src, k) }
13152}
13153
13154/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13155/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13156/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13157///
13158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13159#[inline]
13160#[target_feature(enable = "avx512fp16,avx512vl")]
13161#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13163pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13164    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13165}
13166
13167/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13168/// floating-point elements, and store the results in dst.
13169///
13170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13171#[inline]
13172#[target_feature(enable = "avx512fp16")]
13173#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13174#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13175pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13176    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
13177}
13178
13179/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13180/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13181/// when the corresponding mask bit is not set).
13182///
13183/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
13184#[inline]
13185#[target_feature(enable = "avx512fp16")]
13186#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13187#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13188pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
13189    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
13190}
13191
13192/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13193/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13194/// corresponding mask bit is not set).
13195///
13196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13197#[inline]
13198#[target_feature(enable = "avx512fp16")]
13199#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13201pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13202    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
13203}
13204
13205/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13206/// floating-point elements, and store the results in dst.
13207///
13208/// Rounding is done according to the rounding parameter, which can be one of:
13209///
13210/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13211/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13212/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13213/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13214/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13215///
13216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13217#[inline]
13218#[target_feature(enable = "avx512fp16")]
13219#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13220#[rustc_legacy_const_generics(1)]
13221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13222pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13223    static_assert_rounding!(ROUNDING);
13224    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
13225}
13226
13227/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13228/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13229/// when the corresponding mask bit is not set).
13230///
13231/// Rounding is done according to the rounding parameter, which can be one of:
13232///
13233/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13234/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13235/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13236/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13237/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13238///
13239/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13240#[inline]
13241#[target_feature(enable = "avx512fp16")]
13242#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13243#[rustc_legacy_const_generics(3)]
13244#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13245pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13246    src: __m128h,
13247    k: __mmask8,
13248    a: __m512d,
13249) -> __m128h {
13250    unsafe {
13251        static_assert_rounding!(ROUNDING);
13252        vcvtpd2ph_512(a, src, k, ROUNDING)
13253    }
13254}
13255
13256/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13257/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13258/// corresponding mask bit is not set).
13259///
13260/// Rounding is done according to the rounding parameter, which can be one of:
13261///
13262/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13263/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13264/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13265/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13266/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13267///
13268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13269#[inline]
13270#[target_feature(enable = "avx512fp16")]
13271#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13272#[rustc_legacy_const_generics(2)]
13273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13274pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13275    static_assert_rounding!(ROUNDING);
13276    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
13277}
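// Illustrative sketch (not part of the crate): values such as 0.1 have no exact f16
// representation, so the double-to-half narrowing rounds according to the ROUNDING parameter.
// Assuming `avx512fp16` is available:
//
//     const R: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
//     let a = _mm512_set1_pd(0.1);
//     let r = _mm512_cvt_roundpd_ph::<R>(a);  // every lane holds the nearest f16,
//                                             // approximately 0.0999756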
13278
13279/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13280/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13281/// elements from a to the upper elements of dst.
13282///
13283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
13284#[inline]
13285#[target_feature(enable = "avx512fp16")]
13286#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13288pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13289    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13290}
13291
13292/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13293/// floating-point element, store the result in the lower element of dst using writemask k (the element
13294/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13295/// upper elements of dst.
13296///
13297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
13298#[inline]
13299#[target_feature(enable = "avx512fp16")]
13300#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13301#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13302pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13303    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13304}
13305
13306/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13307/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13308/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13309/// elements of dst.
13310///
13311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13312#[inline]
13313#[target_feature(enable = "avx512fp16")]
13314#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13316pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13317    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
13318}
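// Illustrative sketch (not part of the crate): the scalar double-to-half conversion mirrors
// _mm_cvtss_sh, only the source lane is 64-bit. Assuming `avx512fp16` is available:
//
//     let a = _mm_set1_ph(0.0);
//     let b = _mm_set_sd(-2.25);     // exactly representable as f16
//     let r = _mm_cvtsd_sh(a, b);    // lane 0 = -2.25, lanes 1..=7 = 0.0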
13319
13320/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13321/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13322/// elements from a to the upper elements of dst.
13323///
13324/// Rounding is done according to the rounding parameter, which can be one of:
13325///
13326/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13327/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13328/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13329/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13330/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13331///
13332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13333#[inline]
13334#[target_feature(enable = "avx512fp16")]
13335#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13336#[rustc_legacy_const_generics(2)]
13337#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13338pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13339    static_assert_rounding!(ROUNDING);
13340    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13341}
13342
13343/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13344/// floating-point element, store the result in the lower element of dst using writemask k (the element
13345/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13346/// upper elements of dst.
13347///
13348/// Rounding is done according to the rounding parameter, which can be one of:
13349///
13350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13355///
13356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
13357#[inline]
13358#[target_feature(enable = "avx512fp16")]
13359#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13360#[rustc_legacy_const_generics(4)]
13361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13362pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
13363    src: __m128h,
13364    k: __mmask8,
13365    a: __m128h,
13366    b: __m128d,
13367) -> __m128h {
13368    unsafe {
13369        static_assert_rounding!(ROUNDING);
13370        vcvtsd2sh(a, b, src, k, ROUNDING)
13371    }
13372}
13373
13374/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13375/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13376/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13377/// elements of dst.
13378///
13379/// Rounding is done according to the rounding parameter, which can be one of:
13380///
13381/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13382/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13383/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13384/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13385/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13386///
13387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13388#[inline]
13389#[target_feature(enable = "avx512fp16")]
13390#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13391#[rustc_legacy_const_generics(3)]
13392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13393pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13394    k: __mmask8,
13395    a: __m128h,
13396    b: __m128d,
13397) -> __m128h {
13398    static_assert_rounding!(ROUNDING);
13399    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13400}
13401
13402/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13403/// store the results in dst.
13404///
13405/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13406#[inline]
13407#[target_feature(enable = "avx512fp16,avx512vl")]
13408#[cfg_attr(test, assert_instr(vcvtph2w))]
13409#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13410pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13411    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13412}
13413
13414/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13415/// store the results in dst using writemask k (elements are copied from src when the corresponding
13416/// mask bit is not set).
13417///
13418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
13419#[inline]
13420#[target_feature(enable = "avx512fp16,avx512vl")]
13421#[cfg_attr(test, assert_instr(vcvtph2w))]
13422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13423pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13424    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13425}
13426
13427/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13428/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13429///
13430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13431#[inline]
13432#[target_feature(enable = "avx512fp16,avx512vl")]
13433#[cfg_attr(test, assert_instr(vcvtph2w))]
13434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13435pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13436    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13437}
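// Illustrative sketch (not part of the crate): without a truncating or explicit-rounding
// variant, the f16 -> i16 conversion uses the current MXCSR rounding mode, which defaults to
// round-to-nearest-even. Assuming `avx512fp16` and `avx512vl` are available:
//
//     let a = _mm_set_ph(4.5, 3.5, 2.5, 1.5, 0.5, -0.5, -1.5, -2.5);
//     // lanes 0..=7 hold [-2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5]
//     let r = _mm_cvtph_epi16(a);
//     // ties go to the even integer: [-2, -2, 0, 0, 2, 2, 4, 4]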
13438
13439/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13440/// store the results in dst.
13441///
13442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13443#[inline]
13444#[target_feature(enable = "avx512fp16,avx512vl")]
13445#[cfg_attr(test, assert_instr(vcvtph2w))]
13446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13447pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13448    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13449}
13450
13451/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13452/// store the results in dst using writemask k (elements are copied from src when the corresponding
13453/// mask bit is not set).
13454///
13455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13456#[inline]
13457#[target_feature(enable = "avx512fp16,avx512vl")]
13458#[cfg_attr(test, assert_instr(vcvtph2w))]
13459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13460pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13461    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13462}
13463
13464/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13465/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13466///
13467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13468#[inline]
13469#[target_feature(enable = "avx512fp16,avx512vl")]
13470#[cfg_attr(test, assert_instr(vcvtph2w))]
13471#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13472pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13473    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13474}
13475
13476/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13477/// store the results in dst.
13478///
13479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13480#[inline]
13481#[target_feature(enable = "avx512fp16")]
13482#[cfg_attr(test, assert_instr(vcvtph2w))]
13483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13484pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13485    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13486}
13487
13488/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13489/// store the results in dst using writemask k (elements are copied from src when the corresponding
13490/// mask bit is not set).
13491///
13492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13493#[inline]
13494#[target_feature(enable = "avx512fp16")]
13495#[cfg_attr(test, assert_instr(vcvtph2w))]
13496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13497pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13498    unsafe {
13499        transmute(vcvtph2w_512(
13500            a,
13501            src.as_i16x32(),
13502            k,
13503            _MM_FROUND_CUR_DIRECTION,
13504        ))
13505    }
13506}
13507
13508/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13509/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13510///
13511/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13512#[inline]
13513#[target_feature(enable = "avx512fp16")]
13514#[cfg_attr(test, assert_instr(vcvtph2w))]
13515#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13516pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13517    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13518}
13519
13520/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13521/// store the results in dst.
13522///
13523/// Rounding is done according to the rounding parameter, which can be one of:
13524///
13525/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13526/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13527/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13528/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13529/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13530///
13531/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13532#[inline]
13533#[target_feature(enable = "avx512fp16")]
13534#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13535#[rustc_legacy_const_generics(1)]
13536#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13537pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13538    static_assert_rounding!(ROUNDING);
13539    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13540}
13541
13542/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13543/// store the results in dst using writemask k (elements are copied from src when the corresponding
13544/// mask bit is not set).
13545///
13546/// Rounding is done according to the rounding parameter, which can be one of:
13547///
13548/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13549/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13550/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13551/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13552/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13553///
13554/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13555#[inline]
13556#[target_feature(enable = "avx512fp16")]
13557#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13558#[rustc_legacy_const_generics(3)]
13559#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13560pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13561    src: __m512i,
13562    k: __mmask32,
13563    a: __m512h,
13564) -> __m512i {
13565    unsafe {
13566        static_assert_rounding!(ROUNDING);
13567        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13568    }
13569}
13570
13571/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13572/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13573///
13574/// Rounding is done according to the rounding parameter, which can be one of:
13575///
13576/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13577/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13578/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13579/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13580/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13581///
13582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13583#[inline]
13584#[target_feature(enable = "avx512fp16")]
13585#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13586#[rustc_legacy_const_generics(2)]
13587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13588pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13589    static_assert_rounding!(ROUNDING);
13590    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13591}
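// Illustrative sketch (not part of the crate): the 512-bit rounding variant lets the caller
// override MXCSR per call. Assuming `avx512fp16` is available:
//
//     let a = _mm512_set1_ph(2.5);
//     let near = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
//     let up = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
//     // `near` holds 2 in every 16-bit lane (ties to even), `up` holds 3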
13592
13593/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13594/// and store the results in dst.
13595///
13596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13597#[inline]
13598#[target_feature(enable = "avx512fp16,avx512vl")]
13599#[cfg_attr(test, assert_instr(vcvtph2uw))]
13600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13601pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13602    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13603}
13604
13605/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13606/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13607/// mask bit is not set).
13608///
13609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13610#[inline]
13611#[target_feature(enable = "avx512fp16,avx512vl")]
13612#[cfg_attr(test, assert_instr(vcvtph2uw))]
13613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13614pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13615    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13616}
13617
13618/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13619/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13620///
13621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13622#[inline]
13623#[target_feature(enable = "avx512fp16,avx512vl")]
13624#[cfg_attr(test, assert_instr(vcvtph2uw))]
13625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13626pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13627    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13628}
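// Illustrative sketch (not part of the crate): the unsigned variant behaves like the signed one
// for in-range, non-negative inputs; the zeromask clears whole lanes. Assuming `avx512fp16` and
// `avx512vl` are available:
//
//     let a = _mm_set1_ph(3.0);
//     let r = _mm_maskz_cvtph_epu16(0b0000_1111, a);  // lanes 0..=3 hold 3, lanes 4..=7 hold 0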
13629
13630/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13631/// and store the results in dst.
13632///
13633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13634#[inline]
13635#[target_feature(enable = "avx512fp16,avx512vl")]
13636#[cfg_attr(test, assert_instr(vcvtph2uw))]
13637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13638pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13639    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13640}
13641
13642/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13643/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13644/// mask bit is not set).
13645///
13646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13647#[inline]
13648#[target_feature(enable = "avx512fp16,avx512vl")]
13649#[cfg_attr(test, assert_instr(vcvtph2uw))]
13650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13651pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13652    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13653}
13654
13655/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13656/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13657///
13658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13659#[inline]
13660#[target_feature(enable = "avx512fp16,avx512vl")]
13661#[cfg_attr(test, assert_instr(vcvtph2uw))]
13662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13663pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13664    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13665}
13666
13667/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13668/// and store the results in dst.
13669///
13670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13671#[inline]
13672#[target_feature(enable = "avx512fp16")]
13673#[cfg_attr(test, assert_instr(vcvtph2uw))]
13674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13675pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13676    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13677}
13678
13679/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13680/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13681/// mask bit is not set).
13682///
13683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13684#[inline]
13685#[target_feature(enable = "avx512fp16")]
13686#[cfg_attr(test, assert_instr(vcvtph2uw))]
13687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13688pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13689    unsafe {
13690        transmute(vcvtph2uw_512(
13691            a,
13692            src.as_u16x32(),
13693            k,
13694            _MM_FROUND_CUR_DIRECTION,
13695        ))
13696    }
13697}
13698
13699/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13700/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13701///
13702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13703#[inline]
13704#[target_feature(enable = "avx512fp16")]
13705#[cfg_attr(test, assert_instr(vcvtph2uw))]
13706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13707pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13708    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13709}
13710
13711/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13712/// and store the results in dst.
13713///
13714/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13715///
13716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
13717#[inline]
13718#[target_feature(enable = "avx512fp16")]
13719#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13720#[rustc_legacy_const_generics(1)]
13721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13722pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
13723    static_assert_sae!(SAE);
13724    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13725}
13726
13727/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13728/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13729/// mask bit is not set).
13730///
13731/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13732///
13733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13734#[inline]
13735#[target_feature(enable = "avx512fp16")]
13736#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13737#[rustc_legacy_const_generics(3)]
13738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13739pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
13740    src: __m512i,
13741    k: __mmask32,
13742    a: __m512h,
13743) -> __m512i {
13744    unsafe {
13745        static_assert_sae!(SAE);
13746        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
13747    }
13748}
13749
13750/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13751/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13752///
13753/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13754///
13755/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13756#[inline]
13757#[target_feature(enable = "avx512fp16")]
13758#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13759#[rustc_legacy_const_generics(2)]
13760#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13761pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13762    static_assert_sae!(SAE);
13763    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
13764}
13765
13766/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13767/// truncation, and store the results in dst.
13768///
13769/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
13770#[inline]
13771#[target_feature(enable = "avx512fp16,avx512vl")]
13772#[cfg_attr(test, assert_instr(vcvttph2w))]
13773#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13774pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13775    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13776}
13777
13778/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13779/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13780/// mask bit is not set).
13781///
13782/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
13783#[inline]
13784#[target_feature(enable = "avx512fp16,avx512vl")]
13785#[cfg_attr(test, assert_instr(vcvttph2w))]
13786#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13787pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13788    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13789}
13790
13791/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13792/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13793/// mask bit is not set).
13794///
13795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13796#[inline]
13797#[target_feature(enable = "avx512fp16,avx512vl")]
13798#[cfg_attr(test, assert_instr(vcvttph2w))]
13799#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13800pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13801    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
13802}
13803
13804/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13805/// truncation, and store the results in dst.
13806///
13807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
13808#[inline]
13809#[target_feature(enable = "avx512fp16,avx512vl")]
13810#[cfg_attr(test, assert_instr(vcvttph2w))]
13811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13812pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
13813    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
13814}
13815
13816/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13817/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13818/// mask bit is not set).
13819///
13820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
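///
/// # Example
///
/// An illustrative sketch of the writemask behaviour (assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16/AVX512-VL support):
///
/// ```ignore
/// let src = _mm256_set1_epi16(-1);
/// let a = _mm256_set1_ph(3.6);
/// // Only the lanes selected by the mask are converted (truncated to 3);
/// // the remaining lanes keep the value from `src`.
/// let r = _mm256_mask_cvttph_epi16(src, 0b0000_0000_1111_1111, a);
/// ```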
13821#[inline]
13822#[target_feature(enable = "avx512fp16,avx512vl")]
13823#[cfg_attr(test, assert_instr(vcvttph2w))]
13824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13825pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13826    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
13827}
13828
13829/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13830/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13831/// mask bit is not set).
13832///
13833/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
13834#[inline]
13835#[target_feature(enable = "avx512fp16,avx512vl")]
13836#[cfg_attr(test, assert_instr(vcvttph2w))]
13837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13838pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13839    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
13840}
13841
13842/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13843/// truncation, and store the results in dst.
13844///
13845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
13846#[inline]
13847#[target_feature(enable = "avx512fp16")]
13848#[cfg_attr(test, assert_instr(vcvttph2w))]
13849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13850pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
13851    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13852}
13853
13854/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13855/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13856/// mask bit is not set).
13857///
13858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
13859#[inline]
13860#[target_feature(enable = "avx512fp16")]
13861#[cfg_attr(test, assert_instr(vcvttph2w))]
13862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13863pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13864    unsafe {
13865        transmute(vcvttph2w_512(
13866            a,
13867            src.as_i16x32(),
13868            k,
13869            _MM_FROUND_CUR_DIRECTION,
13870        ))
13871    }
13872}
13873
13874/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13875/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13876/// mask bit is not set).
13877///
13878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
13879#[inline]
13880#[target_feature(enable = "avx512fp16")]
13881#[cfg_attr(test, assert_instr(vcvttph2w))]
13882#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13883pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13884    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
13885}
13886
13887/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13888/// truncation, and store the results in dst.
13889///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13891///
13892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
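///
/// # Example
///
/// A minimal sketch (nightly `f16`/`stdarch_x86_avx512_f16` features and AVX512-FP16 assumed):
///
/// ```ignore
/// let a = _mm512_set1_ph(-2.7);
/// // Truncate toward zero with exceptions suppressed; every lane becomes -2.
/// let r = _mm512_cvtt_roundph_epi16::<{ _MM_FROUND_NO_EXC }>(a);
/// ```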
13893#[inline]
13894#[target_feature(enable = "avx512fp16")]
13895#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13896#[rustc_legacy_const_generics(1)]
13897#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13898pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
13899    static_assert_sae!(SAE);
13900    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13901}
13902
13903/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13904/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13905/// mask bit is not set).
13906///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13908///
13909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
13910#[inline]
13911#[target_feature(enable = "avx512fp16")]
13912#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13913#[rustc_legacy_const_generics(3)]
13914#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13915pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
13916    src: __m512i,
13917    k: __mmask32,
13918    a: __m512h,
13919) -> __m512i {
13920    unsafe {
13921        static_assert_sae!(SAE);
13922        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
13923    }
13924}
13925
13926/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13927/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13928/// mask bit is not set).
13929///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13931///
13932/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
13933#[inline]
13934#[target_feature(enable = "avx512fp16")]
13935#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13936#[rustc_legacy_const_generics(2)]
13937#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13938pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13939    static_assert_sae!(SAE);
13940    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
13941}
13942
13943/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13944/// truncation, and store the results in dst.
13945///
13946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
13947#[inline]
13948#[target_feature(enable = "avx512fp16,avx512vl")]
13949#[cfg_attr(test, assert_instr(vcvttph2uw))]
13950#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13951pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
13952    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
13953}
13954
13955/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13956/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13957/// mask bit is not set).
13958///
13959/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
13960#[inline]
13961#[target_feature(enable = "avx512fp16,avx512vl")]
13962#[cfg_attr(test, assert_instr(vcvttph2uw))]
13963#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13964pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13965    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
13966}
13967
13968/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13969/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13970/// mask bit is not set).
13971///
13972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
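///
/// # Example
///
/// An illustrative zeromask sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16/AVX512-VL support assumed):
///
/// ```ignore
/// let a = _mm_set1_ph(9.9);
/// // The low four lanes are truncated to 9; the high four lanes are zeroed.
/// let r = _mm_maskz_cvttph_epu16(0b0000_1111, a);
/// ```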
13973#[inline]
13974#[target_feature(enable = "avx512fp16,avx512vl")]
13975#[cfg_attr(test, assert_instr(vcvttph2uw))]
13976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13977pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13978    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
13979}
13980
13981/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13982/// truncation, and store the results in dst.
13983///
13984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
13985#[inline]
13986#[target_feature(enable = "avx512fp16,avx512vl")]
13987#[cfg_attr(test, assert_instr(vcvttph2uw))]
13988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13989pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
13990    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
13991}
13992
13993/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13994/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13995/// mask bit is not set).
13996///
13997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
13998#[inline]
13999#[target_feature(enable = "avx512fp16,avx512vl")]
14000#[cfg_attr(test, assert_instr(vcvttph2uw))]
14001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14002pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14003    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14004}
14005
14006/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14007/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14008/// mask bit is not set).
14009///
14010/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14011#[inline]
14012#[target_feature(enable = "avx512fp16,avx512vl")]
14013#[cfg_attr(test, assert_instr(vcvttph2uw))]
14014#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14015pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14016    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14017}
14018
14019/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14020/// truncation, and store the results in dst.
14021///
14022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14023#[inline]
14024#[target_feature(enable = "avx512fp16")]
14025#[cfg_attr(test, assert_instr(vcvttph2uw))]
14026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14027pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14028    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14029}
14030
14031/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14032/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14033/// mask bit is not set).
14034///
14035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14036#[inline]
14037#[target_feature(enable = "avx512fp16")]
14038#[cfg_attr(test, assert_instr(vcvttph2uw))]
14039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14040pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14041    unsafe {
14042        transmute(vcvttph2uw_512(
14043            a,
14044            src.as_u16x32(),
14045            k,
14046            _MM_FROUND_CUR_DIRECTION,
14047        ))
14048    }
14049}
14050
14051/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14052/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14053/// mask bit is not set).
14054///
14055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14056#[inline]
14057#[target_feature(enable = "avx512fp16")]
14058#[cfg_attr(test, assert_instr(vcvttph2uw))]
14059#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14060pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14061    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14062}
14063
14064/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14065/// truncation, and store the results in dst.
14066///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14068///
14069/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14070#[inline]
14071#[target_feature(enable = "avx512fp16")]
14072#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14073#[rustc_legacy_const_generics(1)]
14074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14075pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14076    static_assert_sae!(SAE);
14077    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14078}
14079
14080/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14081/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14082/// mask bit is not set).
14083///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14085///
14086/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14087#[inline]
14088#[target_feature(enable = "avx512fp16")]
14089#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14090#[rustc_legacy_const_generics(3)]
14091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14092pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14093    src: __m512i,
14094    k: __mmask32,
14095    a: __m512h,
14096) -> __m512i {
14097    unsafe {
14098        static_assert_sae!(SAE);
14099        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14100    }
14101}
14102
14103/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14104/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14105/// mask bit is not set).
14106///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14108///
14109/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14110#[inline]
14111#[target_feature(enable = "avx512fp16")]
14112#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14113#[rustc_legacy_const_generics(2)]
14114#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14115pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14116    static_assert_sae!(SAE);
14117    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14118}
14119
14120/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14121/// results in dst.
14122///
14123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
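///
/// # Example
///
/// A minimal sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16/AVX512-VL support assumed):
///
/// ```ignore
/// let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// // Only the four lowest half-precision elements are widened, giving [0, 1, 2, 3].
/// let r = _mm_cvtph_epi32(a);
/// ```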
14124#[inline]
14125#[target_feature(enable = "avx512fp16,avx512vl")]
14126#[cfg_attr(test, assert_instr(vcvtph2dq))]
14127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14128pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14129    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14130}
14131
14132/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14133/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14134///
14135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14136#[inline]
14137#[target_feature(enable = "avx512fp16,avx512vl")]
14138#[cfg_attr(test, assert_instr(vcvtph2dq))]
14139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14140pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14141    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14142}
14143
14144/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14145/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14146///
14147/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14148#[inline]
14149#[target_feature(enable = "avx512fp16,avx512vl")]
14150#[cfg_attr(test, assert_instr(vcvtph2dq))]
14151#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14152pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14153    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14154}
14155
14156/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14157/// results in dst.
14158///
14159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14160#[inline]
14161#[target_feature(enable = "avx512fp16,avx512vl")]
14162#[cfg_attr(test, assert_instr(vcvtph2dq))]
14163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14164pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14165    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14166}
14167
14168/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14169/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14170///
14171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14172#[inline]
14173#[target_feature(enable = "avx512fp16,avx512vl")]
14174#[cfg_attr(test, assert_instr(vcvtph2dq))]
14175#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14176pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14177    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14178}
14179
14180/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14181/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14182///
14183/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14184#[inline]
14185#[target_feature(enable = "avx512fp16,avx512vl")]
14186#[cfg_attr(test, assert_instr(vcvtph2dq))]
14187#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14188pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14189    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14190}
14191
14192/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14193/// results in dst.
14194///
14195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14196#[inline]
14197#[target_feature(enable = "avx512fp16")]
14198#[cfg_attr(test, assert_instr(vcvtph2dq))]
14199#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14200pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14201    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14202}
14203
14204/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14205/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14206///
14207/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14208#[inline]
14209#[target_feature(enable = "avx512fp16")]
14210#[cfg_attr(test, assert_instr(vcvtph2dq))]
14211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14212pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14213    unsafe {
14214        transmute(vcvtph2dq_512(
14215            a,
14216            src.as_i32x16(),
14217            k,
14218            _MM_FROUND_CUR_DIRECTION,
14219        ))
14220    }
14221}
14222
14223/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14224/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14225///
14226/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14227#[inline]
14228#[target_feature(enable = "avx512fp16")]
14229#[cfg_attr(test, assert_instr(vcvtph2dq))]
14230#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14231pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14232    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14233}
14234
14235/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14236/// results in dst.
14237///
14238/// Rounding is done according to the rounding parameter, which can be one of:
14239///
14240/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14241/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14242/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14243/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14244/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14245///
14246/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
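///
/// # Example
///
/// An illustrative sketch of an explicit rounding mode (nightly `f16`/`stdarch_x86_avx512_f16`
/// features and AVX512-FP16 support assumed):
///
/// ```ignore
/// let a = _mm256_set1_ph(2.5);
/// // Round toward negative infinity with exceptions suppressed: every lane becomes 2.
/// let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```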
14247#[inline]
14248#[target_feature(enable = "avx512fp16")]
14249#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14250#[rustc_legacy_const_generics(1)]
14251#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14252pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14253    static_assert_rounding!(ROUNDING);
14254    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14255}
14256
14257/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14258/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14259///
14260/// Rounding is done according to the rounding parameter, which can be one of:
14261///
14262/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14263/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14264/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14265/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14266/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14267///
14268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14269#[inline]
14270#[target_feature(enable = "avx512fp16")]
14271#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14272#[rustc_legacy_const_generics(3)]
14273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14274pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14275    src: __m512i,
14276    k: __mmask16,
14277    a: __m256h,
14278) -> __m512i {
14279    unsafe {
14280        static_assert_rounding!(ROUNDING);
14281        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14282    }
14283}
14284
14285/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14286/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14287///
14288/// Rounding is done according to the rounding parameter, which can be one of:
14289///
14290/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14291/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14292/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14293/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14294/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14295///
14296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14297#[inline]
14298#[target_feature(enable = "avx512fp16")]
14299#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14300#[rustc_legacy_const_generics(2)]
14301#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14302pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14303    static_assert_rounding!(ROUNDING);
14304    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14305}
14306
14307/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14308/// the result in dst.
14309///
14310/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
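///
/// # Example
///
/// A minimal scalar sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16 support assumed):
///
/// ```ignore
/// let a = _mm_set_sh(2.25);
/// // Uses the current MXCSR rounding mode (round-to-nearest by default), giving 2.
/// let r: i32 = _mm_cvtsh_i32(a);
/// ```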
14311#[inline]
14312#[target_feature(enable = "avx512fp16")]
14313#[cfg_attr(test, assert_instr(vcvtsh2si))]
14314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14315pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14316    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14317}
14318
14319/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14320/// the result in dst.
14321///
14322/// Rounding is done according to the rounding parameter, which can be one of:
14323///
14324/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14325/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14326/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14327/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14328/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14329///
14330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
14331#[inline]
14332#[target_feature(enable = "avx512fp16")]
14333#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14334#[rustc_legacy_const_generics(1)]
14335#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14336pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14337    unsafe {
14338        static_assert_rounding!(ROUNDING);
14339        vcvtsh2si32(a, ROUNDING)
14340    }
14341}
14342
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
14345///
14346/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
14347#[inline]
14348#[target_feature(enable = "avx512fp16,avx512vl")]
14349#[cfg_attr(test, assert_instr(vcvtph2udq))]
14350#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14351pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14352    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14353}
14354
14355/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14356/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14357///
14358/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14359#[inline]
14360#[target_feature(enable = "avx512fp16,avx512vl")]
14361#[cfg_attr(test, assert_instr(vcvtph2udq))]
14362#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14363pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14364    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14365}
14366
14367/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14368/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14369///
14370/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14371#[inline]
14372#[target_feature(enable = "avx512fp16,avx512vl")]
14373#[cfg_attr(test, assert_instr(vcvtph2udq))]
14374#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14375pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14376    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14377}
14378
14379/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14380/// the results in dst.
14381///
14382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14383#[inline]
14384#[target_feature(enable = "avx512fp16,avx512vl")]
14385#[cfg_attr(test, assert_instr(vcvtph2udq))]
14386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14387pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14388    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14389}
14390
14391/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14392/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14393///
14394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14395#[inline]
14396#[target_feature(enable = "avx512fp16,avx512vl")]
14397#[cfg_attr(test, assert_instr(vcvtph2udq))]
14398#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14399pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14400    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14401}
14402
14403/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14404/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14405///
14406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14407#[inline]
14408#[target_feature(enable = "avx512fp16,avx512vl")]
14409#[cfg_attr(test, assert_instr(vcvtph2udq))]
14410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14411pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14412    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14413}
14414
14415/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14416/// the results in dst.
14417///
14418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
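///
/// # Example
///
/// A minimal sketch (nightly `f16`/`stdarch_x86_avx512_f16` features and AVX512-FP16 assumed):
///
/// ```ignore
/// let a = _mm256_set1_ph(42.0);
/// // All sixteen half-precision elements are widened to the unsigned value 42.
/// let r = _mm512_cvtph_epu32(a);
/// ```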
14419#[inline]
14420#[target_feature(enable = "avx512fp16")]
14421#[cfg_attr(test, assert_instr(vcvtph2udq))]
14422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14423pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14424    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14425}
14426
14427/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14428/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14429///
14430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14431#[inline]
14432#[target_feature(enable = "avx512fp16")]
14433#[cfg_attr(test, assert_instr(vcvtph2udq))]
14434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14435pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14436    unsafe {
14437        transmute(vcvtph2udq_512(
14438            a,
14439            src.as_u32x16(),
14440            k,
14441            _MM_FROUND_CUR_DIRECTION,
14442        ))
14443    }
14444}
14445
14446/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14447/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14448///
14449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14450#[inline]
14451#[target_feature(enable = "avx512fp16")]
14452#[cfg_attr(test, assert_instr(vcvtph2udq))]
14453#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14454pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14455    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14456}
14457
14458/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14459/// the results in dst.
14460///
14461/// Rounding is done according to the rounding parameter, which can be one of:
14462///
14463/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14464/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14465/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14466/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14467/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14468///
14469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14470#[inline]
14471#[target_feature(enable = "avx512fp16")]
14472#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14473#[rustc_legacy_const_generics(1)]
14474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14475pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14476    static_assert_rounding!(ROUNDING);
14477    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14478}
14479
14480/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14481/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14482///
14483/// Rounding is done according to the rounding parameter, which can be one of:
14484///
14485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14490///
14491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14492#[inline]
14493#[target_feature(enable = "avx512fp16")]
14494#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14495#[rustc_legacy_const_generics(3)]
14496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14497pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14498    src: __m512i,
14499    k: __mmask16,
14500    a: __m256h,
14501) -> __m512i {
14502    unsafe {
14503        static_assert_rounding!(ROUNDING);
14504        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14505    }
14506}
14507
14508/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14509/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14510///
14511/// Rounding is done according to the rounding parameter, which can be one of:
14512///
14513/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14514/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14515/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14516/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14517/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14518///
14519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14520#[inline]
14521#[target_feature(enable = "avx512fp16")]
14522#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14523#[rustc_legacy_const_generics(2)]
14524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14525pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14526    static_assert_rounding!(ROUNDING);
14527    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14528}
14529
14530/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14531/// the result in dst.
14532///
14533/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
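///
/// # Example
///
/// A minimal scalar sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16 support assumed):
///
/// ```ignore
/// let a = _mm_set_sh(3.5);
/// // Rounded with the current MXCSR mode (ties-to-even by default), giving 4.
/// let r: u32 = _mm_cvtsh_u32(a);
/// ```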
14534#[inline]
14535#[target_feature(enable = "avx512fp16")]
14536#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14537#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14538pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14539    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14540}
14541
14542/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14543/// the result in dst.
14544///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14546///
14547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14548#[inline]
14549#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
14551#[rustc_legacy_const_generics(1)]
14552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
    unsafe {
        static_assert_rounding!(ROUNDING);
        vcvtsh2usi32(a, ROUNDING)
14557    }
14558}
14559
14560/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14561/// store the results in dst.
14562///
14563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
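///
/// # Example
///
/// A minimal sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16/AVX512-VL support assumed):
///
/// ```ignore
/// let a = _mm_set1_ph(-3.9);
/// // The four lowest elements are truncated toward zero, giving [-3, -3, -3, -3].
/// let r = _mm_cvttph_epi32(a);
/// ```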
14564#[inline]
14565#[target_feature(enable = "avx512fp16,avx512vl")]
14566#[cfg_attr(test, assert_instr(vcvttph2dq))]
14567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14568pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14569    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14570}
14571
14572/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14573/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14574///
14575/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14576#[inline]
14577#[target_feature(enable = "avx512fp16,avx512vl")]
14578#[cfg_attr(test, assert_instr(vcvttph2dq))]
14579#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14580pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14581    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14582}
14583
14584/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14585/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14586///
14587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14588#[inline]
14589#[target_feature(enable = "avx512fp16,avx512vl")]
14590#[cfg_attr(test, assert_instr(vcvttph2dq))]
14591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14592pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14593    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14594}
14595
14596/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14597/// store the results in dst.
14598///
14599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14600#[inline]
14601#[target_feature(enable = "avx512fp16,avx512vl")]
14602#[cfg_attr(test, assert_instr(vcvttph2dq))]
14603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14604pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14605    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14606}
14607
14608/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14609/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14610///
14611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14612#[inline]
14613#[target_feature(enable = "avx512fp16,avx512vl")]
14614#[cfg_attr(test, assert_instr(vcvttph2dq))]
14615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14616pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14617    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14618}
14619
14620/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14621/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14622///
14623/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14624#[inline]
14625#[target_feature(enable = "avx512fp16,avx512vl")]
14626#[cfg_attr(test, assert_instr(vcvttph2dq))]
14627#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14628pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14629    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14630}
14631
14632/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14633/// store the results in dst.
14634///
14635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14636#[inline]
14637#[target_feature(enable = "avx512fp16")]
14638#[cfg_attr(test, assert_instr(vcvttph2dq))]
14639#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14640pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14641    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14642}
14643
14644/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14645/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14646///
14647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14648#[inline]
14649#[target_feature(enable = "avx512fp16")]
14650#[cfg_attr(test, assert_instr(vcvttph2dq))]
14651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14652pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14653    unsafe {
14654        transmute(vcvttph2dq_512(
14655            a,
14656            src.as_i32x16(),
14657            k,
14658            _MM_FROUND_CUR_DIRECTION,
14659        ))
14660    }
14661}
14662
14663/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14664/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14665///
14666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14667#[inline]
14668#[target_feature(enable = "avx512fp16")]
14669#[cfg_attr(test, assert_instr(vcvttph2dq))]
14670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14671pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14672    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14673}
14674
14675/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14676/// store the results in dst.
14677///
14678/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14679///
14680/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
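///
/// # Example
///
/// A minimal sketch (nightly `f16`/`stdarch_x86_avx512_f16` features and AVX512-FP16 assumed):
///
/// ```ignore
/// let a = _mm256_set1_ph(5.99);
/// // Truncate with exceptions suppressed; every lane becomes 5.
/// let r = _mm512_cvtt_roundph_epi32::<{ _MM_FROUND_NO_EXC }>(a);
/// ```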
14681#[inline]
14682#[target_feature(enable = "avx512fp16")]
14683#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14684#[rustc_legacy_const_generics(1)]
14685#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14686pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14687    static_assert_sae!(SAE);
14688    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14689}
14690
14691/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14692/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14693///
14694/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14695///
14696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14697#[inline]
14698#[target_feature(enable = "avx512fp16")]
14699#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14700#[rustc_legacy_const_generics(3)]
14701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14702pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14703    src: __m512i,
14704    k: __mmask16,
14705    a: __m256h,
14706) -> __m512i {
14707    unsafe {
14708        static_assert_sae!(SAE);
14709        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14710    }
14711}
14712
14713/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14714/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14715///
14716/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14717///
14718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14719#[inline]
14720#[target_feature(enable = "avx512fp16")]
14721#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14722#[rustc_legacy_const_generics(2)]
14723#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14724pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14725    static_assert_sae!(SAE);
14726    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14727}
14728
14729/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14730/// the result in dst.
14731///
14732/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
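///
/// # Example
///
/// A minimal scalar sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16 support assumed):
///
/// ```ignore
/// let a = _mm_set_sh(-7.8);
/// // Truncation always rounds toward zero, so the result is -7.
/// let r: i32 = _mm_cvttsh_i32(a);
/// ```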
14733#[inline]
14734#[target_feature(enable = "avx512fp16")]
14735#[cfg_attr(test, assert_instr(vcvttsh2si))]
14736#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14737pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14738    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14739}
14740
14741/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14742/// the result in dst.
14743///
14744/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14745///
14746/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
14747#[inline]
14748#[target_feature(enable = "avx512fp16")]
14749#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14750#[rustc_legacy_const_generics(1)]
14751#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14752pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14753    unsafe {
14754        static_assert_sae!(SAE);
14755        vcvttsh2si32(a, SAE)
14756    }
14757}
14758
14759/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14760/// store the results in dst.
14761///
14762/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
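///
/// # Example
///
/// A minimal sketch (unstable `f16`/`stdarch_x86_avx512_f16` features and
/// AVX512-FP16/AVX512-VL support assumed):
///
/// ```ignore
/// let a = _mm_set1_ph(6.7);
/// // The four lowest elements are truncated to the unsigned value 6.
/// let r = _mm_cvttph_epu32(a);
/// ```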
14763#[inline]
14764#[target_feature(enable = "avx512fp16,avx512vl")]
14765#[cfg_attr(test, assert_instr(vcvttph2udq))]
14766#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14767pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14768    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14769}
14770
14771/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14772/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14773///
14774/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14775#[inline]
14776#[target_feature(enable = "avx512fp16,avx512vl")]
14777#[cfg_attr(test, assert_instr(vcvttph2udq))]
14778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14779pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14780    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
14781}
14782
14783/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14784/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14785///
14786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
14787#[inline]
14788#[target_feature(enable = "avx512fp16,avx512vl")]
14789#[cfg_attr(test, assert_instr(vcvttph2udq))]
14790#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14791pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14792    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14793}
14794
14795/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14796/// store the results in dst.
14797///
14798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14799#[inline]
14800#[target_feature(enable = "avx512fp16,avx512vl")]
14801#[cfg_attr(test, assert_instr(vcvttph2udq))]
14802#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14803pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14804    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14805}
14806
14807/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14808/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14809///
14810/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14811#[inline]
14812#[target_feature(enable = "avx512fp16,avx512vl")]
14813#[cfg_attr(test, assert_instr(vcvttph2udq))]
14814#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14815pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14816    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
14817}
14818
14819/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14820/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14821///
14822/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14823#[inline]
14824#[target_feature(enable = "avx512fp16,avx512vl")]
14825#[cfg_attr(test, assert_instr(vcvttph2udq))]
14826#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14827pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14828    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14829}
14830
14831/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14832/// store the results in dst.
14833///
14834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14835#[inline]
14836#[target_feature(enable = "avx512fp16")]
14837#[cfg_attr(test, assert_instr(vcvttph2udq))]
14838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14839pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14840    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14841}
14842
14843/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14844/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14845///
14846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14847#[inline]
14848#[target_feature(enable = "avx512fp16")]
14849#[cfg_attr(test, assert_instr(vcvttph2udq))]
14850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14851pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14852    unsafe {
14853        transmute(vcvttph2udq_512(
14854            a,
14855            src.as_u32x16(),
14856            k,
14857            _MM_FROUND_CUR_DIRECTION,
14858        ))
14859    }
14860}
14861
14862/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14863/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14864///
14865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14866#[inline]
14867#[target_feature(enable = "avx512fp16")]
14868#[cfg_attr(test, assert_instr(vcvttph2udq))]
14869#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14870pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14871    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14872}
14873
14874/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14875/// store the results in dst.
14876///
14877/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14878///
14879/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
14880#[inline]
14881#[target_feature(enable = "avx512fp16")]
14882#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14883#[rustc_legacy_const_generics(1)]
14884#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14885pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
14886    static_assert_sae!(SAE);
14887    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14888}
14889
14890/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14891/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14892///
14893/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14894///
14895/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
14896#[inline]
14897#[target_feature(enable = "avx512fp16")]
14898#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14899#[rustc_legacy_const_generics(3)]
14900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14901pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
14902    src: __m512i,
14903    k: __mmask16,
14904    a: __m256h,
14905) -> __m512i {
14906    unsafe {
14907        static_assert_sae!(SAE);
14908        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
14909    }
14910}
14911
14912/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14913/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14914///
14915/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14916///
14917/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
14918#[inline]
14919#[target_feature(enable = "avx512fp16")]
14920#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14921#[rustc_legacy_const_generics(2)]
14922#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14923pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14924    static_assert_sae!(SAE);
14925    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
14926}
14927
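// Illustrative sketch, not upstream code: the `*_cvtt_round*` forms take SAE as a
// const generic, so exception suppression is chosen at compile time. The helper
// name `cvtt_roundph_epu32_example` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cvtt_roundph_epu32_example(a: __m256h) -> __m512i {
    // Truncate all 16 lanes while suppressing floating-point exception reporting.
    _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a)
}
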
14928/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14929/// the result in dst.
14930///
14931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
14932#[inline]
14933#[target_feature(enable = "avx512fp16")]
14934#[cfg_attr(test, assert_instr(vcvttsh2usi))]
14935#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14936pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
14937    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14938}
14939
14940/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14941/// the result in dst.
14942///
14943/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14944///
14945/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
14946#[inline]
14947#[target_feature(enable = "avx512fp16")]
14948#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
14949#[rustc_legacy_const_generics(1)]
14950#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14951pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14952    unsafe {
14953        static_assert_sae!(SAE);
14954        vcvttsh2usi32(a, SAE)
14955    }
14956}
14957
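// Illustrative sketch, not upstream code: truncating the lowest f16 lane to a u32
// scalar; `cvttsh_u32_example` is a made-up name.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cvttsh_u32_example() -> u32 {
    // 2.75 is exactly representable in f16 and truncates toward zero to 2.
    let a = _mm_set_sh(2.75);
    _mm_cvttsh_u32(a)
}
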
14958/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14959/// store the results in dst.
14960///
14961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
14962#[inline]
14963#[target_feature(enable = "avx512fp16,avx512vl")]
14964#[cfg_attr(test, assert_instr(vcvtph2qq))]
14965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14966pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
14967    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
14968}
14969
14970/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14971/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14972///
14973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
14974#[inline]
14975#[target_feature(enable = "avx512fp16,avx512vl")]
14976#[cfg_attr(test, assert_instr(vcvtph2qq))]
14977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14978pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14979    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
14980}
14981
14982/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14983/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14984///
14985/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
14986#[inline]
14987#[target_feature(enable = "avx512fp16,avx512vl")]
14988#[cfg_attr(test, assert_instr(vcvtph2qq))]
14989#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14990pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
14991    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
14992}
14993
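// Illustrative sketch, not upstream code: f16 -> i64 conversion under a merge
// (write) mask; `cvtph_epi64_merge_example` is a hypothetical helper.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn cvtph_epi64_merge_example(src: __m128i) -> __m128i {
    // Only the two low f16 lanes of `a` are consulted. With mask 0b01, lane 0 holds
    // the conversion of -1.5 (-2 under the default round-to-nearest-even mode) and
    // lane 1 is copied unchanged from `src`.
    let a = _mm_set_sh(-1.5);
    _mm_mask_cvtph_epi64(src, 0b01, a)
}
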
14994/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14995/// store the results in dst.
14996///
14997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
14998#[inline]
14999#[target_feature(enable = "avx512fp16,avx512vl")]
15000#[cfg_attr(test, assert_instr(vcvtph2qq))]
15001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15002pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15003    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15004}
15005
15006/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15007/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15008///
15009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15010#[inline]
15011#[target_feature(enable = "avx512fp16,avx512vl")]
15012#[cfg_attr(test, assert_instr(vcvtph2qq))]
15013#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15014pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15015    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15016}
15017
15018/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15019/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15020///
15021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15022#[inline]
15023#[target_feature(enable = "avx512fp16,avx512vl")]
15024#[cfg_attr(test, assert_instr(vcvtph2qq))]
15025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15026pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15027    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15028}
15029
15030/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15031/// store the results in dst.
15032///
15033/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15034#[inline]
15035#[target_feature(enable = "avx512fp16")]
15036#[cfg_attr(test, assert_instr(vcvtph2qq))]
15037#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15038pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15039    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15040}
15041
15042/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15043/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15044///
15045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15046#[inline]
15047#[target_feature(enable = "avx512fp16")]
15048#[cfg_attr(test, assert_instr(vcvtph2qq))]
15049#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15050pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15051    unsafe {
15052        transmute(vcvtph2qq_512(
15053            a,
15054            src.as_i64x8(),
15055            k,
15056            _MM_FROUND_CUR_DIRECTION,
15057        ))
15058    }
15059}
15060
15061/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15062/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15063///
15064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15065#[inline]
15066#[target_feature(enable = "avx512fp16")]
15067#[cfg_attr(test, assert_instr(vcvtph2qq))]
15068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15069pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15070    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15071}
15072
15073/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15074/// store the results in dst.
15075///
15076/// Rounding is done according to the rounding parameter, which can be one of:
15077///
15078/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15079/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15080/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15081/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15082/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15083///
15084/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
15085#[inline]
15086#[target_feature(enable = "avx512fp16")]
15087#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15088#[rustc_legacy_const_generics(1)]
15089#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15090pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15091    static_assert_rounding!(ROUNDING);
15092    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15093}
15094
15095/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15096/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15097///
15098/// Rounding is done according to the rounding parameter, which can be one of:
15099///
15100/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15101/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15102/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15103/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15104/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15105///
15106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15107#[inline]
15108#[target_feature(enable = "avx512fp16")]
15109#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15110#[rustc_legacy_const_generics(3)]
15111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15112pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15113    src: __m512i,
15114    k: __mmask8,
15115    a: __m128h,
15116) -> __m512i {
15117    unsafe {
15118        static_assert_rounding!(ROUNDING);
15119        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15120    }
15121}
15122
15123/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15124/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15125///
15126/// Rounding is done according to the rounding parameter, which can be one of:
15127///
15128/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15129/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15130/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15131/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15132/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15133///
15134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
15135#[inline]
15136#[target_feature(enable = "avx512fp16")]
15137#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15138#[rustc_legacy_const_generics(2)]
15139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15140pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15141    static_assert_rounding!(ROUNDING);
15142    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15143}
15144
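// Illustrative sketch, not upstream code: selecting an explicit rounding mode for
// the widening f16 -> i64 conversion; `cvt_roundph_epi64_example` is a made-up name.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cvt_roundph_epi64_example(a: __m128h) -> __m512i {
    // Round toward negative infinity and suppress exceptions, independent of MXCSR.
    _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a)
}
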
15145/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15146/// store the results in dst.
15147///
15148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
15149#[inline]
15150#[target_feature(enable = "avx512fp16,avx512vl")]
15151#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15153pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
15154    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
15155}
15156
15157/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15158/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15159///
15160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
15161#[inline]
15162#[target_feature(enable = "avx512fp16,avx512vl")]
15163#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15165pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15166    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
15167}
15168
15169/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15170/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15171///
15172/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
15173#[inline]
15174#[target_feature(enable = "avx512fp16,avx512vl")]
15175#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15177pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15178    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
15179}
15180
15181/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15182/// store the results in dst.
15183///
15184/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
15185#[inline]
15186#[target_feature(enable = "avx512fp16,avx512vl")]
15187#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15189pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
15190    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
15191}
15192
15193/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15194/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15195///
15196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
15197#[inline]
15198#[target_feature(enable = "avx512fp16,avx512vl")]
15199#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15201pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15202    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
15203}
15204
15205/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15206/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15207///
15208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15209#[inline]
15210#[target_feature(enable = "avx512fp16,avx512vl")]
15211#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15212#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15213pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15214    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15215}
15216
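// Illustrative sketch, not upstream code: widening four f16 lanes to four u64 lanes
// with the 256-bit form; `cvtph_epu64_example` is a hypothetical name.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn cvtph_epu64_example(a: __m128h) -> __m256i {
    // The four low f16 lanes of `a` become four unsigned 64-bit lanes, rounded
    // according to the current MXCSR rounding mode.
    _mm256_cvtph_epu64(a)
}
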
15217/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15218/// store the results in dst.
15219///
15220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15221#[inline]
15222#[target_feature(enable = "avx512fp16")]
15223#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15225pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15226    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15227}
15228
15229/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15230/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15231///
15232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15233#[inline]
15234#[target_feature(enable = "avx512fp16")]
15235#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15237pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15238    unsafe {
15239        transmute(vcvtph2uqq_512(
15240            a,
15241            src.as_u64x8(),
15242            k,
15243            _MM_FROUND_CUR_DIRECTION,
15244        ))
15245    }
15246}
15247
15248/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15249/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15250///
15251/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15252#[inline]
15253#[target_feature(enable = "avx512fp16")]
15254#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15256pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15257    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15258}
15259
15260/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15261/// store the results in dst.
15262///
15263/// Rounding is done according to the rounding parameter, which can be one of:
15264///
15265/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15266/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15267/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15268/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15269/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15270///
15271/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15272#[inline]
15273#[target_feature(enable = "avx512fp16")]
15274#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15275#[rustc_legacy_const_generics(1)]
15276#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15277pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15278    static_assert_rounding!(ROUNDING);
15279    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15280}
15281
15282/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15283/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15284///
15285/// Rounding is done according to the rounding parameter, which can be one of:
15286///
15287/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15288/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15289/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15290/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15291/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15292///
15293/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15294#[inline]
15295#[target_feature(enable = "avx512fp16")]
15296#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15297#[rustc_legacy_const_generics(3)]
15298#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15299pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15300    src: __m512i,
15301    k: __mmask8,
15302    a: __m128h,
15303) -> __m512i {
15304    unsafe {
15305        static_assert_rounding!(ROUNDING);
15306        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15307    }
15308}
15309
15310/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15311/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15312///
15313/// Rounding is done according to the rounding parameter, which can be one of:
15314///
15315/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15316/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15317/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15318/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15319/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15320///
15321/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15322#[inline]
15323#[target_feature(enable = "avx512fp16")]
15324#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15325#[rustc_legacy_const_generics(2)]
15326#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15327pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15328    static_assert_rounding!(ROUNDING);
15329    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15330}
15331
15332/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15333/// store the results in dst.
15334///
15335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
15336#[inline]
15337#[target_feature(enable = "avx512fp16,avx512vl")]
15338#[cfg_attr(test, assert_instr(vcvttph2qq))]
15339#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15340pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15341    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15342}
15343
15344/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15345/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15346///
15347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15348#[inline]
15349#[target_feature(enable = "avx512fp16,avx512vl")]
15350#[cfg_attr(test, assert_instr(vcvttph2qq))]
15351#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15352pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15353    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15354}
15355
15356/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15357/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15358///
15359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15360#[inline]
15361#[target_feature(enable = "avx512fp16,avx512vl")]
15362#[cfg_attr(test, assert_instr(vcvttph2qq))]
15363#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15364pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15365    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15366}
15367
15368/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15369/// store the results in dst.
15370///
15371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15372#[inline]
15373#[target_feature(enable = "avx512fp16,avx512vl")]
15374#[cfg_attr(test, assert_instr(vcvttph2qq))]
15375#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15376pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15377    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15378}
15379
15380/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15381/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15382///
15383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15384#[inline]
15385#[target_feature(enable = "avx512fp16,avx512vl")]
15386#[cfg_attr(test, assert_instr(vcvttph2qq))]
15387#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15388pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15389    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15390}
15391
15392/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15393/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15394///
15395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15396#[inline]
15397#[target_feature(enable = "avx512fp16,avx512vl")]
15398#[cfg_attr(test, assert_instr(vcvttph2qq))]
15399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15400pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15401    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15402}
15403
15404/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15405/// store the results in dst.
15406///
15407/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15408#[inline]
15409#[target_feature(enable = "avx512fp16")]
15410#[cfg_attr(test, assert_instr(vcvttph2qq))]
15411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15412pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15413    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15414}
15415
15416/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15417/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15418///
15419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15420#[inline]
15421#[target_feature(enable = "avx512fp16")]
15422#[cfg_attr(test, assert_instr(vcvttph2qq))]
15423#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15424pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15425    unsafe {
15426        transmute(vcvttph2qq_512(
15427            a,
15428            src.as_i64x8(),
15429            k,
15430            _MM_FROUND_CUR_DIRECTION,
15431        ))
15432    }
15433}
15434
15435/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15436/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15437///
15438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15439#[inline]
15440#[target_feature(enable = "avx512fp16")]
15441#[cfg_attr(test, assert_instr(vcvttph2qq))]
15442#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15443pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15444    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15445}
15446
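// Illustrative sketch, not upstream code: contrasting the truncating (`cvtt`) and
// MXCSR-rounded (`cvt`) forms on the same input; the helper name is made up.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cvtt_vs_cvt_epi64_example(a: __m128h) -> (__m512i, __m512i) {
    // For a lane holding 2.7, the truncating form yields 2 while the rounding form
    // yields 3 under the default round-to-nearest-even mode.
    (_mm512_cvttph_epi64(a), _mm512_cvtph_epi64(a))
}
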
15447/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15448/// store the results in dst.
15449///
15450/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15451///
15452/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15453#[inline]
15454#[target_feature(enable = "avx512fp16")]
15455#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15456#[rustc_legacy_const_generics(1)]
15457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15458pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15459    static_assert_sae!(SAE);
15460    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15461}
15462
15463/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15464/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15465///
15466/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15467///
15468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15469#[inline]
15470#[target_feature(enable = "avx512fp16")]
15471#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15472#[rustc_legacy_const_generics(3)]
15473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15474pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15475    src: __m512i,
15476    k: __mmask8,
15477    a: __m128h,
15478) -> __m512i {
15479    unsafe {
15480        static_assert_sae!(SAE);
15481        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15482    }
15483}
15484
15485/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15486/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15487///
15488/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15489///
15490/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15491#[inline]
15492#[target_feature(enable = "avx512fp16")]
15493#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15494#[rustc_legacy_const_generics(2)]
15495#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15496pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15497    static_assert_sae!(SAE);
15498    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15499}
15500
15501/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15502/// store the results in dst.
15503///
15504/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15505#[inline]
15506#[target_feature(enable = "avx512fp16,avx512vl")]
15507#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15508#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15509pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15510    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15511}
15512
15513/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15514/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15515///
15516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15517#[inline]
15518#[target_feature(enable = "avx512fp16,avx512vl")]
15519#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15521pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15522    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15523}
15524
15525/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15526/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15527///
15528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15529#[inline]
15530#[target_feature(enable = "avx512fp16,avx512vl")]
15531#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15532#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15533pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15534    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15535}
15536
15537/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15538/// store the results in dst.
15539///
15540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15541#[inline]
15542#[target_feature(enable = "avx512fp16,avx512vl")]
15543#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15545pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15546    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15547}
15548
15549/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15550/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15551///
15552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15553#[inline]
15554#[target_feature(enable = "avx512fp16,avx512vl")]
15555#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15556#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15557pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15558    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15559}
15560
15561/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15562/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15563///
15564/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15565#[inline]
15566#[target_feature(enable = "avx512fp16,avx512vl")]
15567#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15568#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15569pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15570    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15571}
15572
15573/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15574/// store the results in dst.
15575///
15576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15577#[inline]
15578#[target_feature(enable = "avx512fp16")]
15579#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15580#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15581pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15582    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15583}
15584
15585/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15586/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15587///
15588/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15589#[inline]
15590#[target_feature(enable = "avx512fp16")]
15591#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15593pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15594    unsafe {
15595        transmute(vcvttph2uqq_512(
15596            a,
15597            src.as_u64x8(),
15598            k,
15599            _MM_FROUND_CUR_DIRECTION,
15600        ))
15601    }
15602}
15603
15604/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15605/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15606///
15607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15608#[inline]
15609#[target_feature(enable = "avx512fp16")]
15610#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15612pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15613    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15614}
15615
15616/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15617/// store the results in dst.
15618///
15619/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15620///
15621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15622#[inline]
15623#[target_feature(enable = "avx512fp16")]
15624#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15625#[rustc_legacy_const_generics(1)]
15626#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15627pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15628    static_assert_sae!(SAE);
15629    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15630}
15631
15632/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15633/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15634///
15635/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15636///
15637/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15638#[inline]
15639#[target_feature(enable = "avx512fp16")]
15640#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15641#[rustc_legacy_const_generics(3)]
15642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15643pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15644    src: __m512i,
15645    k: __mmask8,
15646    a: __m128h,
15647) -> __m512i {
15648    unsafe {
15649        static_assert_sae!(SAE);
15650        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15651    }
15652}
15653
15654/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15655/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15656///
15657/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15658///
15659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15660#[inline]
15661#[target_feature(enable = "avx512fp16")]
15662#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15663#[rustc_legacy_const_generics(2)]
15664#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15665pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15666    static_assert_sae!(SAE);
15667    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15668}
15669
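// Illustrative sketch, not upstream code: truncating f16 lanes to u64 with exception
// reporting suppressed; `cvtt_roundph_epu64_example` is a hypothetical name.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cvtt_roundph_epu64_example(a: __m128h) -> __m512i {
    // SAE only controls exception reporting; the truncating form ignores MXCSR.RC.
    _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a)
}
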
15670/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15671/// floating-point elements, and store the results in dst.
15672///
15673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
15674#[inline]
15675#[target_feature(enable = "avx512fp16,avx512vl")]
15676#[cfg_attr(test, assert_instr(vcvtph2psx))]
15677#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15678pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15679    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15680}
15681
15682/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15683/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15684/// dst when the corresponding mask bit is not set).
15685///
15686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15687#[inline]
15688#[target_feature(enable = "avx512fp16,avx512vl")]
15689#[cfg_attr(test, assert_instr(vcvtph2psx))]
15690#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15691pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15692    unsafe { vcvtph2psx_128(a, src, k) }
15693}
15694
15695/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15696/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15697/// corresponding mask bit is not set).
15698///
15699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15700#[inline]
15701#[target_feature(enable = "avx512fp16,avx512vl")]
15702#[cfg_attr(test, assert_instr(vcvtph2psx))]
15703#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15704pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15705    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15706}
15707
15708/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15709/// floating-point elements, and store the results in dst.
15710///
15711/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15712#[inline]
15713#[target_feature(enable = "avx512fp16,avx512vl")]
15714#[cfg_attr(test, assert_instr(vcvtph2psx))]
15715#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15716pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15717    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15718}
15719
15720/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15721/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15722/// dst when the corresponding mask bit is not set).
15723///
15724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15725#[inline]
15726#[target_feature(enable = "avx512fp16,avx512vl")]
15727#[cfg_attr(test, assert_instr(vcvtph2psx))]
15728#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15729pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15730    unsafe { vcvtph2psx_256(a, src, k) }
15731}
15732
15733/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15734/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15735/// corresponding mask bit is not set).
15736///
15737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15738#[inline]
15739#[target_feature(enable = "avx512fp16,avx512vl")]
15740#[cfg_attr(test, assert_instr(vcvtph2psx))]
15741#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15742pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15743    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15744}
15745
15746/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15747/// floating-point elements, and store the results in dst.
15748///
15749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15750#[inline]
15751#[target_feature(enable = "avx512fp16")]
15752#[cfg_attr(test, assert_instr(vcvtph2psx))]
15753#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15754pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15755    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15756}
15757
15758/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15759/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15760/// dst when the corresponding mask bit is not set).
15761///
15762/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15763#[inline]
15764#[target_feature(enable = "avx512fp16")]
15765#[cfg_attr(test, assert_instr(vcvtph2psx))]
15766#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15767pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15768    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15769}
15770
15771/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15772/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15773/// corresponding mask bit is not set).
15774///
15775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15776#[inline]
15777#[target_feature(enable = "avx512fp16")]
15778#[cfg_attr(test, assert_instr(vcvtph2psx))]
15779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15780pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15781    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15782}
15783
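// Illustrative sketch, not upstream code: widening f16 lanes to f32 under a merge
// mask; `cvtxph_ps_merge_example` is a made-up name.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn cvtxph_ps_merge_example(src: __m128, a: __m128h) -> __m128 {
    // Every f16 value is exactly representable as f32, so the conversion is exact;
    // lanes whose mask bit is clear are copied from `src`.
    _mm_mask_cvtxph_ps(src, 0b0011, a)
}
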
15784/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15785/// floating-point elements, and store the results in dst.
15786///
15787/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15788///
15789/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
15790#[inline]
15791#[target_feature(enable = "avx512fp16")]
15792#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15793#[rustc_legacy_const_generics(1)]
15794#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15795pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15796    static_assert_sae!(SAE);
15797    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
15798}
15799
15800/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15801/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15802/// dst when the corresponding mask bit is not set).
15803///
15804/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15805///
15806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
15807#[inline]
15808#[target_feature(enable = "avx512fp16")]
15809#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15810#[rustc_legacy_const_generics(3)]
15811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15812pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
15813    src: __m512,
15814    k: __mmask16,
15815    a: __m256h,
15816) -> __m512 {
15817    unsafe {
15818        static_assert_sae!(SAE);
15819        vcvtph2psx_512(a, src, k, SAE)
15820    }
15821}
15822
15823/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15824/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15825/// corresponding mask bit is not set).
15826///
15827/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15828///
15829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
15830#[inline]
15831#[target_feature(enable = "avx512fp16")]
15832#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15833#[rustc_legacy_const_generics(2)]
15834#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15835pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
15836    static_assert_sae!(SAE);
15837    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
15838}
15839
15840/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15841/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
15842/// elements from a to the upper elements of dst.
15843///
15844/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
15845#[inline]
15846#[target_feature(enable = "avx512fp16")]
15847#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15848#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15849pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
15850    _mm_mask_cvtsh_ss(a, 0xff, a, b)
15851}
15852
15853/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15854/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15855/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15856/// upper elements of dst.
15857///
15858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
15859#[inline]
15860#[target_feature(enable = "avx512fp16")]
15861#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15863pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15864    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
15865}
15866
15867/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15868/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15869/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15870/// of dst.
15871///
15872/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
15873#[inline]
15874#[target_feature(enable = "avx512fp16")]
15875#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15876#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15877pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15878    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
15879}
15880
15881/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15882/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
15883/// from a to the upper elements of dst.
15884///
15885/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15886///
15887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
15888#[inline]
15889#[target_feature(enable = "avx512fp16")]
15890#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15891#[rustc_legacy_const_generics(2)]
15892#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15893pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
15894    static_assert_sae!(SAE);
15895    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
15896}
15897
15898/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15899/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15900/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15901/// upper elements of dst.
15902///
15903/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15904///
15905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
15906#[inline]
15907#[target_feature(enable = "avx512fp16")]
15908#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15909#[rustc_legacy_const_generics(4)]
15910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15911pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
15912    src: __m128,
15913    k: __mmask8,
15914    a: __m128,
15915    b: __m128h,
15916) -> __m128 {
15917    unsafe {
15918        static_assert_sae!(SAE);
15919        vcvtsh2ss(a, b, src, k, SAE)
15920    }
15921}
15922
15923/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15924/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15925/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15926/// of dst.
15927///
15928/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15929///
15930/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
15931#[inline]
15932#[target_feature(enable = "avx512fp16")]
15933#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15934#[rustc_legacy_const_generics(3)]
15935#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15936pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15937    static_assert_sae!(SAE);
15938    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
15939}
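
// Editor's illustrative sketch (not in the upstream source): the rounding
// variants take the SAE value as a const generic, e.g. to suppress exceptions:
//
//     let a = _mm_setr_ps(10.0, 2.0, 3.0, 4.0);
//     let b = _mm_set_sh(1.5);
//     let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b); // [1.5, 2.0, 3.0, 4.0]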
15940
15941/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15942/// floating-point elements, and store the results in dst.
15943///
15944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
15945#[inline]
15946#[target_feature(enable = "avx512fp16,avx512vl")]
15947#[cfg_attr(test, assert_instr(vcvtph2pd))]
15948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15949pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
15950    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
15951}
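
// Editor's illustrative sketch (not in the upstream source): only the two
// lowest f16 lanes of `a` fit into the two f64 lanes of the 128-bit result.
//
//     let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
//     let r = _mm_cvtph_pd(a); // lanes: [1.0, 2.0]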
15952
15953/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15954/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15955/// dst when the corresponding mask bit is not set).
15956///
15957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
15958#[inline]
15959#[target_feature(enable = "avx512fp16,avx512vl")]
15960#[cfg_attr(test, assert_instr(vcvtph2pd))]
15961#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15962pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
15963    unsafe { vcvtph2pd_128(a, src, k) }
15964}
15965
15966/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15967/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15968/// corresponding mask bit is not set).
15969///
15970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
15971#[inline]
15972#[target_feature(enable = "avx512fp16,avx512vl")]
15973#[cfg_attr(test, assert_instr(vcvtph2pd))]
15974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15975pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
15976    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
15977}
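
// Editor's illustrative sketch (not in the upstream source) of the write- and
// zero-masked variants above; only mask bits 0 and 1 are relevant for the two
// f64 lanes.
//
//     let src = _mm_set1_pd(-1.0);
//     let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
//     let m = _mm_mask_cvtph_pd(src, 0b01, a);  // lanes: [1.0, -1.0]
//     let z = _mm_maskz_cvtph_pd(0b01, a);      // lanes: [1.0,  0.0]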
15978
15979/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15980/// floating-point elements, and store the results in dst.
15981///
15982/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
15983#[inline]
15984#[target_feature(enable = "avx512fp16,avx512vl")]
15985#[cfg_attr(test, assert_instr(vcvtph2pd))]
15986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15987pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
15988    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
15989}
15990
15991/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15992/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15993/// dst when the corresponding mask bit is not set).
15994///
15995/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
15996#[inline]
15997#[target_feature(enable = "avx512fp16,avx512vl")]
15998#[cfg_attr(test, assert_instr(vcvtph2pd))]
15999#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16000pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
16001    unsafe { vcvtph2pd_256(a, src, k) }
16002}
16003
16004/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16005/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16006/// corresponding mask bit is not set).
16007///
16008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16009#[inline]
16010#[target_feature(enable = "avx512fp16,avx512vl")]
16011#[cfg_attr(test, assert_instr(vcvtph2pd))]
16012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16013pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16014    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16015}
16016
16017/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16018/// floating-point elements, and store the results in dst.
16019///
16020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
16021#[inline]
16022#[target_feature(enable = "avx512fp16")]
16023#[cfg_attr(test, assert_instr(vcvtph2pd))]
16024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16025pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16026    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16027}
16028
16029/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16030/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16031/// dst when the corresponding mask bit is not set).
16032///
16033/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16034#[inline]
16035#[target_feature(enable = "avx512fp16")]
16036#[cfg_attr(test, assert_instr(vcvtph2pd))]
16037#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16038pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16039    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16040}
16041
16042/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16043/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16044/// corresponding mask bit is not set).
16045///
16046/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16047#[inline]
16048#[target_feature(enable = "avx512fp16")]
16049#[cfg_attr(test, assert_instr(vcvtph2pd))]
16050#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16051pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16052    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16053}
16054
16055/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16056/// floating-point elements, and store the results in dst.
16057///
16058/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16059///
16060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
16061#[inline]
16062#[target_feature(enable = "avx512fp16")]
16063#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16064#[rustc_legacy_const_generics(1)]
16065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16066pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16067    static_assert_sae!(SAE);
16068    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16069}
16070
16071/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16072/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16073/// dst when the corresponding mask bit is not set).
16074///
16075/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16076///
16077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16078#[inline]
16079#[target_feature(enable = "avx512fp16")]
16080#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16081#[rustc_legacy_const_generics(3)]
16082#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16083pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16084    src: __m512d,
16085    k: __mmask8,
16086    a: __m128h,
16087) -> __m512d {
16088    unsafe {
16089        static_assert_sae!(SAE);
16090        vcvtph2pd_512(a, src, k, SAE)
16091    }
16092}
16093
16094/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16095/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16096/// corresponding mask bit is not set).
16097///
16098/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16099///
16100/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16101#[inline]
16102#[target_feature(enable = "avx512fp16")]
16103#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16104#[rustc_legacy_const_generics(2)]
16105#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16106pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16107    static_assert_sae!(SAE);
16108    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16109}
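
// Editor's illustrative sketch (not in the upstream source): all eight f16
// lanes of the 128-bit input widen into the eight f64 lanes of the result.
//
//     let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
//     let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a); // [1.0, 2.0, ..., 8.0]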
16110
16111/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16112/// floating-point element, store the result in the lower element of dst, and copy the upper element
16113/// from a to the upper element of dst.
16114///
16115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
16116#[inline]
16117#[target_feature(enable = "avx512fp16")]
16118#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16120pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16121    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16122}
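
// Editor's illustrative sketch (not in the upstream source): the lower f16 lane
// of `b` is widened into the lower f64 lane, the upper lane comes from `a`.
//
//     let a = _mm_setr_pd(10.0, 20.0);
//     let b = _mm_set_sh(1.5);
//     let r = _mm_cvtsh_sd(a, b); // lanes: [1.5, 20.0]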
16123
16124/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16125/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16126/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16127/// of dst.
16128///
16129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16130#[inline]
16131#[target_feature(enable = "avx512fp16")]
16132#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16133#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16134pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16135    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16136}
16137
16138/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16139/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16140/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16141///
16142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16143#[inline]
16144#[target_feature(enable = "avx512fp16")]
16145#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16147pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16148    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
16149}
16150
16151/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16152/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16153/// to the upper element of dst.
16154///
16155/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16156///
16157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16158#[inline]
16159#[target_feature(enable = "avx512fp16")]
16160#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16161#[rustc_legacy_const_generics(2)]
16162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16163pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16164    static_assert_sae!(SAE);
16165    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16166}
16167
16168/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16169/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16170/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16171/// of dst.
16172///
16173/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16174///
16175/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16176#[inline]
16177#[target_feature(enable = "avx512fp16")]
16178#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16179#[rustc_legacy_const_generics(4)]
16180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16181pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16182    src: __m128d,
16183    k: __mmask8,
16184    a: __m128d,
16185    b: __m128h,
16186) -> __m128d {
16187    unsafe {
16188        static_assert_sae!(SAE);
16189        vcvtsh2sd(a, b, src, k, SAE)
16190    }
16191}
16192
16193/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16194/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16195/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16196///
16197/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16198///
16199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16200#[inline]
16201#[target_feature(enable = "avx512fp16")]
16202#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16203#[rustc_legacy_const_generics(3)]
16204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16205pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16206    static_assert_sae!(SAE);
16207    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
16208}
16209
16210/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16211///
16212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
16213#[inline]
16214#[target_feature(enable = "avx512fp16")]
16215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16216pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
16217    unsafe { simd_extract!(a, 0) }
16218}
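
// Editor's illustrative sketch (not in the upstream source): extracting the
// lowest f16 lane as a scalar.
//
//     let a = _mm_setr_ph(3.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
//     let x: f16 = _mm_cvtsh_h(a); // 3.5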
16219
16220/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16221///
16222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
16223#[inline]
16224#[target_feature(enable = "avx512fp16")]
16225#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16226pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
16227    unsafe { simd_extract!(a, 0) }
16228}
16229
16230/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16231///
16232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
16233#[inline]
16234#[target_feature(enable = "avx512fp16")]
16235#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16236pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
16237    unsafe { simd_extract!(a, 0) }
16238}
16239
16240/// Copy the lower 16-bit integer in a to dst.
16241///
16242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
16243#[inline]
16244#[target_feature(enable = "avx512fp16")]
16245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16246pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
16247    unsafe { simd_extract!(a.as_i16x8(), 0) }
16248}
16249
16250/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
16251///
16252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
16253#[inline]
16254#[target_feature(enable = "avx512fp16")]
16255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16256pub fn _mm_cvtsi16_si128(a: i16) -> __m128i {
16257    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
16258}
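
// Editor's illustrative sketch (not in the upstream source): round-tripping a
// scalar through the two 16-bit integer copy helpers above.
//
//     let v = _mm_cvtsi16_si128(7);   // i16 lanes: [7, 0, 0, 0, 0, 0, 0, 0]
//     let x = _mm_cvtsi128_si16(v);   // 7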
16259
16260#[allow(improper_ctypes)]
16261unsafe extern "C" {
16262    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16263    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16264    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16265    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16266
16267    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16268    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16269    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16270    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16271    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16272    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16273    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16274    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16275
16276    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16277    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16278    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16279    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16280    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16281    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16282    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16283    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16284
16285    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16286    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16287    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16288    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16289    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16290    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16291    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16292    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16293
16294    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16295    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16296    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16297    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16298    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16299    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16300    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16301    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16302
16303    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16304    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16305    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16306    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16307    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16308    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16309    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16310    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16311    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16312    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16313    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16314    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16315    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16316    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16317    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16318    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16319
16320    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16321    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16322    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16323    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16324    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16325    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16326    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16327    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16328    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16329    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16330    -> __m512;
16331    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16332    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16333    -> __m512;
16334    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16335    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16336    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16337    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16338
16339    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16340    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16341    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16342    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16343
16344    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
16345    fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
16346    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
16347    fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
16348    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16349    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16350
16351    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16352    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16353    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16354    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16355    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16356    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16357    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16358    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16359
16360    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16361    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16362    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16363    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16364    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16365    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16366    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16367    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16368
16369    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16370    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16371    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16372    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16373
16374    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16375    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16376    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16377    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16378    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16379    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16380    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16381    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16382
16383    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16384    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16385    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16386    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16387    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16388    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16389    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16390    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16391
16392    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16393    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16394    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16395    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16396    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16397    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16398    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16399    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16400
16401    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16402    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16403    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16404    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16405    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16406    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16407    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16408    fn vgetmantsh(
16409        a: __m128h,
16410        b: __m128h,
16411        imm8: i32,
16412        src: __m128h,
16413        k: __mmask8,
16414        sae: i32,
16415    ) -> __m128h;
16416
16417    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16418    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16419    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16420    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16421    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16422    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16423    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16424    fn vrndscalesh(
16425        a: __m128h,
16426        b: __m128h,
16427        src: __m128h,
16428        k: __mmask8,
16429        imm8: i32,
16430        sae: i32,
16431    ) -> __m128h;
16432
16433    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16434    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16435    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16436    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16437    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16438    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16439    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16440    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16441
16442    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16443    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16444    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16445    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16446    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16447    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16448    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16449    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16450    -> __m128h;
16451
16452    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16453    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16454
16455    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16456    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16457    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16458    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16459    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16460    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16461    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"]
16462    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16463    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"]
16464    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16465    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"]
16466    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16467
16468    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16469    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16470    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16471    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16472    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16473    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16474    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16475    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16476    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16477    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16478    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"]
16479    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16480    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"]
16481    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16482    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16483    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16484
16485    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16486    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16487    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16488    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16489    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16490    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16491    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16492    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16493    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16494    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16495    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"]
16496    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16497
16498    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16499    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16500    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16501    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16502    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16503    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16504    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16505    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16506
16507    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16508    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16509    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16510    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16511    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16512    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16513    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16514    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16515
16516    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16517    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16518    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16519    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16520    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16521    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16522    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16523    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16524    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16525    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16526    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16527    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16528
16529    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16530    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16531    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16532    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16533    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16534    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16535    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16536    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16537    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16538    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16539    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16540    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16541
16542    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16543    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16544    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16545    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16546    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16547    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16548    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16549    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16550    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16551    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16552    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16553    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16554    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16555    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16556    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16557    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16558
16559    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16560    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16561    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16562    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16563    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16564    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16565    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16566    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16567    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16568    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16569    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16570    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16571    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16572    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16573    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16574    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16575
16576    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16577    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16578    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16579    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16580    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16581    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16582    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16583    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16584    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16585    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16586    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16587    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16588
16589    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16590    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16591    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16592    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16593    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16594    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16595    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16596    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16597    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16598    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16599    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16600    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16601
16602    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16603    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16604    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16605    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16606    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16607    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16608    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16609    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16610
16611    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16612    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16613    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16614    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16615    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16616    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16617    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16618    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16619
16620}
16621
16622#[cfg(test)]
16623mod tests {
16624    use crate::core_arch::x86::*;
16625    use crate::mem::transmute;
16626    use crate::ptr::{addr_of, addr_of_mut};
16627    use stdarch_test::simd_test;
16628
16629    #[target_feature(enable = "avx512fp16")]
16630    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16631        _mm_setr_ph(re, im, re, im, re, im, re, im)
16632    }
16633
16634    #[target_feature(enable = "avx512fp16")]
16635    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16636        _mm256_setr_ph(
16637            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16638        )
16639    }
16640
16641    #[target_feature(enable = "avx512fp16")]
16642    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16643        _mm512_setr_ph(
16644            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16645            re, im, re, im, re, im, re, im, re, im,
16646        )
16647    }
16648
16649    #[simd_test(enable = "avx512fp16,avx512vl")]
16650    unsafe fn test_mm_set_ph() {
16651        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16652        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16653        assert_eq_m128h(r, e);
16654    }
16655
16656    #[simd_test(enable = "avx512fp16,avx512vl")]
16657    unsafe fn test_mm256_set_ph() {
16658        let r = _mm256_set_ph(
16659            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16660        );
16661        let e = _mm256_setr_ph(
16662            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16663        );
16664        assert_eq_m256h(r, e);
16665    }
16666
16667    #[simd_test(enable = "avx512fp16")]
16668    unsafe fn test_mm512_set_ph() {
16669        let r = _mm512_set_ph(
16670            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16671            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16672            31.0, 32.0,
16673        );
16674        let e = _mm512_setr_ph(
16675            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16676            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16677            3.0, 2.0, 1.0,
16678        );
16679        assert_eq_m512h(r, e);
16680    }
16681
16682    #[simd_test(enable = "avx512fp16,avx512vl")]
16683    unsafe fn test_mm_set_sh() {
16684        let r = _mm_set_sh(1.0);
16685        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16686        assert_eq_m128h(r, e);
16687    }
16688
16689    #[simd_test(enable = "avx512fp16,avx512vl")]
16690    unsafe fn test_mm_set1_ph() {
16691        let r = _mm_set1_ph(1.0);
16692        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16693        assert_eq_m128h(r, e);
16694    }
16695
16696    #[simd_test(enable = "avx512fp16,avx512vl")]
16697    unsafe fn test_mm256_set1_ph() {
16698        let r = _mm256_set1_ph(1.0);
16699        let e = _mm256_set_ph(
16700            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16701        );
16702        assert_eq_m256h(r, e);
16703    }
16704
16705    #[simd_test(enable = "avx512fp16")]
16706    unsafe fn test_mm512_set1_ph() {
16707        let r = _mm512_set1_ph(1.0);
16708        let e = _mm512_set_ph(
16709            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16710            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16711        );
16712        assert_eq_m512h(r, e);
16713    }
16714
16715    #[simd_test(enable = "avx512fp16,avx512vl")]
16716    unsafe fn test_mm_setr_ph() {
16717        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16718        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16719        assert_eq_m128h(r, e);
16720    }
16721
16722    #[simd_test(enable = "avx512fp16,avx512vl")]
16723    unsafe fn test_mm256_setr_ph() {
16724        let r = _mm256_setr_ph(
16725            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16726        );
16727        let e = _mm256_set_ph(
16728            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16729        );
16730        assert_eq_m256h(r, e);
16731    }
16732
16733    #[simd_test(enable = "avx512fp16")]
16734    unsafe fn test_mm512_setr_ph() {
16735        let r = _mm512_setr_ph(
16736            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16737            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16738            31.0, 32.0,
16739        );
16740        let e = _mm512_set_ph(
16741            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16742            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16743            3.0, 2.0, 1.0,
16744        );
16745        assert_eq_m512h(r, e);
16746    }
16747
16748    #[simd_test(enable = "avx512fp16,avx512vl")]
16749    unsafe fn test_mm_setzero_ph() {
16750        let r = _mm_setzero_ph();
16751        let e = _mm_set1_ph(0.0);
16752        assert_eq_m128h(r, e);
16753    }
16754
16755    #[simd_test(enable = "avx512fp16,avx512vl")]
16756    unsafe fn test_mm256_setzero_ph() {
16757        let r = _mm256_setzero_ph();
16758        let e = _mm256_set1_ph(0.0);
16759        assert_eq_m256h(r, e);
16760    }
16761
16762    #[simd_test(enable = "avx512fp16")]
16763    unsafe fn test_mm512_setzero_ph() {
16764        let r = _mm512_setzero_ph();
16765        let e = _mm512_set1_ph(0.0);
16766        assert_eq_m512h(r, e);
16767    }
16768
16769    #[simd_test(enable = "avx512fp16,avx512vl")]
16770    unsafe fn test_mm_castsi128_ph() {
16771        let a = _mm_set1_epi16(0x3c00);
16772        let r = _mm_castsi128_ph(a);
16773        let e = _mm_set1_ph(1.0);
16774        assert_eq_m128h(r, e);
16775    }
16776
16777    #[simd_test(enable = "avx512fp16,avx512vl")]
16778    unsafe fn test_mm256_castsi256_ph() {
16779        let a = _mm256_set1_epi16(0x3c00);
16780        let r = _mm256_castsi256_ph(a);
16781        let e = _mm256_set1_ph(1.0);
16782        assert_eq_m256h(r, e);
16783    }
16784
16785    #[simd_test(enable = "avx512fp16")]
16786    unsafe fn test_mm512_castsi512_ph() {
16787        let a = _mm512_set1_epi16(0x3c00);
16788        let r = _mm512_castsi512_ph(a);
16789        let e = _mm512_set1_ph(1.0);
16790        assert_eq_m512h(r, e);
16791    }
16792
16793    #[simd_test(enable = "avx512fp16")]
16794    unsafe fn test_mm_castph_si128() {
16795        let a = _mm_set1_ph(1.0);
16796        let r = _mm_castph_si128(a);
16797        let e = _mm_set1_epi16(0x3c00);
16798        assert_eq_m128i(r, e);
16799    }
16800
16801    #[simd_test(enable = "avx512fp16")]
16802    unsafe fn test_mm256_castph_si256() {
16803        let a = _mm256_set1_ph(1.0);
16804        let r = _mm256_castph_si256(a);
16805        let e = _mm256_set1_epi16(0x3c00);
16806        assert_eq_m256i(r, e);
16807    }
16808
16809    #[simd_test(enable = "avx512fp16")]
16810    unsafe fn test_mm512_castph_si512() {
16811        let a = _mm512_set1_ph(1.0);
16812        let r = _mm512_castph_si512(a);
16813        let e = _mm512_set1_epi16(0x3c00);
16814        assert_eq_m512i(r, e);
16815    }
16816
16817    #[simd_test(enable = "avx512fp16,avx512vl")]
16818    unsafe fn test_mm_castps_ph() {
16819        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16820        let r = _mm_castps_ph(a);
16821        let e = _mm_set1_ph(1.0);
16822        assert_eq_m128h(r, e);
16823    }
16824
16825    #[simd_test(enable = "avx512fp16,avx512vl")]
16826    unsafe fn test_mm256_castps_ph() {
16827        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16828        let r = _mm256_castps_ph(a);
16829        let e = _mm256_set1_ph(1.0);
16830        assert_eq_m256h(r, e);
16831    }
16832
16833    #[simd_test(enable = "avx512fp16")]
16834    unsafe fn test_mm512_castps_ph() {
16835        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16836        let r = _mm512_castps_ph(a);
16837        let e = _mm512_set1_ph(1.0);
16838        assert_eq_m512h(r, e);
16839    }
16840
16841    #[simd_test(enable = "avx512fp16")]
16842    unsafe fn test_mm_castph_ps() {
16843        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16844        let r = _mm_castph_ps(a);
16845        let e = _mm_set1_ps(1.0);
16846        assert_eq_m128(r, e);
16847    }
16848
16849    #[simd_test(enable = "avx512fp16")]
16850    unsafe fn test_mm256_castph_ps() {
16851        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16852        let r = _mm256_castph_ps(a);
16853        let e = _mm256_set1_ps(1.0);
16854        assert_eq_m256(r, e);
16855    }
16856
16857    #[simd_test(enable = "avx512fp16")]
16858    unsafe fn test_mm512_castph_ps() {
16859        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16860        let r = _mm512_castph_ps(a);
16861        let e = _mm512_set1_ps(1.0);
16862        assert_eq_m512(r, e);
16863    }
16864
16865    #[simd_test(enable = "avx512fp16,avx512vl")]
16866    unsafe fn test_mm_castpd_ph() {
16867        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16868        let r = _mm_castpd_ph(a);
16869        let e = _mm_set1_ph(1.0);
16870        assert_eq_m128h(r, e);
16871    }
16872
16873    #[simd_test(enable = "avx512fp16,avx512vl")]
16874    unsafe fn test_mm256_castpd_ph() {
16875        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16876        let r = _mm256_castpd_ph(a);
16877        let e = _mm256_set1_ph(1.0);
16878        assert_eq_m256h(r, e);
16879    }
16880
16881    #[simd_test(enable = "avx512fp16")]
16882    unsafe fn test_mm512_castpd_ph() {
16883        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16884        let r = _mm512_castpd_ph(a);
16885        let e = _mm512_set1_ph(1.0);
16886        assert_eq_m512h(r, e);
16887    }
16888
16889    #[simd_test(enable = "avx512fp16")]
16890    unsafe fn test_mm_castph_pd() {
16891        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16892        let r = _mm_castph_pd(a);
16893        let e = _mm_set1_pd(1.0);
16894        assert_eq_m128d(r, e);
16895    }
16896
16897    #[simd_test(enable = "avx512fp16")]
16898    unsafe fn test_mm256_castph_pd() {
16899        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16900        let r = _mm256_castph_pd(a);
16901        let e = _mm256_set1_pd(1.0);
16902        assert_eq_m256d(r, e);
16903    }
16904
16905    #[simd_test(enable = "avx512fp16")]
16906    unsafe fn test_mm512_castph_pd() {
16907        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16908        let r = _mm512_castph_pd(a);
16909        let e = _mm512_set1_pd(1.0);
16910        assert_eq_m512d(r, e);
16911    }
16912
16913    #[simd_test(enable = "avx512fp16,avx512vl")]
16914    unsafe fn test_mm256_castph256_ph128() {
16915        let a = _mm256_setr_ph(
16916            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16917        );
16918        let r = _mm256_castph256_ph128(a);
16919        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16920        assert_eq_m128h(r, e);
16921    }
16922
16923    #[simd_test(enable = "avx512fp16,avx512vl")]
16924    unsafe fn test_mm512_castph512_ph128() {
16925        let a = _mm512_setr_ph(
16926            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16927            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16928        );
16929        let r = _mm512_castph512_ph128(a);
16930        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16931        assert_eq_m128h(r, e);
16932    }
16933
16934    #[simd_test(enable = "avx512fp16,avx512vl")]
16935    unsafe fn test_mm512_castph512_ph256() {
16936        let a = _mm512_setr_ph(
16937            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16938            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16939        );
16940        let r = _mm512_castph512_ph256(a);
16941        let e = _mm256_setr_ph(
16942            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16943        );
16944        assert_eq_m256h(r, e);
16945    }
16946
16947    #[simd_test(enable = "avx512fp16,avx512vl")]
16948    unsafe fn test_mm256_castph128_ph256() {
16949        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16950        let r = _mm256_castph128_ph256(a);
16951        assert_eq_m128h(_mm256_castph256_ph128(r), a);
16952    }
16953
16954    #[simd_test(enable = "avx512fp16,avx512vl")]
16955    unsafe fn test_mm512_castph128_ph512() {
16956        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16957        let r = _mm512_castph128_ph512(a);
16958        assert_eq_m128h(_mm512_castph512_ph128(r), a);
16959    }
16960
16961    #[simd_test(enable = "avx512fp16,avx512vl")]
16962    unsafe fn test_mm512_castph256_ph512() {
16963        let a = _mm256_setr_ph(
16964            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16965        );
16966        let r = _mm512_castph256_ph512(a);
16967        assert_eq_m256h(_mm512_castph512_ph256(r), a);
16968    }
16969
16970    #[simd_test(enable = "avx512fp16,avx512vl")]
16971    unsafe fn test_mm256_zextph128_ph256() {
16972        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16973        let r = _mm256_zextph128_ph256(a);
16974        let e = _mm256_setr_ph(
16975            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
16976        );
16977        assert_eq_m256h(r, e);
16978    }
16979
16980    #[simd_test(enable = "avx512fp16")]
16981    unsafe fn test_mm512_zextph128_ph512() {
16982        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16983        let r = _mm512_zextph128_ph512(a);
16984        let e = _mm512_setr_ph(
16985            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16986            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16987        );
16988        assert_eq_m512h(r, e);
16989    }
16990
16991    #[simd_test(enable = "avx512fp16")]
16992    unsafe fn test_mm512_zextph256_ph512() {
16993        let a = _mm256_setr_ph(
16994            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16995        );
16996        let r = _mm512_zextph256_ph512(a);
16997        let e = _mm512_setr_ph(
16998            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
16999            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17000        );
17001        assert_eq_m512h(r, e);
17002    }
17003
17004    #[simd_test(enable = "avx512fp16,avx512vl")]
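    // Comparison-mask tests: `_mm_set_ph` lists elements from e7 down to e0, and the
    // result mask sets bit i when element i satisfies the predicate, so the four equal
    // pairs (the leading arguments) show up as the high four mask bits (0b11110000).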
17005    unsafe fn test_mm_cmp_ph_mask() {
17006        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17007        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17008        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17009        assert_eq!(r, 0b11110000);
17010    }
17011
17012    #[simd_test(enable = "avx512fp16,avx512vl")]
17013    unsafe fn test_mm_mask_cmp_ph_mask() {
17014        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17015        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17016        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17017        assert_eq!(r, 0b01010000);
17018    }
17019
17020    #[simd_test(enable = "avx512fp16,avx512vl")]
17021    unsafe fn test_mm256_cmp_ph_mask() {
17022        let a = _mm256_set_ph(
17023            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17024        );
17025        let b = _mm256_set_ph(
17026            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17027            -16.0,
17028        );
17029        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17030        assert_eq!(r, 0b1111000011110000);
17031    }
17032
17033    #[simd_test(enable = "avx512fp16,avx512vl")]
17034    unsafe fn test_mm256_mask_cmp_ph_mask() {
17035        let a = _mm256_set_ph(
17036            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17037        );
17038        let b = _mm256_set_ph(
17039            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17040            -16.0,
17041        );
17042        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17043        assert_eq!(r, 0b0101000001010000);
17044    }
17045
17046    #[simd_test(enable = "avx512fp16")]
17047    unsafe fn test_mm512_cmp_ph_mask() {
17048        let a = _mm512_set_ph(
17049            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17050            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17051            31.0, 32.0,
17052        );
17053        let b = _mm512_set_ph(
17054            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17055            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17056            -29.0, -30.0, -31.0, -32.0,
17057        );
17058        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17059        assert_eq!(r, 0b11110000111100001111000011110000);
17060    }
17061
17062    #[simd_test(enable = "avx512fp16")]
17063    unsafe fn test_mm512_mask_cmp_ph_mask() {
17064        let a = _mm512_set_ph(
17065            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17066            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17067            31.0, 32.0,
17068        );
17069        let b = _mm512_set_ph(
17070            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17071            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17072            -29.0, -30.0, -31.0, -32.0,
17073        );
17074        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
17075        assert_eq!(r, 0b01010000010100000101000001010000);
17076    }
17077
17078    #[simd_test(enable = "avx512fp16")]
17079    unsafe fn test_mm512_cmp_round_ph_mask() {
17080        let a = _mm512_set_ph(
17081            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17082            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17083            31.0, 32.0,
17084        );
17085        let b = _mm512_set_ph(
17086            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17087            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17088            -29.0, -30.0, -31.0, -32.0,
17089        );
17090        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17091        assert_eq!(r, 0b11110000111100001111000011110000);
17092    }
17093
17094    #[simd_test(enable = "avx512fp16")]
17095    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
17096        let a = _mm512_set_ph(
17097            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17098            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17099            31.0, 32.0,
17100        );
17101        let b = _mm512_set_ph(
17102            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17103            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17104            -29.0, -30.0, -31.0, -32.0,
17105        );
17106        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
17107            0b01010101010101010101010101010101,
17108            a,
17109            b,
17110        );
17111        assert_eq!(r, 0b01010000010100000101000001010000);
17112    }
17113
17114    #[simd_test(enable = "avx512fp16")]
17115    unsafe fn test_mm_cmp_round_sh_mask() {
17116        let a = _mm_set_sh(1.0);
17117        let b = _mm_set_sh(1.0);
17118        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17119        assert_eq!(r, 1);
17120    }
17121
17122    #[simd_test(enable = "avx512fp16")]
17123    unsafe fn test_mm_mask_cmp_round_sh_mask() {
17124        let a = _mm_set_sh(1.0);
17125        let b = _mm_set_sh(1.0);
17126        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17127        assert_eq!(r, 0);
17128    }
17129
17130    #[simd_test(enable = "avx512fp16")]
17131    unsafe fn test_mm_cmp_sh_mask() {
17132        let a = _mm_set_sh(1.0);
17133        let b = _mm_set_sh(1.0);
17134        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17135        assert_eq!(r, 1);
17136    }
17137
17138    #[simd_test(enable = "avx512fp16")]
17139    unsafe fn test_mm_mask_cmp_sh_mask() {
17140        let a = _mm_set_sh(1.0);
17141        let b = _mm_set_sh(1.0);
17142        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17143        assert_eq!(r, 0);
17144    }
17145
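    // The comi/ucomi intrinsics compare only the lowest half-precision element and
    // return the predicate result as an integer (1 if it holds, 0 otherwise).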
17146    #[simd_test(enable = "avx512fp16")]
17147    unsafe fn test_mm_comi_round_sh() {
17148        let a = _mm_set_sh(1.0);
17149        let b = _mm_set_sh(1.0);
17150        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17151        assert_eq!(r, 1);
17152    }
17153
17154    #[simd_test(enable = "avx512fp16")]
17155    unsafe fn test_mm_comi_sh() {
17156        let a = _mm_set_sh(1.0);
17157        let b = _mm_set_sh(1.0);
17158        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17159        assert_eq!(r, 1);
17160    }
17161
17162    #[simd_test(enable = "avx512fp16")]
17163    unsafe fn test_mm_comieq_sh() {
17164        let a = _mm_set_sh(1.0);
17165        let b = _mm_set_sh(1.0);
17166        let r = _mm_comieq_sh(a, b);
17167        assert_eq!(r, 1);
17168    }
17169
17170    #[simd_test(enable = "avx512fp16")]
17171    unsafe fn test_mm_comige_sh() {
17172        let a = _mm_set_sh(2.0);
17173        let b = _mm_set_sh(1.0);
17174        let r = _mm_comige_sh(a, b);
17175        assert_eq!(r, 1);
17176    }
17177
17178    #[simd_test(enable = "avx512fp16")]
17179    unsafe fn test_mm_comigt_sh() {
17180        let a = _mm_set_sh(2.0);
17181        let b = _mm_set_sh(1.0);
17182        let r = _mm_comigt_sh(a, b);
17183        assert_eq!(r, 1);
17184    }
17185
17186    #[simd_test(enable = "avx512fp16")]
17187    unsafe fn test_mm_comile_sh() {
17188        let a = _mm_set_sh(1.0);
17189        let b = _mm_set_sh(2.0);
17190        let r = _mm_comile_sh(a, b);
17191        assert_eq!(r, 1);
17192    }
17193
17194    #[simd_test(enable = "avx512fp16")]
17195    unsafe fn test_mm_comilt_sh() {
17196        let a = _mm_set_sh(1.0);
17197        let b = _mm_set_sh(2.0);
17198        let r = _mm_comilt_sh(a, b);
17199        assert_eq!(r, 1);
17200    }
17201
17202    #[simd_test(enable = "avx512fp16")]
17203    unsafe fn test_mm_comineq_sh() {
17204        let a = _mm_set_sh(1.0);
17205        let b = _mm_set_sh(2.0);
17206        let r = _mm_comineq_sh(a, b);
17207        assert_eq!(r, 1);
17208    }
17209
17210    #[simd_test(enable = "avx512fp16")]
17211    unsafe fn test_mm_ucomieq_sh() {
17212        let a = _mm_set_sh(1.0);
17213        let b = _mm_set_sh(1.0);
17214        let r = _mm_ucomieq_sh(a, b);
17215        assert_eq!(r, 1);
17216    }
17217
17218    #[simd_test(enable = "avx512fp16")]
17219    unsafe fn test_mm_ucomige_sh() {
17220        let a = _mm_set_sh(2.0);
17221        let b = _mm_set_sh(1.0);
17222        let r = _mm_ucomige_sh(a, b);
17223        assert_eq!(r, 1);
17224    }
17225
17226    #[simd_test(enable = "avx512fp16")]
17227    unsafe fn test_mm_ucomigt_sh() {
17228        let a = _mm_set_sh(2.0);
17229        let b = _mm_set_sh(1.0);
17230        let r = _mm_ucomigt_sh(a, b);
17231        assert_eq!(r, 1);
17232    }
17233
17234    #[simd_test(enable = "avx512fp16")]
17235    unsafe fn test_mm_ucomile_sh() {
17236        let a = _mm_set_sh(1.0);
17237        let b = _mm_set_sh(2.0);
17238        let r = _mm_ucomile_sh(a, b);
17239        assert_eq!(r, 1);
17240    }
17241
17242    #[simd_test(enable = "avx512fp16")]
17243    unsafe fn test_mm_ucomilt_sh() {
17244        let a = _mm_set_sh(1.0);
17245        let b = _mm_set_sh(2.0);
17246        let r = _mm_ucomilt_sh(a, b);
17247        assert_eq!(r, 1);
17248    }
17249
17250    #[simd_test(enable = "avx512fp16")]
17251    unsafe fn test_mm_ucomineq_sh() {
17252        let a = _mm_set_sh(1.0);
17253        let b = _mm_set_sh(2.0);
17254        let r = _mm_ucomineq_sh(a, b);
17255        assert_eq!(r, 1);
17256    }
17257
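    // Aligned loads (`_mm_load_ph` and the wider variants) take a suitably aligned
    // pointer, so these tests read back through the address of an existing vector;
    // the `loadu` tests further down read from plain arrays instead.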
17258    #[simd_test(enable = "avx512fp16,avx512vl")]
17259    unsafe fn test_mm_load_ph() {
17260        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17261        let b = _mm_load_ph(addr_of!(a).cast());
17262        assert_eq_m128h(a, b);
17263    }
17264
17265    #[simd_test(enable = "avx512fp16,avx512vl")]
17266    unsafe fn test_mm256_load_ph() {
17267        let a = _mm256_set_ph(
17268            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17269        );
17270        let b = _mm256_load_ph(addr_of!(a).cast());
17271        assert_eq_m256h(a, b);
17272    }
17273
17274    #[simd_test(enable = "avx512fp16")]
17275    unsafe fn test_mm512_load_ph() {
17276        let a = _mm512_set_ph(
17277            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17278            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17279            31.0, 32.0,
17280        );
17281        let b = _mm512_load_ph(addr_of!(a).cast());
17282        assert_eq_m512h(a, b);
17283    }
17284
17285    #[simd_test(enable = "avx512fp16")]
17286    unsafe fn test_mm_load_sh() {
17287        let a = _mm_set_sh(1.0);
17288        let b = _mm_load_sh(addr_of!(a).cast());
17289        assert_eq_m128h(a, b);
17290    }
17291
17292    #[simd_test(enable = "avx512fp16")]
17293    unsafe fn test_mm_mask_load_sh() {
17294        let a = _mm_set_sh(1.0);
17295        let src = _mm_set_sh(2.);
17296        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
17297        assert_eq_m128h(a, b);
17298        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
17299        assert_eq_m128h(src, b);
17300    }
17301
17302    #[simd_test(enable = "avx512fp16")]
17303    unsafe fn test_mm_maskz_load_sh() {
17304        let a = _mm_set_sh(1.0);
17305        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
17306        assert_eq_m128h(a, b);
17307        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
17308        assert_eq_m128h(_mm_setzero_ph(), b);
17309    }
17310
17311    #[simd_test(enable = "avx512fp16,avx512vl")]
17312    unsafe fn test_mm_loadu_ph() {
17313        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17314        let r = _mm_loadu_ph(array.as_ptr());
17315        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17316        assert_eq_m128h(r, e);
17317    }
17318
17319    #[simd_test(enable = "avx512fp16,avx512vl")]
17320    unsafe fn test_mm256_loadu_ph() {
17321        let array = [
17322            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17323        ];
17324        let r = _mm256_loadu_ph(array.as_ptr());
17325        let e = _mm256_setr_ph(
17326            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17327        );
17328        assert_eq_m256h(r, e);
17329    }
17330
17331    #[simd_test(enable = "avx512fp16")]
17332    unsafe fn test_mm512_loadu_ph() {
17333        let array = [
17334            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17335            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17336            31.0, 32.0,
17337        ];
17338        let r = _mm512_loadu_ph(array.as_ptr());
17339        let e = _mm512_setr_ph(
17340            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17341            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17342            31.0, 32.0,
17343        );
17344        assert_eq_m512h(r, e);
17345    }
17346
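    // `_mm_move_sh` copies the lowest element of `b` into `a`; the mask/maskz variants
    // substitute the lowest element of `src` or zero when the mask bit is clear.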
17347    #[simd_test(enable = "avx512fp16")]
17348    unsafe fn test_mm_move_sh() {
17349        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17350        let b = _mm_set_sh(9.0);
17351        let r = _mm_move_sh(a, b);
17352        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17353        assert_eq_m128h(r, e);
17354    }
17355
17356    #[simd_test(enable = "avx512fp16")]
17357    unsafe fn test_mm_mask_move_sh() {
17358        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17359        let b = _mm_set_sh(9.0);
17360        let src = _mm_set_sh(10.0);
17361        let r = _mm_mask_move_sh(src, 0, a, b);
17362        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17363        assert_eq_m128h(r, e);
17364    }
17365
17366    #[simd_test(enable = "avx512fp16")]
17367    unsafe fn test_mm_maskz_move_sh() {
17368        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17369        let b = _mm_set_sh(9.0);
17370        let r = _mm_maskz_move_sh(0, a, b);
17371        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17372        assert_eq_m128h(r, e);
17373    }
17374
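    // Store tests mirror the loads: aligned stores write through the address of a
    // vector, while the `storeu` tests below write into plain arrays.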
17375    #[simd_test(enable = "avx512fp16,avx512vl")]
17376    unsafe fn test_mm_store_ph() {
17377        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17378        let mut b = _mm_setzero_ph();
17379        _mm_store_ph(addr_of_mut!(b).cast(), a);
17380        assert_eq_m128h(a, b);
17381    }
17382
17383    #[simd_test(enable = "avx512fp16,avx512vl")]
17384    unsafe fn test_mm256_store_ph() {
17385        let a = _mm256_set_ph(
17386            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17387        );
17388        let mut b = _mm256_setzero_ph();
17389        _mm256_store_ph(addr_of_mut!(b).cast(), a);
17390        assert_eq_m256h(a, b);
17391    }
17392
17393    #[simd_test(enable = "avx512fp16")]
17394    unsafe fn test_mm512_store_ph() {
17395        let a = _mm512_set_ph(
17396            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17397            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17398            31.0, 32.0,
17399        );
17400        let mut b = _mm512_setzero_ph();
17401        _mm512_store_ph(addr_of_mut!(b).cast(), a);
17402        assert_eq_m512h(a, b);
17403    }
17404
17405    #[simd_test(enable = "avx512fp16")]
17406    unsafe fn test_mm_store_sh() {
17407        let a = _mm_set_sh(1.0);
17408        let mut b = _mm_setzero_ph();
17409        _mm_store_sh(addr_of_mut!(b).cast(), a);
17410        assert_eq_m128h(a, b);
17411    }
17412
17413    #[simd_test(enable = "avx512fp16")]
17414    unsafe fn test_mm_mask_store_sh() {
17415        let a = _mm_set_sh(1.0);
17416        let mut b = _mm_setzero_ph();
17417        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17418        assert_eq_m128h(_mm_setzero_ph(), b);
17419        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17420        assert_eq_m128h(a, b);
17421    }
17422
17423    #[simd_test(enable = "avx512fp16,avx512vl")]
17424    unsafe fn test_mm_storeu_ph() {
17425        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17426        let mut array = [0.0; 8];
17427        _mm_storeu_ph(array.as_mut_ptr(), a);
17428        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
17429    }
17430
17431    #[simd_test(enable = "avx512fp16,avx512vl")]
17432    unsafe fn test_mm256_storeu_ph() {
17433        let a = _mm256_set_ph(
17434            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17435        );
17436        let mut array = [0.0; 16];
17437        _mm256_storeu_ph(array.as_mut_ptr(), a);
17438        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
17439    }
17440
17441    #[simd_test(enable = "avx512fp16")]
17442    unsafe fn test_mm512_storeu_ph() {
17443        let a = _mm512_set_ph(
17444            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17445            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17446            31.0, 32.0,
17447        );
17448        let mut array = [0.0; 32];
17449        _mm512_storeu_ph(array.as_mut_ptr(), a);
17450        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
17451    }
17452
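    // Masked arithmetic: mask bit i selects the computed result for element i; a
    // cleared bit keeps the corresponding `src` element (mask variants) or zeroes it
    // (maskz variants).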
17453    #[simd_test(enable = "avx512fp16,avx512vl")]
17454    unsafe fn test_mm_add_ph() {
17455        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17456        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17457        let r = _mm_add_ph(a, b);
17458        let e = _mm_set1_ph(9.0);
17459        assert_eq_m128h(r, e);
17460    }
17461
17462    #[simd_test(enable = "avx512fp16,avx512vl")]
17463    unsafe fn test_mm_mask_add_ph() {
17464        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17465        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17466        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17467        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17468        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17469        assert_eq_m128h(r, e);
17470    }
17471
17472    #[simd_test(enable = "avx512fp16,avx512vl")]
17473    unsafe fn test_mm_maskz_add_ph() {
17474        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17475        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17476        let r = _mm_maskz_add_ph(0b01010101, a, b);
17477        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17478        assert_eq_m128h(r, e);
17479    }
17480
17481    #[simd_test(enable = "avx512fp16,avx512vl")]
17482    unsafe fn test_mm256_add_ph() {
17483        let a = _mm256_set_ph(
17484            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17485        );
17486        let b = _mm256_set_ph(
17487            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17488        );
17489        let r = _mm256_add_ph(a, b);
17490        let e = _mm256_set1_ph(17.0);
17491        assert_eq_m256h(r, e);
17492    }
17493
17494    #[simd_test(enable = "avx512fp16,avx512vl")]
17495    unsafe fn test_mm256_mask_add_ph() {
17496        let a = _mm256_set_ph(
17497            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17498        );
17499        let b = _mm256_set_ph(
17500            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17501        );
17502        let src = _mm256_set_ph(
17503            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17504        );
17505        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17506        let e = _mm256_set_ph(
17507            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17508        );
17509        assert_eq_m256h(r, e);
17510    }
17511
17512    #[simd_test(enable = "avx512fp16,avx512vl")]
17513    unsafe fn test_mm256_maskz_add_ph() {
17514        let a = _mm256_set_ph(
17515            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17516        );
17517        let b = _mm256_set_ph(
17518            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17519        );
17520        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17521        let e = _mm256_set_ph(
17522            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17523        );
17524        assert_eq_m256h(r, e);
17525    }
17526
17527    #[simd_test(enable = "avx512fp16")]
17528    unsafe fn test_mm512_add_ph() {
17529        let a = _mm512_set_ph(
17530            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17531            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17532            31.0, 32.0,
17533        );
17534        let b = _mm512_set_ph(
17535            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17536            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17537            3.0, 2.0, 1.0,
17538        );
17539        let r = _mm512_add_ph(a, b);
17540        let e = _mm512_set1_ph(33.0);
17541        assert_eq_m512h(r, e);
17542    }
17543
17544    #[simd_test(enable = "avx512fp16")]
17545    unsafe fn test_mm512_mask_add_ph() {
17546        let a = _mm512_set_ph(
17547            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17548            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17549            31.0, 32.0,
17550        );
17551        let b = _mm512_set_ph(
17552            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17553            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17554            3.0, 2.0, 1.0,
17555        );
17556        let src = _mm512_set_ph(
17557            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17558            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17559        );
17560        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
17561        let e = _mm512_set_ph(
17562            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17563            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17564        );
17565        assert_eq_m512h(r, e);
17566    }
17567
17568    #[simd_test(enable = "avx512fp16")]
17569    unsafe fn test_mm512_maskz_add_ph() {
17570        let a = _mm512_set_ph(
17571            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17572            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17573            31.0, 32.0,
17574        );
17575        let b = _mm512_set_ph(
17576            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17577            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17578            3.0, 2.0, 1.0,
17579        );
17580        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
17581        let e = _mm512_set_ph(
17582            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17583            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17584        );
17585        assert_eq_m512h(r, e);
17586    }
17587
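    // The `*_round_*` variants take the rounding mode as a const generic;
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` matches the default rounding,
    // so the expected values are identical to the non-round tests above.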
17588    #[simd_test(enable = "avx512fp16")]
17589    unsafe fn test_mm512_add_round_ph() {
17590        let a = _mm512_set_ph(
17591            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17592            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17593            31.0, 32.0,
17594        );
17595        let b = _mm512_set_ph(
17596            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17597            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17598            3.0, 2.0, 1.0,
17599        );
17600        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17601        let e = _mm512_set1_ph(33.0);
17602        assert_eq_m512h(r, e);
17603    }
17604
17605    #[simd_test(enable = "avx512fp16")]
17606    unsafe fn test_mm512_mask_add_round_ph() {
17607        let a = _mm512_set_ph(
17608            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17609            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17610            31.0, 32.0,
17611        );
17612        let b = _mm512_set_ph(
17613            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17614            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17615            3.0, 2.0, 1.0,
17616        );
17617        let src = _mm512_set_ph(
17618            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17619            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17620        );
17621        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17622            src,
17623            0b01010101010101010101010101010101,
17624            a,
17625            b,
17626        );
17627        let e = _mm512_set_ph(
17628            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17629            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17630        );
17631        assert_eq_m512h(r, e);
17632    }
17633
17634    #[simd_test(enable = "avx512fp16")]
17635    unsafe fn test_mm512_maskz_add_round_ph() {
17636        let a = _mm512_set_ph(
17637            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17638            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17639            31.0, 32.0,
17640        );
17641        let b = _mm512_set_ph(
17642            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17643            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17644            3.0, 2.0, 1.0,
17645        );
17646        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17647            0b01010101010101010101010101010101,
17648            a,
17649            b,
17650        );
17651        let e = _mm512_set_ph(
17652            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17653            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17654        );
17655        assert_eq_m512h(r, e);
17656    }
17657
17658    #[simd_test(enable = "avx512fp16")]
17659    unsafe fn test_mm_add_round_sh() {
17660        let a = _mm_set_sh(1.0);
17661        let b = _mm_set_sh(2.0);
17662        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17663        let e = _mm_set_sh(3.0);
17664        assert_eq_m128h(r, e);
17665    }
17666
17667    #[simd_test(enable = "avx512fp16")]
17668    unsafe fn test_mm_mask_add_round_sh() {
17669        let a = _mm_set_sh(1.0);
17670        let b = _mm_set_sh(2.0);
17671        let src = _mm_set_sh(4.0);
17672        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17673            src, 0, a, b,
17674        );
17675        let e = _mm_set_sh(4.0);
17676        assert_eq_m128h(r, e);
17677        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17678            src, 1, a, b,
17679        );
17680        let e = _mm_set_sh(3.0);
17681        assert_eq_m128h(r, e);
17682    }
17683
17684    #[simd_test(enable = "avx512fp16")]
17685    unsafe fn test_mm_maskz_add_round_sh() {
17686        let a = _mm_set_sh(1.0);
17687        let b = _mm_set_sh(2.0);
17688        let r =
17689            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17690        let e = _mm_set_sh(0.0);
17691        assert_eq_m128h(r, e);
17692        let r =
17693            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17694        let e = _mm_set_sh(3.0);
17695        assert_eq_m128h(r, e);
17696    }
17697
17698    #[simd_test(enable = "avx512fp16")]
17699    unsafe fn test_mm_add_sh() {
17700        let a = _mm_set_sh(1.0);
17701        let b = _mm_set_sh(2.0);
17702        let r = _mm_add_sh(a, b);
17703        let e = _mm_set_sh(3.0);
17704        assert_eq_m128h(r, e);
17705    }
17706
17707    #[simd_test(enable = "avx512fp16")]
17708    unsafe fn test_mm_mask_add_sh() {
17709        let a = _mm_set_sh(1.0);
17710        let b = _mm_set_sh(2.0);
17711        let src = _mm_set_sh(4.0);
17712        let r = _mm_mask_add_sh(src, 0, a, b);
17713        let e = _mm_set_sh(4.0);
17714        assert_eq_m128h(r, e);
17715        let r = _mm_mask_add_sh(src, 1, a, b);
17716        let e = _mm_set_sh(3.0);
17717        assert_eq_m128h(r, e);
17718    }
17719
17720    #[simd_test(enable = "avx512fp16")]
17721    unsafe fn test_mm_maskz_add_sh() {
17722        let a = _mm_set_sh(1.0);
17723        let b = _mm_set_sh(2.0);
17724        let r = _mm_maskz_add_sh(0, a, b);
17725        let e = _mm_set_sh(0.0);
17726        assert_eq_m128h(r, e);
17727        let r = _mm_maskz_add_sh(1, a, b);
17728        let e = _mm_set_sh(3.0);
17729        assert_eq_m128h(r, e);
17730    }
17731
17732    #[simd_test(enable = "avx512fp16,avx512vl")]
17733    unsafe fn test_mm_sub_ph() {
17734        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17735        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17736        let r = _mm_sub_ph(a, b);
17737        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17738        assert_eq_m128h(r, e);
17739    }
17740
17741    #[simd_test(enable = "avx512fp16,avx512vl")]
17742    unsafe fn test_mm_mask_sub_ph() {
17743        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17744        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17745        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17746        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
17747        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
17748        assert_eq_m128h(r, e);
17749    }
17750
17751    #[simd_test(enable = "avx512fp16,avx512vl")]
17752    unsafe fn test_mm_maskz_sub_ph() {
17753        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17754        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17755        let r = _mm_maskz_sub_ph(0b01010101, a, b);
17756        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
17757        assert_eq_m128h(r, e);
17758    }
17759
17760    #[simd_test(enable = "avx512fp16,avx512vl")]
17761    unsafe fn test_mm256_sub_ph() {
17762        let a = _mm256_set_ph(
17763            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17764        );
17765        let b = _mm256_set_ph(
17766            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17767        );
17768        let r = _mm256_sub_ph(a, b);
17769        let e = _mm256_set_ph(
17770            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
17771            15.0,
17772        );
17773        assert_eq_m256h(r, e);
17774    }
17775
17776    #[simd_test(enable = "avx512fp16,avx512vl")]
17777    unsafe fn test_mm256_mask_sub_ph() {
17778        let a = _mm256_set_ph(
17779            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17780        );
17781        let b = _mm256_set_ph(
17782            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17783        );
17784        let src = _mm256_set_ph(
17785            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17786        );
17787        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
17788        let e = _mm256_set_ph(
17789            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
17790        );
17791        assert_eq_m256h(r, e);
17792    }
17793
17794    #[simd_test(enable = "avx512fp16,avx512vl")]
17795    unsafe fn test_mm256_maskz_sub_ph() {
17796        let a = _mm256_set_ph(
17797            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17798        );
17799        let b = _mm256_set_ph(
17800            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17801        );
17802        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
17803        let e = _mm256_set_ph(
17804            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
17805        );
17806        assert_eq_m256h(r, e);
17807    }
17808
17809    #[simd_test(enable = "avx512fp16")]
17810    unsafe fn test_mm512_sub_ph() {
17811        let a = _mm512_set_ph(
17812            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17813            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17814            31.0, 32.0,
17815        );
17816        let b = _mm512_set_ph(
17817            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17818            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17819            3.0, 2.0, 1.0,
17820        );
17821        let r = _mm512_sub_ph(a, b);
17822        let e = _mm512_set_ph(
17823            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17824            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17825            23.0, 25.0, 27.0, 29.0, 31.0,
17826        );
17827        assert_eq_m512h(r, e);
17828    }
17829
17830    #[simd_test(enable = "avx512fp16")]
17831    unsafe fn test_mm512_mask_sub_ph() {
17832        let a = _mm512_set_ph(
17833            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17834            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17835            31.0, 32.0,
17836        );
17837        let b = _mm512_set_ph(
17838            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17839            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17840            3.0, 2.0, 1.0,
17841        );
17842        let src = _mm512_set_ph(
17843            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17844            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17845        );
17846        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
17847        let e = _mm512_set_ph(
17848            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17849            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17850        );
17851        assert_eq_m512h(r, e);
17852    }
17853
17854    #[simd_test(enable = "avx512fp16")]
17855    unsafe fn test_mm512_maskz_sub_ph() {
17856        let a = _mm512_set_ph(
17857            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17858            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17859            31.0, 32.0,
17860        );
17861        let b = _mm512_set_ph(
17862            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17863            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17864            3.0, 2.0, 1.0,
17865        );
17866        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
17867        let e = _mm512_set_ph(
17868            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17869            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17870        );
17871        assert_eq_m512h(r, e);
17872    }
17873
17874    #[simd_test(enable = "avx512fp16")]
17875    unsafe fn test_mm512_sub_round_ph() {
17876        let a = _mm512_set_ph(
17877            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17878            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17879            31.0, 32.0,
17880        );
17881        let b = _mm512_set_ph(
17882            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17883            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17884            3.0, 2.0, 1.0,
17885        );
17886        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17887        let e = _mm512_set_ph(
17888            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17889            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17890            23.0, 25.0, 27.0, 29.0, 31.0,
17891        );
17892        assert_eq_m512h(r, e);
17893    }
17894
17895    #[simd_test(enable = "avx512fp16")]
17896    unsafe fn test_mm512_mask_sub_round_ph() {
17897        let a = _mm512_set_ph(
17898            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17899            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17900            31.0, 32.0,
17901        );
17902        let b = _mm512_set_ph(
17903            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17904            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17905            3.0, 2.0, 1.0,
17906        );
17907        let src = _mm512_set_ph(
17908            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17909            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17910        );
17911        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17912            src,
17913            0b01010101010101010101010101010101,
17914            a,
17915            b,
17916        );
17917        let e = _mm512_set_ph(
17918            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17919            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17920        );
17921        assert_eq_m512h(r, e);
17922    }
17923
17924    #[simd_test(enable = "avx512fp16")]
17925    unsafe fn test_mm512_maskz_sub_round_ph() {
17926        let a = _mm512_set_ph(
17927            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17928            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17929            31.0, 32.0,
17930        );
17931        let b = _mm512_set_ph(
17932            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17933            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17934            3.0, 2.0, 1.0,
17935        );
17936        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17937            0b01010101010101010101010101010101,
17938            a,
17939            b,
17940        );
17941        let e = _mm512_set_ph(
17942            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17943            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17944        );
17945        assert_eq_m512h(r, e);
17946    }
17947
17948    #[simd_test(enable = "avx512fp16")]
17949    unsafe fn test_mm_sub_round_sh() {
17950        let a = _mm_set_sh(1.0);
17951        let b = _mm_set_sh(2.0);
17952        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17953        let e = _mm_set_sh(-1.0);
17954        assert_eq_m128h(r, e);
17955    }
17956
17957    #[simd_test(enable = "avx512fp16")]
17958    unsafe fn test_mm_mask_sub_round_sh() {
17959        let a = _mm_set_sh(1.0);
17960        let b = _mm_set_sh(2.0);
17961        let src = _mm_set_sh(4.0);
17962        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17963            src, 0, a, b,
17964        );
17965        let e = _mm_set_sh(4.0);
17966        assert_eq_m128h(r, e);
17967        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17968            src, 1, a, b,
17969        );
17970        let e = _mm_set_sh(-1.0);
17971        assert_eq_m128h(r, e);
17972    }
17973
17974    #[simd_test(enable = "avx512fp16")]
17975    unsafe fn test_mm_maskz_sub_round_sh() {
17976        let a = _mm_set_sh(1.0);
17977        let b = _mm_set_sh(2.0);
17978        let r =
17979            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17980        let e = _mm_set_sh(0.0);
17981        assert_eq_m128h(r, e);
17982        let r =
17983            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17984        let e = _mm_set_sh(-1.0);
17985        assert_eq_m128h(r, e);
17986    }
17987
17988    #[simd_test(enable = "avx512fp16")]
17989    unsafe fn test_mm_sub_sh() {
17990        let a = _mm_set_sh(1.0);
17991        let b = _mm_set_sh(2.0);
17992        let r = _mm_sub_sh(a, b);
17993        let e = _mm_set_sh(-1.0);
17994        assert_eq_m128h(r, e);
17995    }
17996
17997    #[simd_test(enable = "avx512fp16")]
17998    unsafe fn test_mm_mask_sub_sh() {
17999        let a = _mm_set_sh(1.0);
18000        let b = _mm_set_sh(2.0);
18001        let src = _mm_set_sh(4.0);
18002        let r = _mm_mask_sub_sh(src, 0, a, b);
18003        let e = _mm_set_sh(4.0);
18004        assert_eq_m128h(r, e);
18005        let r = _mm_mask_sub_sh(src, 1, a, b);
18006        let e = _mm_set_sh(-1.0);
18007        assert_eq_m128h(r, e);
18008    }
18009
18010    #[simd_test(enable = "avx512fp16")]
18011    unsafe fn test_mm_maskz_sub_sh() {
18012        let a = _mm_set_sh(1.0);
18013        let b = _mm_set_sh(2.0);
18014        let r = _mm_maskz_sub_sh(0, a, b);
18015        let e = _mm_set_sh(0.0);
18016        assert_eq_m128h(r, e);
18017        let r = _mm_maskz_sub_sh(1, a, b);
18018        let e = _mm_set_sh(-1.0);
18019        assert_eq_m128h(r, e);
18020    }
18021
18022    #[simd_test(enable = "avx512fp16,avx512vl")]
18023    unsafe fn test_mm_mul_ph() {
18024        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18025        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18026        let r = _mm_mul_ph(a, b);
18027        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18028        assert_eq_m128h(r, e);
18029    }
18030
18031    #[simd_test(enable = "avx512fp16,avx512vl")]
18032    unsafe fn test_mm_mask_mul_ph() {
18033        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18034        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18035        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18036        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18037        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18038        assert_eq_m128h(r, e);
18039    }
18040
18041    #[simd_test(enable = "avx512fp16,avx512vl")]
18042    unsafe fn test_mm_maskz_mul_ph() {
18043        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18044        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18045        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18046        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18047        assert_eq_m128h(r, e);
18048    }
18049
18050    #[simd_test(enable = "avx512fp16,avx512vl")]
18051    unsafe fn test_mm256_mul_ph() {
18052        let a = _mm256_set_ph(
18053            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18054        );
18055        let b = _mm256_set_ph(
18056            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18057        );
18058        let r = _mm256_mul_ph(a, b);
18059        let e = _mm256_set_ph(
18060            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18061            30.0, 16.0,
18062        );
18063        assert_eq_m256h(r, e);
18064    }
18065
18066    #[simd_test(enable = "avx512fp16,avx512vl")]
18067    unsafe fn test_mm256_mask_mul_ph() {
18068        let a = _mm256_set_ph(
18069            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18070        );
18071        let b = _mm256_set_ph(
18072            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18073        );
18074        let src = _mm256_set_ph(
18075            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18076        );
18077        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18078        let e = _mm256_set_ph(
18079            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18080        );
18081        assert_eq_m256h(r, e);
18082    }
18083
18084    #[simd_test(enable = "avx512fp16,avx512vl")]
18085    unsafe fn test_mm256_maskz_mul_ph() {
18086        let a = _mm256_set_ph(
18087            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18088        );
18089        let b = _mm256_set_ph(
18090            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18091        );
18092        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18093        let e = _mm256_set_ph(
18094            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18095        );
18096        assert_eq_m256h(r, e);
18097    }
18098
18099    #[simd_test(enable = "avx512fp16")]
18100    unsafe fn test_mm512_mul_ph() {
18101        let a = _mm512_set_ph(
18102            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18103            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18104            31.0, 32.0,
18105        );
18106        let b = _mm512_set_ph(
18107            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18108            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18109            3.0, 2.0, 1.0,
18110        );
18111        let r = _mm512_mul_ph(a, b);
18112        let e = _mm512_set_ph(
18113            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18114            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18115            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18116        );
18117        assert_eq_m512h(r, e);
18118    }
18119
18120    #[simd_test(enable = "avx512fp16")]
18121    unsafe fn test_mm512_mask_mul_ph() {
18122        let a = _mm512_set_ph(
18123            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18124            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18125            31.0, 32.0,
18126        );
18127        let b = _mm512_set_ph(
18128            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18129            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18130            3.0, 2.0, 1.0,
18131        );
18132        let src = _mm512_set_ph(
18133            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18134            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18135        );
18136        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18137        let e = _mm512_set_ph(
18138            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18139            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18140        );
18141        assert_eq_m512h(r, e);
18142    }
18143
18144    #[simd_test(enable = "avx512fp16")]
18145    unsafe fn test_mm512_maskz_mul_ph() {
18146        let a = _mm512_set_ph(
18147            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18148            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18149            31.0, 32.0,
18150        );
18151        let b = _mm512_set_ph(
18152            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18153            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18154            3.0, 2.0, 1.0,
18155        );
18156        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18157        let e = _mm512_set_ph(
18158            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18159            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18160        );
18161        assert_eq_m512h(r, e);
18162    }
18163
18164    #[simd_test(enable = "avx512fp16")]
18165    unsafe fn test_mm512_mul_round_ph() {
18166        let a = _mm512_set_ph(
18167            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18168            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18169            31.0, 32.0,
18170        );
18171        let b = _mm512_set_ph(
18172            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18173            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18174            3.0, 2.0, 1.0,
18175        );
18176        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18177        let e = _mm512_set_ph(
18178            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18179            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18180            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18181        );
18182        assert_eq_m512h(r, e);
18183    }
18184
18185    #[simd_test(enable = "avx512fp16")]
18186    unsafe fn test_mm512_mask_mul_round_ph() {
18187        let a = _mm512_set_ph(
18188            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18189            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18190            31.0, 32.0,
18191        );
18192        let b = _mm512_set_ph(
18193            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18194            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18195            3.0, 2.0, 1.0,
18196        );
18197        let src = _mm512_set_ph(
18198            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18199            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18200        );
18201        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18202            src,
18203            0b01010101010101010101010101010101,
18204            a,
18205            b,
18206        );
18207        let e = _mm512_set_ph(
18208            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18209            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18210        );
18211        assert_eq_m512h(r, e);
18212    }
18213
18214    #[simd_test(enable = "avx512fp16")]
18215    unsafe fn test_mm512_maskz_mul_round_ph() {
18216        let a = _mm512_set_ph(
18217            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18218            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18219            31.0, 32.0,
18220        );
18221        let b = _mm512_set_ph(
18222            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18223            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18224            3.0, 2.0, 1.0,
18225        );
18226        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18227            0b01010101010101010101010101010101,
18228            a,
18229            b,
18230        );
18231        let e = _mm512_set_ph(
18232            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18233            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18234        );
18235        assert_eq_m512h(r, e);
18236    }
18237
18238    #[simd_test(enable = "avx512fp16")]
18239    unsafe fn test_mm_mul_round_sh() {
18240        let a = _mm_set_sh(1.0);
18241        let b = _mm_set_sh(2.0);
18242        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18243        let e = _mm_set_sh(2.0);
18244        assert_eq_m128h(r, e);
18245    }
18246
18247    #[simd_test(enable = "avx512fp16")]
18248    unsafe fn test_mm_mask_mul_round_sh() {
18249        let a = _mm_set_sh(1.0);
18250        let b = _mm_set_sh(2.0);
18251        let src = _mm_set_sh(4.0);
18252        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18253            src, 0, a, b,
18254        );
18255        let e = _mm_set_sh(4.0);
18256        assert_eq_m128h(r, e);
18257        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18258            src, 1, a, b,
18259        );
18260        let e = _mm_set_sh(2.0);
18261        assert_eq_m128h(r, e);
18262    }
18263
18264    #[simd_test(enable = "avx512fp16")]
18265    unsafe fn test_mm_maskz_mul_round_sh() {
18266        let a = _mm_set_sh(1.0);
18267        let b = _mm_set_sh(2.0);
18268        let r =
18269            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18270        let e = _mm_set_sh(0.0);
18271        assert_eq_m128h(r, e);
18272        let r =
18273            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18274        let e = _mm_set_sh(2.0);
18275        assert_eq_m128h(r, e);
18276    }
18277
18278    #[simd_test(enable = "avx512fp16")]
18279    unsafe fn test_mm_mul_sh() {
18280        let a = _mm_set_sh(1.0);
18281        let b = _mm_set_sh(2.0);
18282        let r = _mm_mul_sh(a, b);
18283        let e = _mm_set_sh(2.0);
18284        assert_eq_m128h(r, e);
18285    }
18286
18287    #[simd_test(enable = "avx512fp16")]
18288    unsafe fn test_mm_mask_mul_sh() {
18289        let a = _mm_set_sh(1.0);
18290        let b = _mm_set_sh(2.0);
18291        let src = _mm_set_sh(4.0);
18292        let r = _mm_mask_mul_sh(src, 0, a, b);
18293        let e = _mm_set_sh(4.0);
18294        assert_eq_m128h(r, e);
18295        let r = _mm_mask_mul_sh(src, 1, a, b);
18296        let e = _mm_set_sh(2.0);
18297        assert_eq_m128h(r, e);
18298    }
18299
18300    #[simd_test(enable = "avx512fp16")]
18301    unsafe fn test_mm_maskz_mul_sh() {
18302        let a = _mm_set_sh(1.0);
18303        let b = _mm_set_sh(2.0);
18304        let r = _mm_maskz_mul_sh(0, a, b);
18305        let e = _mm_set_sh(0.0);
18306        assert_eq_m128h(r, e);
18307        let r = _mm_maskz_mul_sh(1, a, b);
18308        let e = _mm_set_sh(2.0);
18309        assert_eq_m128h(r, e);
18310    }
18311
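    // The division tests mirror the multiplication tests above: every selected lane holds
    // 1.0 / 2.0 == 0.5, while unselected lanes are merged from `src` (mask) or zeroed (maskz).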
18312    #[simd_test(enable = "avx512fp16,avx512vl")]
18313    unsafe fn test_mm_div_ph() {
18314        let a = _mm_set1_ph(1.0);
18315        let b = _mm_set1_ph(2.0);
18316        let r = _mm_div_ph(a, b);
18317        let e = _mm_set1_ph(0.5);
18318        assert_eq_m128h(r, e);
18319    }
18320
18321    #[simd_test(enable = "avx512fp16,avx512vl")]
18322    unsafe fn test_mm_mask_div_ph() {
18323        let a = _mm_set1_ph(1.0);
18324        let b = _mm_set1_ph(2.0);
18325        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18326        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18327        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18328        assert_eq_m128h(r, e);
18329    }
18330
18331    #[simd_test(enable = "avx512fp16,avx512vl")]
18332    unsafe fn test_mm_maskz_div_ph() {
18333        let a = _mm_set1_ph(1.0);
18334        let b = _mm_set1_ph(2.0);
18335        let r = _mm_maskz_div_ph(0b01010101, a, b);
18336        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18337        assert_eq_m128h(r, e);
18338    }
18339
18340    #[simd_test(enable = "avx512fp16,avx512vl")]
18341    unsafe fn test_mm256_div_ph() {
18342        let a = _mm256_set1_ph(1.0);
18343        let b = _mm256_set1_ph(2.0);
18344        let r = _mm256_div_ph(a, b);
18345        let e = _mm256_set1_ph(0.5);
18346        assert_eq_m256h(r, e);
18347    }
18348
18349    #[simd_test(enable = "avx512fp16,avx512vl")]
18350    unsafe fn test_mm256_mask_div_ph() {
18351        let a = _mm256_set1_ph(1.0);
18352        let b = _mm256_set1_ph(2.0);
18353        let src = _mm256_set_ph(
18354            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18355            19.0,
18356        );
18357        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
18358        let e = _mm256_set_ph(
18359            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18360        );
18361        assert_eq_m256h(r, e);
18362    }
18363
18364    #[simd_test(enable = "avx512fp16,avx512vl")]
18365    unsafe fn test_mm256_maskz_div_ph() {
18366        let a = _mm256_set1_ph(1.0);
18367        let b = _mm256_set1_ph(2.0);
18368        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
18369        let e = _mm256_set_ph(
18370            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18371        );
18372        assert_eq_m256h(r, e);
18373    }
18374
18375    #[simd_test(enable = "avx512fp16")]
18376    unsafe fn test_mm512_div_ph() {
18377        let a = _mm512_set1_ph(1.0);
18378        let b = _mm512_set1_ph(2.0);
18379        let r = _mm512_div_ph(a, b);
18380        let e = _mm512_set1_ph(0.5);
18381        assert_eq_m512h(r, e);
18382    }
18383
18384    #[simd_test(enable = "avx512fp16")]
18385    unsafe fn test_mm512_mask_div_ph() {
18386        let a = _mm512_set1_ph(1.0);
18387        let b = _mm512_set1_ph(2.0);
18388        let src = _mm512_set_ph(
18389            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18390            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18391            33.0, 34.0, 35.0,
18392        );
18393        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
18394        let e = _mm512_set_ph(
18395            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18396            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18397        );
18398        assert_eq_m512h(r, e);
18399    }
18400
18401    #[simd_test(enable = "avx512fp16")]
18402    unsafe fn test_mm512_maskz_div_ph() {
18403        let a = _mm512_set1_ph(1.0);
18404        let b = _mm512_set1_ph(2.0);
18405        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
18406        let e = _mm512_set_ph(
18407            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18408            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18409        );
18410        assert_eq_m512h(r, e);
18411    }
18412
18413    #[simd_test(enable = "avx512fp16")]
18414    unsafe fn test_mm512_div_round_ph() {
18415        let a = _mm512_set1_ph(1.0);
18416        let b = _mm512_set1_ph(2.0);
18417        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18418        let e = _mm512_set1_ph(0.5);
18419        assert_eq_m512h(r, e);
18420    }
18421
18422    #[simd_test(enable = "avx512fp16")]
18423    unsafe fn test_mm512_mask_div_round_ph() {
18424        let a = _mm512_set1_ph(1.0);
18425        let b = _mm512_set1_ph(2.0);
18426        let src = _mm512_set_ph(
18427            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18428            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18429            33.0, 34.0, 35.0,
18430        );
18431        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18432            src,
18433            0b01010101010101010101010101010101,
18434            a,
18435            b,
18436        );
18437        let e = _mm512_set_ph(
18438            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18439            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18440        );
18441        assert_eq_m512h(r, e);
18442    }
18443
18444    #[simd_test(enable = "avx512fp16")]
18445    unsafe fn test_mm512_maskz_div_round_ph() {
18446        let a = _mm512_set1_ph(1.0);
18447        let b = _mm512_set1_ph(2.0);
18448        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18449            0b01010101010101010101010101010101,
18450            a,
18451            b,
18452        );
18453        let e = _mm512_set_ph(
18454            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18455            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18456        );
18457        assert_eq_m512h(r, e);
18458    }
18459
18460    #[simd_test(enable = "avx512fp16")]
18461    unsafe fn test_mm_div_round_sh() {
18462        let a = _mm_set_sh(1.0);
18463        let b = _mm_set_sh(2.0);
18464        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18465        let e = _mm_set_sh(0.5);
18466        assert_eq_m128h(r, e);
18467    }
18468
18469    #[simd_test(enable = "avx512fp16")]
18470    unsafe fn test_mm_mask_div_round_sh() {
18471        let a = _mm_set_sh(1.0);
18472        let b = _mm_set_sh(2.0);
18473        let src = _mm_set_sh(4.0);
18474        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18475            src, 0, a, b,
18476        );
18477        let e = _mm_set_sh(4.0);
18478        assert_eq_m128h(r, e);
18479        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18480            src, 1, a, b,
18481        );
18482        let e = _mm_set_sh(0.5);
18483        assert_eq_m128h(r, e);
18484    }
18485
18486    #[simd_test(enable = "avx512fp16")]
18487    unsafe fn test_mm_maskz_div_round_sh() {
18488        let a = _mm_set_sh(1.0);
18489        let b = _mm_set_sh(2.0);
18490        let r =
18491            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18492        let e = _mm_set_sh(0.0);
18493        assert_eq_m128h(r, e);
18494        let r =
18495            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18496        let e = _mm_set_sh(0.5);
18497        assert_eq_m128h(r, e);
18498    }
18499
18500    #[simd_test(enable = "avx512fp16")]
18501    unsafe fn test_mm_div_sh() {
18502        let a = _mm_set_sh(1.0);
18503        let b = _mm_set_sh(2.0);
18504        let r = _mm_div_sh(a, b);
18505        let e = _mm_set_sh(0.5);
18506        assert_eq_m128h(r, e);
18507    }
18508
18509    #[simd_test(enable = "avx512fp16")]
18510    unsafe fn test_mm_mask_div_sh() {
18511        let a = _mm_set_sh(1.0);
18512        let b = _mm_set_sh(2.0);
18513        let src = _mm_set_sh(4.0);
18514        let r = _mm_mask_div_sh(src, 0, a, b);
18515        let e = _mm_set_sh(4.0);
18516        assert_eq_m128h(r, e);
18517        let r = _mm_mask_div_sh(src, 1, a, b);
18518        let e = _mm_set_sh(0.5);
18519        assert_eq_m128h(r, e);
18520    }
18521
18522    #[simd_test(enable = "avx512fp16")]
18523    unsafe fn test_mm_maskz_div_sh() {
18524        let a = _mm_set_sh(1.0);
18525        let b = _mm_set_sh(2.0);
18526        let r = _mm_maskz_div_sh(0, a, b);
18527        let e = _mm_set_sh(0.0);
18528        assert_eq_m128h(r, e);
18529        let r = _mm_maskz_div_sh(1, a, b);
18530        let e = _mm_set_sh(0.5);
18531        assert_eq_m128h(r, e);
18532    }
18533
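    // The `_pch` intrinsics treat the register as packed complex numbers stored as
    // (real, imaginary) f16 pairs; as used in these tests, `set1_pch(re, im)` broadcasts
    // re + im*i and each mask bit selects one whole pair. Here (0 + 1i) * (0 + 1i) = -1 + 0i.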
18534    #[simd_test(enable = "avx512fp16,avx512vl")]
18535    unsafe fn test_mm_mul_pch() {
18536        let a = _mm_set1_pch(0.0, 1.0);
18537        let b = _mm_set1_pch(0.0, 1.0);
18538        let r = _mm_mul_pch(a, b);
18539        let e = _mm_set1_pch(-1.0, 0.0);
18540        assert_eq_m128h(r, e);
18541    }
18542
18543    #[simd_test(enable = "avx512fp16,avx512vl")]
18544    unsafe fn test_mm_mask_mul_pch() {
18545        let a = _mm_set1_pch(0.0, 1.0);
18546        let b = _mm_set1_pch(0.0, 1.0);
18547        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18548        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
18549        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18550        assert_eq_m128h(r, e);
18551    }
18552
18553    #[simd_test(enable = "avx512fp16,avx512vl")]
18554    unsafe fn test_mm_maskz_mul_pch() {
18555        let a = _mm_set1_pch(0.0, 1.0);
18556        let b = _mm_set1_pch(0.0, 1.0);
18557        let r = _mm_maskz_mul_pch(0b0101, a, b);
18558        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18559        assert_eq_m128h(r, e);
18560    }
18561
18562    #[simd_test(enable = "avx512fp16,avx512vl")]
18563    unsafe fn test_mm256_mul_pch() {
18564        let a = _mm256_set1_pch(0.0, 1.0);
18565        let b = _mm256_set1_pch(0.0, 1.0);
18566        let r = _mm256_mul_pch(a, b);
18567        let e = _mm256_set1_pch(-1.0, 0.0);
18568        assert_eq_m256h(r, e);
18569    }
18570
18571    #[simd_test(enable = "avx512fp16,avx512vl")]
18572    unsafe fn test_mm256_mask_mul_pch() {
18573        let a = _mm256_set1_pch(0.0, 1.0);
18574        let b = _mm256_set1_pch(0.0, 1.0);
18575        let src = _mm256_setr_ph(
18576            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18577        );
18578        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
18579        let e = _mm256_setr_ph(
18580            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18581        );
18582        assert_eq_m256h(r, e);
18583    }
18584
18585    #[simd_test(enable = "avx512fp16,avx512vl")]
18586    unsafe fn test_mm256_maskz_mul_pch() {
18587        let a = _mm256_set1_pch(0.0, 1.0);
18588        let b = _mm256_set1_pch(0.0, 1.0);
18589        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
18590        let e = _mm256_setr_ph(
18591            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18592        );
18593        assert_eq_m256h(r, e);
18594    }
18595
18596    #[simd_test(enable = "avx512fp16")]
18597    unsafe fn test_mm512_mul_pch() {
18598        let a = _mm512_set1_pch(0.0, 1.0);
18599        let b = _mm512_set1_pch(0.0, 1.0);
18600        let r = _mm512_mul_pch(a, b);
18601        let e = _mm512_set1_pch(-1.0, 0.0);
18602        assert_eq_m512h(r, e);
18603    }
18604
18605    #[simd_test(enable = "avx512fp16")]
18606    unsafe fn test_mm512_mask_mul_pch() {
18607        let a = _mm512_set1_pch(0.0, 1.0);
18608        let b = _mm512_set1_pch(0.0, 1.0);
18609        let src = _mm512_setr_ph(
18610            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18611            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18612            32.0, 33.0,
18613        );
18614        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
18615        let e = _mm512_setr_ph(
18616            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18617            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18618            33.0,
18619        );
18620        assert_eq_m512h(r, e);
18621    }
18622
18623    #[simd_test(enable = "avx512fp16")]
18624    unsafe fn test_mm512_maskz_mul_pch() {
18625        let a = _mm512_set1_pch(0.0, 1.0);
18626        let b = _mm512_set1_pch(0.0, 1.0);
18627        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
18628        let e = _mm512_setr_ph(
18629            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18630            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18631        );
18632        assert_eq_m512h(r, e);
18633    }
18634
18635    #[simd_test(enable = "avx512fp16")]
18636    unsafe fn test_mm512_mul_round_pch() {
18637        let a = _mm512_set1_pch(0.0, 1.0);
18638        let b = _mm512_set1_pch(0.0, 1.0);
18639        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18640        let e = _mm512_set1_pch(-1.0, 0.0);
18641        assert_eq_m512h(r, e);
18642    }
18643
18644    #[simd_test(enable = "avx512fp16")]
18645    unsafe fn test_mm512_mask_mul_round_pch() {
18646        let a = _mm512_set1_pch(0.0, 1.0);
18647        let b = _mm512_set1_pch(0.0, 1.0);
18648        let src = _mm512_setr_ph(
18649            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18650            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18651            32.0, 33.0,
18652        );
18653        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18654            src,
18655            0b0101010101010101,
18656            a,
18657            b,
18658        );
18659        let e = _mm512_setr_ph(
18660            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18661            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18662            33.0,
18663        );
18664        assert_eq_m512h(r, e);
18665    }
18666
18667    #[simd_test(enable = "avx512fp16")]
18668    unsafe fn test_mm512_maskz_mul_round_pch() {
18669        let a = _mm512_set1_pch(0.0, 1.0);
18670        let b = _mm512_set1_pch(0.0, 1.0);
18671        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18672            0b0101010101010101,
18673            a,
18674            b,
18675        );
18676        let e = _mm512_setr_ph(
18677            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18678            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18679        );
18680        assert_eq_m512h(r, e);
18681    }
18682
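    // The scalar complex `_sch` forms multiply only the lowest (real, imaginary) pair and copy
    // the remaining pairs from `a`; with a zero mask the low pair is merged from `src` (mask)
    // or zeroed (maskz).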
18683    #[simd_test(enable = "avx512fp16")]
18684    unsafe fn test_mm_mul_round_sch() {
18685        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18686        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18687        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18688        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18689        assert_eq_m128h(r, e);
18690    }
18691
18692    #[simd_test(enable = "avx512fp16")]
18693    unsafe fn test_mm_mask_mul_round_sch() {
18694        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18695        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18696        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18697        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18698            src, 0, a, b,
18699        );
18700        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18701        assert_eq_m128h(r, e);
18702    }
18703
18704    #[simd_test(enable = "avx512fp16")]
18705    unsafe fn test_mm_maskz_mul_round_sch() {
18706        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18707        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18708        let r =
18709            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18710        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18711        assert_eq_m128h(r, e);
18712    }
18713
18714    #[simd_test(enable = "avx512fp16")]
18715    unsafe fn test_mm_mul_sch() {
18716        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18717        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18718        let r = _mm_mul_sch(a, b);
18719        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18720        assert_eq_m128h(r, e);
18721    }
18722
18723    #[simd_test(enable = "avx512fp16")]
18724    unsafe fn test_mm_mask_mul_sch() {
18725        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18726        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18727        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18728        let r = _mm_mask_mul_sch(src, 0, a, b);
18729        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18730        assert_eq_m128h(r, e);
18731    }
18732
18733    #[simd_test(enable = "avx512fp16")]
18734    unsafe fn test_mm_maskz_mul_sch() {
18735        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18736        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18737        let r = _mm_maskz_mul_sch(0, a, b);
18738        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18739        assert_eq_m128h(r, e);
18740    }
18741
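    // `fmul_pch`/`fmul_sch` perform the same complex multiplication as `mul_pch`/`mul_sch`
    // above, so the inputs and expected values are identical.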
18742    #[simd_test(enable = "avx512fp16,avx512vl")]
18743    unsafe fn test_mm_fmul_pch() {
18744        let a = _mm_set1_pch(0.0, 1.0);
18745        let b = _mm_set1_pch(0.0, 1.0);
18746        let r = _mm_fmul_pch(a, b);
18747        let e = _mm_set1_pch(-1.0, 0.0);
18748        assert_eq_m128h(r, e);
18749    }
18750
18751    #[simd_test(enable = "avx512fp16,avx512vl")]
18752    unsafe fn test_mm_mask_fmul_pch() {
18753        let a = _mm_set1_pch(0.0, 1.0);
18754        let b = _mm_set1_pch(0.0, 1.0);
18755        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18756        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
18757        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18758        assert_eq_m128h(r, e);
18759    }
18760
18761    #[simd_test(enable = "avx512fp16,avx512vl")]
18762    unsafe fn test_mm_maskz_fmul_pch() {
18763        let a = _mm_set1_pch(0.0, 1.0);
18764        let b = _mm_set1_pch(0.0, 1.0);
18765        let r = _mm_maskz_fmul_pch(0b0101, a, b);
18766        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18767        assert_eq_m128h(r, e);
18768    }
18769
18770    #[simd_test(enable = "avx512fp16,avx512vl")]
18771    unsafe fn test_mm256_fmul_pch() {
18772        let a = _mm256_set1_pch(0.0, 1.0);
18773        let b = _mm256_set1_pch(0.0, 1.0);
18774        let r = _mm256_fmul_pch(a, b);
18775        let e = _mm256_set1_pch(-1.0, 0.0);
18776        assert_eq_m256h(r, e);
18777    }
18778
18779    #[simd_test(enable = "avx512fp16,avx512vl")]
18780    unsafe fn test_mm256_mask_fmul_pch() {
18781        let a = _mm256_set1_pch(0.0, 1.0);
18782        let b = _mm256_set1_pch(0.0, 1.0);
18783        let src = _mm256_setr_ph(
18784            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18785        );
18786        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
18787        let e = _mm256_setr_ph(
18788            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18789        );
18790        assert_eq_m256h(r, e);
18791    }
18792
18793    #[simd_test(enable = "avx512fp16,avx512vl")]
18794    unsafe fn test_mm256_maskz_fmul_pch() {
18795        let a = _mm256_set1_pch(0.0, 1.0);
18796        let b = _mm256_set1_pch(0.0, 1.0);
18797        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
18798        let e = _mm256_setr_ph(
18799            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18800        );
18801        assert_eq_m256h(r, e);
18802    }
18803
18804    #[simd_test(enable = "avx512fp16")]
18805    unsafe fn test_mm512_fmul_pch() {
18806        let a = _mm512_set1_pch(0.0, 1.0);
18807        let b = _mm512_set1_pch(0.0, 1.0);
18808        let r = _mm512_fmul_pch(a, b);
18809        let e = _mm512_set1_pch(-1.0, 0.0);
18810        assert_eq_m512h(r, e);
18811    }
18812
18813    #[simd_test(enable = "avx512fp16")]
18814    unsafe fn test_mm512_mask_fmul_pch() {
18815        let a = _mm512_set1_pch(0.0, 1.0);
18816        let b = _mm512_set1_pch(0.0, 1.0);
18817        let src = _mm512_setr_ph(
18818            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18819            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18820            32.0, 33.0,
18821        );
18822        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
18823        let e = _mm512_setr_ph(
18824            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18825            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18826            33.0,
18827        );
18828        assert_eq_m512h(r, e);
18829    }
18830
18831    #[simd_test(enable = "avx512fp16")]
18832    unsafe fn test_mm512_maskz_fmul_pch() {
18833        let a = _mm512_set1_pch(0.0, 1.0);
18834        let b = _mm512_set1_pch(0.0, 1.0);
18835        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
18836        let e = _mm512_setr_ph(
18837            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18838            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18839        );
18840        assert_eq_m512h(r, e);
18841    }
18842
18843    #[simd_test(enable = "avx512fp16")]
18844    unsafe fn test_mm512_fmul_round_pch() {
18845        let a = _mm512_set1_pch(0.0, 1.0);
18846        let b = _mm512_set1_pch(0.0, 1.0);
18847        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18848        let e = _mm512_set1_pch(-1.0, 0.0);
18849        assert_eq_m512h(r, e);
18850    }
18851
18852    #[simd_test(enable = "avx512fp16")]
18853    unsafe fn test_mm512_mask_fmul_round_pch() {
18854        let a = _mm512_set1_pch(0.0, 1.0);
18855        let b = _mm512_set1_pch(0.0, 1.0);
18856        let src = _mm512_setr_ph(
18857            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18858            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18859            32.0, 33.0,
18860        );
18861        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18862            src,
18863            0b0101010101010101,
18864            a,
18865            b,
18866        );
18867        let e = _mm512_setr_ph(
18868            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18869            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18870            33.0,
18871        );
18872        assert_eq_m512h(r, e);
18873    }
18874
18875    #[simd_test(enable = "avx512fp16")]
18876    unsafe fn test_mm512_maskz_fmul_round_pch() {
18877        let a = _mm512_set1_pch(0.0, 1.0);
18878        let b = _mm512_set1_pch(0.0, 1.0);
18879        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18880            0b0101010101010101,
18881            a,
18882            b,
18883        );
18884        let e = _mm512_setr_ph(
18885            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18886            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18887        );
18888        assert_eq_m512h(r, e);
18889    }
18890
18891    #[simd_test(enable = "avx512fp16")]
18892    unsafe fn test_mm_fmul_round_sch() {
18893        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18894        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18895        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18896        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18897        assert_eq_m128h(r, e);
18898    }
18899
18900    #[simd_test(enable = "avx512fp16")]
18901    unsafe fn test_mm_mask_fmul_round_sch() {
18902        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18903        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18904        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18905        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18906            src, 0, a, b,
18907        );
18908        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18909        assert_eq_m128h(r, e);
18910    }
18911
18912    #[simd_test(enable = "avx512fp16")]
18913    unsafe fn test_mm_maskz_fmul_round_sch() {
18914        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18915        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18916        let r =
18917            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18918        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18919        assert_eq_m128h(r, e);
18920    }
18921
18922    #[simd_test(enable = "avx512fp16")]
18923    unsafe fn test_mm_fmul_sch() {
18924        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18925        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18926        let r = _mm_fmul_sch(a, b);
18927        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18928        assert_eq_m128h(r, e);
18929    }
18930
18931    #[simd_test(enable = "avx512fp16")]
18932    unsafe fn test_mm_mask_fmul_sch() {
18933        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18934        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18935        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18936        let r = _mm_mask_fmul_sch(src, 0, a, b);
18937        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18938        assert_eq_m128h(r, e);
18939    }
18940
18941    #[simd_test(enable = "avx512fp16")]
18942    unsafe fn test_mm_maskz_fmul_sch() {
18943        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18944        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18945        let r = _mm_maskz_fmul_sch(0, a, b);
18946        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18947        assert_eq_m128h(r, e);
18948    }
18949
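    // `cmul_pch` multiplies `a` by the complex conjugate of `b`:
    // (0 + 1i) * conj(0 - 1i) = (0 + 1i) * (0 + 1i) = -1 + 0i.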
18950    #[simd_test(enable = "avx512fp16,avx512vl")]
18951    unsafe fn test_mm_cmul_pch() {
18952        let a = _mm_set1_pch(0.0, 1.0);
18953        let b = _mm_set1_pch(0.0, -1.0);
18954        let r = _mm_cmul_pch(a, b);
18955        let e = _mm_set1_pch(-1.0, 0.0);
18956        assert_eq_m128h(r, e);
18957    }
18958
18959    #[simd_test(enable = "avx512fp16,avx512vl")]
18960    unsafe fn test_mm_mask_cmul_pch() {
18961        let a = _mm_set1_pch(0.0, 1.0);
18962        let b = _mm_set1_pch(0.0, -1.0);
18963        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18964        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
18965        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18966        assert_eq_m128h(r, e);
18967    }
18968
18969    #[simd_test(enable = "avx512fp16,avx512vl")]
18970    unsafe fn test_mm_maskz_cmul_pch() {
18971        let a = _mm_set1_pch(0.0, 1.0);
18972        let b = _mm_set1_pch(0.0, -1.0);
18973        let r = _mm_maskz_cmul_pch(0b0101, a, b);
18974        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18975        assert_eq_m128h(r, e);
18976    }
18977
18978    #[simd_test(enable = "avx512fp16,avx512vl")]
18979    unsafe fn test_mm256_cmul_pch() {
18980        let a = _mm256_set1_pch(0.0, 1.0);
18981        let b = _mm256_set1_pch(0.0, -1.0);
18982        let r = _mm256_cmul_pch(a, b);
18983        let e = _mm256_set1_pch(-1.0, 0.0);
18984        assert_eq_m256h(r, e);
18985    }
18986
18987    #[simd_test(enable = "avx512fp16,avx512vl")]
18988    unsafe fn test_mm256_mask_cmul_pch() {
18989        let a = _mm256_set1_pch(0.0, 1.0);
18990        let b = _mm256_set1_pch(0.0, -1.0);
18991        let src = _mm256_setr_ph(
18992            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18993        );
18994        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
18995        let e = _mm256_setr_ph(
18996            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18997        );
18998        assert_eq_m256h(r, e);
18999    }
19000
19001    #[simd_test(enable = "avx512fp16,avx512vl")]
19002    unsafe fn test_mm256_maskz_cmul_pch() {
19003        let a = _mm256_set1_pch(0.0, 1.0);
19004        let b = _mm256_set1_pch(0.0, -1.0);
19005        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
19006        let e = _mm256_setr_ph(
19007            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19008        );
19009        assert_eq_m256h(r, e);
19010    }
19011
19012    #[simd_test(enable = "avx512fp16")]
19013    unsafe fn test_mm512_cmul_pch() {
19014        let a = _mm512_set1_pch(0.0, 1.0);
19015        let b = _mm512_set1_pch(0.0, -1.0);
19016        let r = _mm512_cmul_pch(a, b);
19017        let e = _mm512_set1_pch(-1.0, 0.0);
19018        assert_eq_m512h(r, e);
19019    }
19020
19021    #[simd_test(enable = "avx512fp16")]
19022    unsafe fn test_mm512_mask_cmul_pch() {
19023        let a = _mm512_set1_pch(0.0, 1.0);
19024        let b = _mm512_set1_pch(0.0, -1.0);
19025        let src = _mm512_setr_ph(
19026            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19027            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19028            32.0, 33.0,
19029        );
19030        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
19031        let e = _mm512_setr_ph(
19032            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19033            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19034            33.0,
19035        );
19036        assert_eq_m512h(r, e);
19037    }
19038
19039    #[simd_test(enable = "avx512fp16")]
19040    unsafe fn test_mm512_maskz_cmul_pch() {
19041        let a = _mm512_set1_pch(0.0, 1.0);
19042        let b = _mm512_set1_pch(0.0, -1.0);
19043        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
19044        let e = _mm512_setr_ph(
19045            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19046            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19047        );
19048        assert_eq_m512h(r, e);
19049    }
19050
19051    #[simd_test(enable = "avx512fp16")]
19052    unsafe fn test_mm512_cmul_round_pch() {
19053        let a = _mm512_set1_pch(0.0, 1.0);
19054        let b = _mm512_set1_pch(0.0, -1.0);
19055        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19056        let e = _mm512_set1_pch(-1.0, 0.0);
19057        assert_eq_m512h(r, e);
19058    }
19059
19060    #[simd_test(enable = "avx512fp16")]
19061    unsafe fn test_mm512_mask_cmul_round_pch() {
19062        let a = _mm512_set1_pch(0.0, 1.0);
19063        let b = _mm512_set1_pch(0.0, -1.0);
19064        let src = _mm512_setr_ph(
19065            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19066            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19067            32.0, 33.0,
19068        );
19069        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19070            src,
19071            0b0101010101010101,
19072            a,
19073            b,
19074        );
19075        let e = _mm512_setr_ph(
19076            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19077            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19078            33.0,
19079        );
19080        assert_eq_m512h(r, e);
19081    }
19082
19083    #[simd_test(enable = "avx512fp16")]
19084    unsafe fn test_mm512_maskz_cmul_round_pch() {
19085        let a = _mm512_set1_pch(0.0, 1.0);
19086        let b = _mm512_set1_pch(0.0, -1.0);
19087        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19088            0b0101010101010101,
19089            a,
19090            b,
19091        );
19092        let e = _mm512_setr_ph(
19093            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19094            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19095        );
19096        assert_eq_m512h(r, e);
19097    }
19098
19099    #[simd_test(enable = "avx512fp16")]
19100    unsafe fn test_mm_cmul_sch() {
19101        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19102        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19103        let r = _mm_cmul_sch(a, b);
19104        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19105        assert_eq_m128h(r, e);
19106    }
19107
19108    #[simd_test(enable = "avx512fp16")]
19109    unsafe fn test_mm_mask_cmul_sch() {
19110        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19111        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19112        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19113        let r = _mm_mask_cmul_sch(src, 0, a, b);
19114        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19115        assert_eq_m128h(r, e);
19116    }
19117
19118    #[simd_test(enable = "avx512fp16")]
19119    unsafe fn test_mm_maskz_cmul_sch() {
19120        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19121        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19122        let r = _mm_maskz_cmul_sch(0, a, b);
19123        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19124        assert_eq_m128h(r, e);
19125    }
19126
19127    #[simd_test(enable = "avx512fp16")]
19128    unsafe fn test_mm_cmul_round_sch() {
19129        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19130        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19131        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19132        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19133        assert_eq_m128h(r, e);
19134    }
19135
19136    #[simd_test(enable = "avx512fp16")]
19137    unsafe fn test_mm_mask_cmul_round_sch() {
19138        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19139        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19140        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19141        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19142            src, 0, a, b,
19143        );
19144        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19145        assert_eq_m128h(r, e);
19146    }
19147
19148    #[simd_test(enable = "avx512fp16")]
19149    unsafe fn test_mm_maskz_cmul_round_sch() {
19150        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19151        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19152        let r =
19153            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19154        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19155        assert_eq_m128h(r, e);
19156    }
19157
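    // `fcmul_pch`/`fcmul_sch` perform the same conjugate multiplication as `cmul_pch`/`cmul_sch`
    // above, so the same inputs and expectations are reused.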
19158    #[simd_test(enable = "avx512fp16,avx512vl")]
19159    unsafe fn test_mm_fcmul_pch() {
19160        let a = _mm_set1_pch(0.0, 1.0);
19161        let b = _mm_set1_pch(0.0, -1.0);
19162        let r = _mm_fcmul_pch(a, b);
19163        let e = _mm_set1_pch(-1.0, 0.0);
19164        assert_eq_m128h(r, e);
19165    }
19166
19167    #[simd_test(enable = "avx512fp16,avx512vl")]
19168    unsafe fn test_mm_mask_fcmul_pch() {
19169        let a = _mm_set1_pch(0.0, 1.0);
19170        let b = _mm_set1_pch(0.0, -1.0);
19171        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19172        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19173        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19174        assert_eq_m128h(r, e);
19175    }
19176
19177    #[simd_test(enable = "avx512fp16,avx512vl")]
19178    unsafe fn test_mm_maskz_fcmul_pch() {
19179        let a = _mm_set1_pch(0.0, 1.0);
19180        let b = _mm_set1_pch(0.0, -1.0);
19181        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19182        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19183        assert_eq_m128h(r, e);
19184    }
19185
19186    #[simd_test(enable = "avx512fp16,avx512vl")]
19187    unsafe fn test_mm256_fcmul_pch() {
19188        let a = _mm256_set1_pch(0.0, 1.0);
19189        let b = _mm256_set1_pch(0.0, -1.0);
19190        let r = _mm256_fcmul_pch(a, b);
19191        let e = _mm256_set1_pch(-1.0, 0.0);
19192        assert_eq_m256h(r, e);
19193    }
19194
19195    #[simd_test(enable = "avx512fp16,avx512vl")]
19196    unsafe fn test_mm256_mask_fcmul_pch() {
19197        let a = _mm256_set1_pch(0.0, 1.0);
19198        let b = _mm256_set1_pch(0.0, -1.0);
19199        let src = _mm256_setr_ph(
19200            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19201        );
19202        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19203        let e = _mm256_setr_ph(
19204            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19205        );
19206        assert_eq_m256h(r, e);
19207    }
19208
19209    #[simd_test(enable = "avx512fp16,avx512vl")]
19210    unsafe fn test_mm256_maskz_fcmul_pch() {
19211        let a = _mm256_set1_pch(0.0, 1.0);
19212        let b = _mm256_set1_pch(0.0, -1.0);
19213        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19214        let e = _mm256_setr_ph(
19215            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19216        );
19217        assert_eq_m256h(r, e);
19218    }
19219
19220    #[simd_test(enable = "avx512fp16")]
19221    unsafe fn test_mm512_fcmul_pch() {
19222        let a = _mm512_set1_pch(0.0, 1.0);
19223        let b = _mm512_set1_pch(0.0, -1.0);
19224        let r = _mm512_fcmul_pch(a, b);
19225        let e = _mm512_set1_pch(-1.0, 0.0);
19226        assert_eq_m512h(r, e);
19227    }
19228
19229    #[simd_test(enable = "avx512fp16")]
19230    unsafe fn test_mm512_mask_fcmul_pch() {
19231        let a = _mm512_set1_pch(0.0, 1.0);
19232        let b = _mm512_set1_pch(0.0, -1.0);
19233        let src = _mm512_setr_ph(
19234            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19235            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19236            32.0, 33.0,
19237        );
19238        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19239        let e = _mm512_setr_ph(
19240            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19241            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19242            33.0,
19243        );
19244        assert_eq_m512h(r, e);
19245    }
19246
19247    #[simd_test(enable = "avx512fp16")]
19248    unsafe fn test_mm512_maskz_fcmul_pch() {
19249        let a = _mm512_set1_pch(0.0, 1.0);
19250        let b = _mm512_set1_pch(0.0, -1.0);
19251        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19252        let e = _mm512_setr_ph(
19253            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19254            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19255        );
19256        assert_eq_m512h(r, e);
19257    }
19258
19259    #[simd_test(enable = "avx512fp16")]
19260    unsafe fn test_mm512_fcmul_round_pch() {
19261        let a = _mm512_set1_pch(0.0, 1.0);
19262        let b = _mm512_set1_pch(0.0, -1.0);
19263        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19264        let e = _mm512_set1_pch(-1.0, 0.0);
19265        assert_eq_m512h(r, e);
19266    }
19267
19268    #[simd_test(enable = "avx512fp16")]
19269    unsafe fn test_mm512_mask_fcmul_round_pch() {
19270        let a = _mm512_set1_pch(0.0, 1.0);
19271        let b = _mm512_set1_pch(0.0, -1.0);
19272        let src = _mm512_setr_ph(
19273            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19274            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19275            32.0, 33.0,
19276        );
19277        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19278            src,
19279            0b0101010101010101,
19280            a,
19281            b,
19282        );
19283        let e = _mm512_setr_ph(
19284            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19285            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19286            33.0,
19287        );
19288        assert_eq_m512h(r, e);
19289    }
19290
19291    #[simd_test(enable = "avx512fp16")]
19292    unsafe fn test_mm512_maskz_fcmul_round_pch() {
19293        let a = _mm512_set1_pch(0.0, 1.0);
19294        let b = _mm512_set1_pch(0.0, -1.0);
19295        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19296            0b0101010101010101,
19297            a,
19298            b,
19299        );
19300        let e = _mm512_setr_ph(
19301            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19302            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19303        );
19304        assert_eq_m512h(r, e);
19305    }
19306
19307    #[simd_test(enable = "avx512fp16")]
19308    unsafe fn test_mm_fcmul_sch() {
19309        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19310        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19311        let r = _mm_fcmul_sch(a, b);
19312        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19313        assert_eq_m128h(r, e);
19314    }
19315
19316    #[simd_test(enable = "avx512fp16")]
19317    unsafe fn test_mm_mask_fcmul_sch() {
19318        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19319        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19320        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19321        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19322        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19323        assert_eq_m128h(r, e);
19324    }
19325
19326    #[simd_test(enable = "avx512fp16")]
19327    unsafe fn test_mm_maskz_fcmul_sch() {
19328        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19329        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19330        let r = _mm_maskz_fcmul_sch(0, a, b);
19331        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19332        assert_eq_m128h(r, e);
19333    }
19334
19335    #[simd_test(enable = "avx512fp16")]
19336    unsafe fn test_mm_fcmul_round_sch() {
19337        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19338        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19339        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19340        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19341        assert_eq_m128h(r, e);
19342    }
19343
19344    #[simd_test(enable = "avx512fp16")]
19345    unsafe fn test_mm_mask_fcmul_round_sch() {
19346        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19347        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19348        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19349        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19350            src, 0, a, b,
19351        );
19352        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19353        assert_eq_m128h(r, e);
19354    }
19355
19356    #[simd_test(enable = "avx512fp16")]
19357    unsafe fn test_mm_maskz_fcmul_round_sch() {
19358        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19359        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19360        let r =
19361            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19362        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19363        assert_eq_m128h(r, e);
19364    }
19365
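    // `abs_ph` clears the sign bit of every f16 lane, so each negative input becomes its
    // positive counterpart and zero is unchanged.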
19366    #[simd_test(enable = "avx512fp16,avx512vl")]
19367    unsafe fn test_mm_abs_ph() {
19368        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19369        let r = _mm_abs_ph(a);
19370        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19371        assert_eq_m128h(r, e);
19372    }
19373
19374    #[simd_test(enable = "avx512fp16,avx512vl")]
19375    unsafe fn test_mm256_abs_ph() {
19376        let a = _mm256_set_ph(
19377            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19378            -14.0,
19379        );
19380        let r = _mm256_abs_ph(a);
19381        let e = _mm256_set_ph(
19382            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19383        );
19384        assert_eq_m256h(r, e);
19385    }
19386
19387    #[simd_test(enable = "avx512fp16")]
19388    unsafe fn test_mm512_abs_ph() {
19389        let a = _mm512_set_ph(
19390            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19391            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19392            27.0, -28.0, 29.0, -30.0,
19393        );
19394        let r = _mm512_abs_ph(a);
19395        let e = _mm512_set_ph(
19396            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19397            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19398            29.0, 30.0,
19399        );
19400        assert_eq_m512h(r, e);
19401    }
19402
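    // `conj_pch` negates the imaginary part of each complex pair, i.e. it flips the sign of the
    // odd-indexed f16 lane of every (real, imaginary) pair.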
19403    #[simd_test(enable = "avx512fp16,avx512vl")]
19404    unsafe fn test_mm_conj_pch() {
19405        let a = _mm_set1_pch(0.0, 1.0);
19406        let r = _mm_conj_pch(a);
19407        let e = _mm_set1_pch(0.0, -1.0);
19408        assert_eq_m128h(r, e);
19409    }
19410
19411    #[simd_test(enable = "avx512fp16,avx512vl")]
19412    unsafe fn test_mm_mask_conj_pch() {
19413        let a = _mm_set1_pch(0.0, 1.0);
19414        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19415        let r = _mm_mask_conj_pch(src, 0b0101, a);
19416        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19417        assert_eq_m128h(r, e);
19418    }
19419
19420    #[simd_test(enable = "avx512fp16,avx512vl")]
19421    unsafe fn test_mm_maskz_conj_pch() {
19422        let a = _mm_set1_pch(0.0, 1.0);
19423        let r = _mm_maskz_conj_pch(0b0101, a);
19424        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19425        assert_eq_m128h(r, e);
19426    }
19427
19428    #[simd_test(enable = "avx512fp16,avx512vl")]
19429    unsafe fn test_mm256_conj_pch() {
19430        let a = _mm256_set1_pch(0.0, 1.0);
19431        let r = _mm256_conj_pch(a);
19432        let e = _mm256_set1_pch(0.0, -1.0);
19433        assert_eq_m256h(r, e);
19434    }
19435
19436    #[simd_test(enable = "avx512fp16,avx512vl")]
19437    unsafe fn test_mm256_mask_conj_pch() {
19438        let a = _mm256_set1_pch(0.0, 1.0);
19439        let src = _mm256_setr_ph(
19440            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19441        );
19442        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19443        let e = _mm256_setr_ph(
19444            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19445        );
19446        assert_eq_m256h(r, e);
19447    }
19448
19449    #[simd_test(enable = "avx512fp16,avx512vl")]
19450    unsafe fn test_mm256_maskz_conj_pch() {
19451        let a = _mm256_set1_pch(0.0, 1.0);
19452        let r = _mm256_maskz_conj_pch(0b01010101, a);
19453        let e = _mm256_setr_ph(
19454            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19455        );
19456        assert_eq_m256h(r, e);
19457    }
19458
19459    #[simd_test(enable = "avx512fp16")]
19460    unsafe fn test_mm512_conj_pch() {
19461        let a = _mm512_set1_pch(0.0, 1.0);
19462        let r = _mm512_conj_pch(a);
19463        let e = _mm512_set1_pch(0.0, -1.0);
19464        assert_eq_m512h(r, e);
19465    }
19466
19467    #[simd_test(enable = "avx512fp16")]
19468    unsafe fn test_mm512_mask_conj_pch() {
19469        let a = _mm512_set1_pch(0.0, 1.0);
19470        let src = _mm512_setr_ph(
19471            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19472            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19473            32.0, 33.0,
19474        );
19475        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19476        let e = _mm512_setr_ph(
19477            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19478            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19479            33.0,
19480        );
19481        assert_eq_m512h(r, e);
19482    }
19483
19484    #[simd_test(enable = "avx512fp16")]
19485    unsafe fn test_mm512_maskz_conj_pch() {
19486        let a = _mm512_set1_pch(0.0, 1.0);
19487        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19488        let e = _mm512_setr_ph(
19489            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19490            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19491        );
19492        assert_eq_m512h(r, e);
19493    }
19494
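    // `fmadd_pch` computes a * b + c on each complex pair: with a = 0 + 1i, b = 0 + 2i and
    // c = 0 + 3i the result is -2 + 3i. The masked variants differ in what unselected pairs
    // hold: `mask` keeps `a`, `mask3` keeps `c`, and `maskz` zeroes them.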
19495    #[simd_test(enable = "avx512fp16,avx512vl")]
19496    unsafe fn test_mm_fmadd_pch() {
19497        let a = _mm_set1_pch(0.0, 1.0);
19498        let b = _mm_set1_pch(0.0, 2.0);
19499        let c = _mm_set1_pch(0.0, 3.0);
19500        let r = _mm_fmadd_pch(a, b, c);
19501        let e = _mm_set1_pch(-2.0, 3.0);
19502        assert_eq_m128h(r, e);
19503    }
19504
19505    #[simd_test(enable = "avx512fp16,avx512vl")]
19506    unsafe fn test_mm_mask_fmadd_pch() {
19507        let a = _mm_set1_pch(0.0, 1.0);
19508        let b = _mm_set1_pch(0.0, 2.0);
19509        let c = _mm_set1_pch(0.0, 3.0);
19510        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19511        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19512        assert_eq_m128h(r, e);
19513    }
19514
19515    #[simd_test(enable = "avx512fp16,avx512vl")]
19516    unsafe fn test_mm_mask3_fmadd_pch() {
19517        let a = _mm_set1_pch(0.0, 1.0);
19518        let b = _mm_set1_pch(0.0, 2.0);
19519        let c = _mm_set1_pch(0.0, 3.0);
19520        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19521        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19522        assert_eq_m128h(r, e);
19523    }
19524
19525    #[simd_test(enable = "avx512fp16,avx512vl")]
19526    unsafe fn test_mm_maskz_fmadd_pch() {
19527        let a = _mm_set1_pch(0.0, 1.0);
19528        let b = _mm_set1_pch(0.0, 2.0);
19529        let c = _mm_set1_pch(0.0, 3.0);
19530        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19531        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19532        assert_eq_m128h(r, e);
19533    }
19534
19535    #[simd_test(enable = "avx512fp16,avx512vl")]
19536    unsafe fn test_mm256_fmadd_pch() {
19537        let a = _mm256_set1_pch(0.0, 1.0);
19538        let b = _mm256_set1_pch(0.0, 2.0);
19539        let c = _mm256_set1_pch(0.0, 3.0);
19540        let r = _mm256_fmadd_pch(a, b, c);
19541        let e = _mm256_set1_pch(-2.0, 3.0);
19542        assert_eq_m256h(r, e);
19543    }
19544
19545    #[simd_test(enable = "avx512fp16,avx512vl")]
19546    unsafe fn test_mm256_mask_fmadd_pch() {
19547        let a = _mm256_set1_pch(0.0, 1.0);
19548        let b = _mm256_set1_pch(0.0, 2.0);
19549        let c = _mm256_set1_pch(0.0, 3.0);
19550        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19551        let e = _mm256_setr_ph(
19552            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19553        );
19554        assert_eq_m256h(r, e);
19555    }
19556
19557    #[simd_test(enable = "avx512fp16,avx512vl")]
19558    unsafe fn test_mm256_mask3_fmadd_pch() {
19559        let a = _mm256_set1_pch(0.0, 1.0);
19560        let b = _mm256_set1_pch(0.0, 2.0);
19561        let c = _mm256_set1_pch(0.0, 3.0);
19562        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19563        let e = _mm256_setr_ph(
19564            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19565        );
19566        assert_eq_m256h(r, e);
19567    }
19568
19569    #[simd_test(enable = "avx512fp16,avx512vl")]
19570    unsafe fn test_mm256_maskz_fmadd_pch() {
19571        let a = _mm256_set1_pch(0.0, 1.0);
19572        let b = _mm256_set1_pch(0.0, 2.0);
19573        let c = _mm256_set1_pch(0.0, 3.0);
19574        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19575        let e = _mm256_setr_ph(
19576            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19577        );
19578        assert_eq_m256h(r, e);
19579    }
19580
19581    #[simd_test(enable = "avx512fp16")]
19582    unsafe fn test_mm512_fmadd_pch() {
19583        let a = _mm512_set1_pch(0.0, 1.0);
19584        let b = _mm512_set1_pch(0.0, 2.0);
19585        let c = _mm512_set1_pch(0.0, 3.0);
19586        let r = _mm512_fmadd_pch(a, b, c);
19587        let e = _mm512_set1_pch(-2.0, 3.0);
19588        assert_eq_m512h(r, e);
19589    }
19590
19591    #[simd_test(enable = "avx512fp16")]
19592    unsafe fn test_mm512_mask_fmadd_pch() {
19593        let a = _mm512_set1_pch(0.0, 1.0);
19594        let b = _mm512_set1_pch(0.0, 2.0);
19595        let c = _mm512_set1_pch(0.0, 3.0);
19596        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19597        let e = _mm512_setr_ph(
19598            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19599            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19600        );
19601        assert_eq_m512h(r, e);
19602    }
19603
19604    #[simd_test(enable = "avx512fp16")]
19605    unsafe fn test_mm512_mask3_fmadd_pch() {
19606        let a = _mm512_set1_pch(0.0, 1.0);
19607        let b = _mm512_set1_pch(0.0, 2.0);
19608        let c = _mm512_set1_pch(0.0, 3.0);
19609        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19610        let e = _mm512_setr_ph(
19611            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19612            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19613        );
19614        assert_eq_m512h(r, e);
19615    }
19616
19617    #[simd_test(enable = "avx512fp16")]
19618    unsafe fn test_mm512_maskz_fmadd_pch() {
19619        let a = _mm512_set1_pch(0.0, 1.0);
19620        let b = _mm512_set1_pch(0.0, 2.0);
19621        let c = _mm512_set1_pch(0.0, 3.0);
19622        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19623        let e = _mm512_setr_ph(
19624            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19625            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19626        );
19627        assert_eq_m512h(r, e);
19628    }
19629
19630    #[simd_test(enable = "avx512fp16")]
19631    unsafe fn test_mm512_fmadd_round_pch() {
19632        let a = _mm512_set1_pch(0.0, 1.0);
19633        let b = _mm512_set1_pch(0.0, 2.0);
19634        let c = _mm512_set1_pch(0.0, 3.0);
19635        let r =
19636            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19637        let e = _mm512_set1_pch(-2.0, 3.0);
19638        assert_eq_m512h(r, e);
19639    }
19640
19641    #[simd_test(enable = "avx512fp16")]
19642    unsafe fn test_mm512_mask_fmadd_round_pch() {
19643        let a = _mm512_set1_pch(0.0, 1.0);
19644        let b = _mm512_set1_pch(0.0, 2.0);
19645        let c = _mm512_set1_pch(0.0, 3.0);
19646        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19647            a,
19648            0b0101010101010101,
19649            b,
19650            c,
19651        );
19652        let e = _mm512_setr_ph(
19653            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19654            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19655        );
19656        assert_eq_m512h(r, e);
19657    }
19658
19659    #[simd_test(enable = "avx512fp16")]
19660    unsafe fn test_mm512_mask3_fmadd_round_pch() {
19661        let a = _mm512_set1_pch(0.0, 1.0);
19662        let b = _mm512_set1_pch(0.0, 2.0);
19663        let c = _mm512_set1_pch(0.0, 3.0);
19664        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19665            a,
19666            b,
19667            c,
19668            0b0101010101010101,
19669        );
19670        let e = _mm512_setr_ph(
19671            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19672            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19673        );
19674        assert_eq_m512h(r, e);
19675    }
19676
19677    #[simd_test(enable = "avx512fp16")]
19678    unsafe fn test_mm512_maskz_fmadd_round_pch() {
19679        let a = _mm512_set1_pch(0.0, 1.0);
19680        let b = _mm512_set1_pch(0.0, 2.0);
19681        let c = _mm512_set1_pch(0.0, 3.0);
19682        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19683            0b0101010101010101,
19684            a,
19685            b,
19686            c,
19687        );
19688        let e = _mm512_setr_ph(
19689            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19690            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19691        );
19692        assert_eq_m512h(r, e);
19693    }
19694
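    // Scalar complex FMA (`fmadd_sch`): only the lowest (re, im) pair is computed; per the
    // expectations below, the remaining six lanes pass through from a (or from c for the
    // mask3 variants).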
19695    #[simd_test(enable = "avx512fp16")]
19696    unsafe fn test_mm_fmadd_sch() {
19697        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19698        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19699        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19700        let r = _mm_fmadd_sch(a, b, c);
19701        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19702        assert_eq_m128h(r, e);
19703    }
19704
19705    #[simd_test(enable = "avx512fp16")]
19706    unsafe fn test_mm_mask_fmadd_sch() {
19707        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19708        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19709        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19710        let r = _mm_mask_fmadd_sch(a, 0, b, c);
19711        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19712        assert_eq_m128h(r, e);
19713        let r = _mm_mask_fmadd_sch(a, 1, b, c);
19714        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19715        assert_eq_m128h(r, e);
19716    }
19717
19718    #[simd_test(enable = "avx512fp16")]
19719    unsafe fn test_mm_mask3_fmadd_sch() {
19720        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19721        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19722        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19723        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
19724        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19725        assert_eq_m128h(r, e);
19726        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
19727        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19728        assert_eq_m128h(r, e);
19729    }
19730
19731    #[simd_test(enable = "avx512fp16")]
19732    unsafe fn test_mm_maskz_fmadd_sch() {
19733        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19734        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19735        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19736        let r = _mm_maskz_fmadd_sch(0, a, b, c);
19737        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19738        assert_eq_m128h(r, e);
19739        let r = _mm_maskz_fmadd_sch(1, a, b, c);
19740        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19741        assert_eq_m128h(r, e);
19742    }
19743
19744    #[simd_test(enable = "avx512fp16")]
19745    unsafe fn test_mm_fmadd_round_sch() {
19746        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19747        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19748        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19749        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19750        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19751        assert_eq_m128h(r, e);
19752    }
19753
19754    #[simd_test(enable = "avx512fp16")]
19755    unsafe fn test_mm_mask_fmadd_round_sch() {
19756        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19757        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19758        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19759        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19760            a, 0, b, c,
19761        );
19762        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19763        assert_eq_m128h(r, e);
19764        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19765            a, 1, b, c,
19766        );
19767        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19768        assert_eq_m128h(r, e);
19769    }
19770
19771    #[simd_test(enable = "avx512fp16")]
19772    unsafe fn test_mm_mask3_fmadd_round_sch() {
19773        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19774        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19775        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19776        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19777            a, b, c, 0,
19778        );
19779        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19780        assert_eq_m128h(r, e);
19781        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19782            a, b, c, 1,
19783        );
19784        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19785        assert_eq_m128h(r, e);
19786    }
19787
19788    #[simd_test(enable = "avx512fp16")]
19789    unsafe fn test_mm_maskz_fmadd_round_sch() {
19790        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19791        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19792        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19793        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19794            0, a, b, c,
19795        );
19796        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19797        assert_eq_m128h(r, e);
19798        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19799            1, a, b, c,
19800        );
19801        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19802        assert_eq_m128h(r, e);
19803    }
19804
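    // Conjugate complex FMA (`fcmadd_pch`): one multiplicand is taken as its complex
    // conjugate, so the a.im*b.im product is added rather than subtracted in the real part:
    // re = 0*0 + 1*2 + 0 = 2. Both cross terms are zero for these inputs, so im stays 3.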
19805    #[simd_test(enable = "avx512fp16,avx512vl")]
19806    unsafe fn test_mm_fcmadd_pch() {
19807        let a = _mm_set1_pch(0.0, 1.0);
19808        let b = _mm_set1_pch(0.0, 2.0);
19809        let c = _mm_set1_pch(0.0, 3.0);
19810        let r = _mm_fcmadd_pch(a, b, c);
19811        let e = _mm_set1_pch(2.0, 3.0);
19812        assert_eq_m128h(r, e);
19813    }
19814
19815    #[simd_test(enable = "avx512fp16,avx512vl")]
19816    unsafe fn test_mm_mask_fcmadd_pch() {
19817        let a = _mm_set1_pch(0.0, 1.0);
19818        let b = _mm_set1_pch(0.0, 2.0);
19819        let c = _mm_set1_pch(0.0, 3.0);
19820        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
19821        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
19822        assert_eq_m128h(r, e);
19823    }
19824
19825    #[simd_test(enable = "avx512fp16,avx512vl")]
19826    unsafe fn test_mm_mask3_fcmadd_pch() {
19827        let a = _mm_set1_pch(0.0, 1.0);
19828        let b = _mm_set1_pch(0.0, 2.0);
19829        let c = _mm_set1_pch(0.0, 3.0);
19830        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
19831        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
19832        assert_eq_m128h(r, e);
19833    }
19834
19835    #[simd_test(enable = "avx512fp16,avx512vl")]
19836    unsafe fn test_mm_maskz_fcmadd_pch() {
19837        let a = _mm_set1_pch(0.0, 1.0);
19838        let b = _mm_set1_pch(0.0, 2.0);
19839        let c = _mm_set1_pch(0.0, 3.0);
19840        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
19841        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
19842        assert_eq_m128h(r, e);
19843    }
19844
19845    #[simd_test(enable = "avx512fp16,avx512vl")]
19846    unsafe fn test_mm256_fcmadd_pch() {
19847        let a = _mm256_set1_pch(0.0, 1.0);
19848        let b = _mm256_set1_pch(0.0, 2.0);
19849        let c = _mm256_set1_pch(0.0, 3.0);
19850        let r = _mm256_fcmadd_pch(a, b, c);
19851        let e = _mm256_set1_pch(2.0, 3.0);
19852        assert_eq_m256h(r, e);
19853    }
19854
19855    #[simd_test(enable = "avx512fp16,avx512vl")]
19856    unsafe fn test_mm256_mask_fcmadd_pch() {
19857        let a = _mm256_set1_pch(0.0, 1.0);
19858        let b = _mm256_set1_pch(0.0, 2.0);
19859        let c = _mm256_set1_pch(0.0, 3.0);
19860        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
19861        let e = _mm256_setr_ph(
19862            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19863        );
19864        assert_eq_m256h(r, e);
19865    }
19866
19867    #[simd_test(enable = "avx512fp16,avx512vl")]
19868    unsafe fn test_mm256_mask3_fcmadd_pch() {
19869        let a = _mm256_set1_pch(0.0, 1.0);
19870        let b = _mm256_set1_pch(0.0, 2.0);
19871        let c = _mm256_set1_pch(0.0, 3.0);
19872        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
19873        let e = _mm256_setr_ph(
19874            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19875        );
19876        assert_eq_m256h(r, e);
19877    }
19878
19879    #[simd_test(enable = "avx512fp16,avx512vl")]
19880    unsafe fn test_mm256_maskz_fcmadd_pch() {
19881        let a = _mm256_set1_pch(0.0, 1.0);
19882        let b = _mm256_set1_pch(0.0, 2.0);
19883        let c = _mm256_set1_pch(0.0, 3.0);
19884        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
19885        let e = _mm256_setr_ph(
19886            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19887        );
19888        assert_eq_m256h(r, e);
19889    }
19890
19891    #[simd_test(enable = "avx512fp16")]
19892    unsafe fn test_mm512_fcmadd_pch() {
19893        let a = _mm512_set1_pch(0.0, 1.0);
19894        let b = _mm512_set1_pch(0.0, 2.0);
19895        let c = _mm512_set1_pch(0.0, 3.0);
19896        let r = _mm512_fcmadd_pch(a, b, c);
19897        let e = _mm512_set1_pch(2.0, 3.0);
19898        assert_eq_m512h(r, e);
19899    }
19900
19901    #[simd_test(enable = "avx512fp16")]
19902    unsafe fn test_mm512_mask_fcmadd_pch() {
19903        let a = _mm512_set1_pch(0.0, 1.0);
19904        let b = _mm512_set1_pch(0.0, 2.0);
19905        let c = _mm512_set1_pch(0.0, 3.0);
19906        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
19907        let e = _mm512_setr_ph(
19908            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19909            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19910        );
19911        assert_eq_m512h(r, e);
19912    }
19913
19914    #[simd_test(enable = "avx512fp16")]
19915    unsafe fn test_mm512_mask3_fcmadd_pch() {
19916        let a = _mm512_set1_pch(0.0, 1.0);
19917        let b = _mm512_set1_pch(0.0, 2.0);
19918        let c = _mm512_set1_pch(0.0, 3.0);
19919        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
19920        let e = _mm512_setr_ph(
19921            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19922            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19923        );
19924        assert_eq_m512h(r, e);
19925    }
19926
19927    #[simd_test(enable = "avx512fp16")]
19928    unsafe fn test_mm512_maskz_fcmadd_pch() {
19929        let a = _mm512_set1_pch(0.0, 1.0);
19930        let b = _mm512_set1_pch(0.0, 2.0);
19931        let c = _mm512_set1_pch(0.0, 3.0);
19932        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
19933        let e = _mm512_setr_ph(
19934            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
19935            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19936        );
19937        assert_eq_m512h(r, e);
19938    }
19939
19940    #[simd_test(enable = "avx512fp16")]
19941    unsafe fn test_mm512_fcmadd_round_pch() {
19942        let a = _mm512_set1_pch(0.0, 1.0);
19943        let b = _mm512_set1_pch(0.0, 2.0);
19944        let c = _mm512_set1_pch(0.0, 3.0);
19945        let r =
19946            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19947        let e = _mm512_set1_pch(2.0, 3.0);
19948        assert_eq_m512h(r, e);
19949    }
19950
19951    #[simd_test(enable = "avx512fp16")]
19952    unsafe fn test_mm512_mask_fcmadd_round_pch() {
19953        let a = _mm512_set1_pch(0.0, 1.0);
19954        let b = _mm512_set1_pch(0.0, 2.0);
19955        let c = _mm512_set1_pch(0.0, 3.0);
19956        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19957            a,
19958            0b0101010101010101,
19959            b,
19960            c,
19961        );
19962        let e = _mm512_setr_ph(
19963            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19964            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19965        );
19966        assert_eq_m512h(r, e);
19967    }
19968
19969    #[simd_test(enable = "avx512fp16")]
19970    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
19971        let a = _mm512_set1_pch(0.0, 1.0);
19972        let b = _mm512_set1_pch(0.0, 2.0);
19973        let c = _mm512_set1_pch(0.0, 3.0);
19974        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19975            a,
19976            b,
19977            c,
19978            0b0101010101010101,
19979        );
19980        let e = _mm512_setr_ph(
19981            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19982            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19983        );
19984        assert_eq_m512h(r, e);
19985    }
19986
19987    #[simd_test(enable = "avx512fp16")]
19988    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
19989        let a = _mm512_set1_pch(0.0, 1.0);
19990        let b = _mm512_set1_pch(0.0, 2.0);
19991        let c = _mm512_set1_pch(0.0, 3.0);
19992        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19993            0b0101010101010101,
19994            a,
19995            b,
19996            c,
19997        );
19998        let e = _mm512_setr_ph(
19999            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20000            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20001        );
20002        assert_eq_m512h(r, e);
20003    }
20004
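    // Scalar conjugate FMA (`fcmadd_sch`): same lane handling as `fmadd_sch` above, with
    // the conjugate product 2+3i in the lowest pair.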
20005    #[simd_test(enable = "avx512fp16")]
20006    unsafe fn test_mm_fcmadd_sch() {
20007        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20008        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20009        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20010        let r = _mm_fcmadd_sch(a, b, c);
20011        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20012        assert_eq_m128h(r, e);
20013    }
20014
20015    #[simd_test(enable = "avx512fp16")]
20016    unsafe fn test_mm_mask_fcmadd_sch() {
20017        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20018        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20019        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20020        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
20021        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20022        assert_eq_m128h(r, e);
20023        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
20024        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20025        assert_eq_m128h(r, e);
20026    }
20027
20028    #[simd_test(enable = "avx512fp16")]
20029    unsafe fn test_mm_mask3_fcmadd_sch() {
20030        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20031        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20032        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20033        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
20034        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20035        assert_eq_m128h(r, e);
20036        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
20037        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20038        assert_eq_m128h(r, e);
20039    }
20040
20041    #[simd_test(enable = "avx512fp16")]
20042    unsafe fn test_mm_maskz_fcmadd_sch() {
20043        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20044        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20045        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20046        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
20047        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20048        assert_eq_m128h(r, e);
20049        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
20050        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20051        assert_eq_m128h(r, e);
20052    }
20053
20054    #[simd_test(enable = "avx512fp16")]
20055    unsafe fn test_mm_fcmadd_round_sch() {
20056        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20057        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20058        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20059        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20060        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20061        assert_eq_m128h(r, e);
20062    }
20063
20064    #[simd_test(enable = "avx512fp16")]
20065    unsafe fn test_mm_mask_fcmadd_round_sch() {
20066        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20067        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20068        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20069        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20070            a, 0, b, c,
20071        );
20072        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20073        assert_eq_m128h(r, e);
20074        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20075            a, 1, b, c,
20076        );
20077        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20078        assert_eq_m128h(r, e);
20079    }
20080
20081    #[simd_test(enable = "avx512fp16")]
20082    unsafe fn test_mm_mask3_fcmadd_round_sch() {
20083        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20084        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20085        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20086        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20087            a, b, c, 0,
20088        );
20089        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20090        assert_eq_m128h(r, e);
20091        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20092            a, b, c, 1,
20093        );
20094        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20095        assert_eq_m128h(r, e);
20096    }
20097
20098    #[simd_test(enable = "avx512fp16")]
20099    unsafe fn test_mm_maskz_fcmadd_round_sch() {
20100        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20101        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20102        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20103        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20104            0, a, b, c,
20105        );
20106        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20107        assert_eq_m128h(r, e);
20108        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20109            1, a, b, c,
20110        );
20111        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20112        assert_eq_m128h(r, e);
20113    }
20114
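    // Real (non-complex) packed FMA over f16 lanes: 1.0 * 2.0 + 3.0 = 5.0 in every active
    // lane. Note the expectations below switch to `_mm_set_ph` (highest element first), so
    // mask bit 0 corresponds to the rightmost value.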
20115    #[simd_test(enable = "avx512fp16,avx512vl")]
20116    unsafe fn test_mm_fmadd_ph() {
20117        let a = _mm_set1_ph(1.0);
20118        let b = _mm_set1_ph(2.0);
20119        let c = _mm_set1_ph(3.0);
20120        let r = _mm_fmadd_ph(a, b, c);
20121        let e = _mm_set1_ph(5.0);
20122        assert_eq_m128h(r, e);
20123    }
20124
20125    #[simd_test(enable = "avx512fp16,avx512vl")]
20126    unsafe fn test_mm_mask_fmadd_ph() {
20127        let a = _mm_set1_ph(1.0);
20128        let b = _mm_set1_ph(2.0);
20129        let c = _mm_set1_ph(3.0);
20130        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
20131        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
20132        assert_eq_m128h(r, e);
20133    }
20134
20135    #[simd_test(enable = "avx512fp16,avx512vl")]
20136    unsafe fn test_mm_mask3_fmadd_ph() {
20137        let a = _mm_set1_ph(1.0);
20138        let b = _mm_set1_ph(2.0);
20139        let c = _mm_set1_ph(3.0);
20140        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
20141        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
20142        assert_eq_m128h(r, e);
20143    }
20144
20145    #[simd_test(enable = "avx512fp16,avx512vl")]
20146    unsafe fn test_mm_maskz_fmadd_ph() {
20147        let a = _mm_set1_ph(1.0);
20148        let b = _mm_set1_ph(2.0);
20149        let c = _mm_set1_ph(3.0);
20150        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
20151        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
20152        assert_eq_m128h(r, e);
20153    }
20154
20155    #[simd_test(enable = "avx512fp16,avx512vl")]
20156    unsafe fn test_mm256_fmadd_ph() {
20157        let a = _mm256_set1_ph(1.0);
20158        let b = _mm256_set1_ph(2.0);
20159        let c = _mm256_set1_ph(3.0);
20160        let r = _mm256_fmadd_ph(a, b, c);
20161        let e = _mm256_set1_ph(5.0);
20162        assert_eq_m256h(r, e);
20163    }
20164
20165    #[simd_test(enable = "avx512fp16,avx512vl")]
20166    unsafe fn test_mm256_mask_fmadd_ph() {
20167        let a = _mm256_set1_ph(1.0);
20168        let b = _mm256_set1_ph(2.0);
20169        let c = _mm256_set1_ph(3.0);
20170        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
20171        let e = _mm256_set_ph(
20172            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20173        );
20174        assert_eq_m256h(r, e);
20175    }
20176
20177    #[simd_test(enable = "avx512fp16,avx512vl")]
20178    unsafe fn test_mm256_mask3_fmadd_ph() {
20179        let a = _mm256_set1_ph(1.0);
20180        let b = _mm256_set1_ph(2.0);
20181        let c = _mm256_set1_ph(3.0);
20182        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
20183        let e = _mm256_set_ph(
20184            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20185        );
20186        assert_eq_m256h(r, e);
20187    }
20188
20189    #[simd_test(enable = "avx512fp16,avx512vl")]
20190    unsafe fn test_mm256_maskz_fmadd_ph() {
20191        let a = _mm256_set1_ph(1.0);
20192        let b = _mm256_set1_ph(2.0);
20193        let c = _mm256_set1_ph(3.0);
20194        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
20195        let e = _mm256_set_ph(
20196            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20197        );
20198        assert_eq_m256h(r, e);
20199    }
20200
20201    #[simd_test(enable = "avx512fp16")]
20202    unsafe fn test_mm512_fmadd_ph() {
20203        let a = _mm512_set1_ph(1.0);
20204        let b = _mm512_set1_ph(2.0);
20205        let c = _mm512_set1_ph(3.0);
20206        let r = _mm512_fmadd_ph(a, b, c);
20207        let e = _mm512_set1_ph(5.0);
20208        assert_eq_m512h(r, e);
20209    }
20210
20211    #[simd_test(enable = "avx512fp16")]
20212    unsafe fn test_mm512_mask_fmadd_ph() {
20213        let a = _mm512_set1_ph(1.0);
20214        let b = _mm512_set1_ph(2.0);
20215        let c = _mm512_set1_ph(3.0);
20216        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20217        let e = _mm512_set_ph(
20218            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20219            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20220        );
20221        assert_eq_m512h(r, e);
20222    }
20223
20224    #[simd_test(enable = "avx512fp16")]
20225    unsafe fn test_mm512_mask3_fmadd_ph() {
20226        let a = _mm512_set1_ph(1.0);
20227        let b = _mm512_set1_ph(2.0);
20228        let c = _mm512_set1_ph(3.0);
20229        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20230        let e = _mm512_set_ph(
20231            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20232            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20233        );
20234        assert_eq_m512h(r, e);
20235    }
20236
20237    #[simd_test(enable = "avx512fp16")]
20238    unsafe fn test_mm512_maskz_fmadd_ph() {
20239        let a = _mm512_set1_ph(1.0);
20240        let b = _mm512_set1_ph(2.0);
20241        let c = _mm512_set1_ph(3.0);
20242        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
20243        let e = _mm512_set_ph(
20244            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20245            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20246        );
20247        assert_eq_m512h(r, e);
20248    }
20249
20250    #[simd_test(enable = "avx512fp16")]
20251    unsafe fn test_mm512_fmadd_round_ph() {
20252        let a = _mm512_set1_ph(1.0);
20253        let b = _mm512_set1_ph(2.0);
20254        let c = _mm512_set1_ph(3.0);
20255        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20256        let e = _mm512_set1_ph(5.0);
20257        assert_eq_m512h(r, e);
20258    }
20259
20260    #[simd_test(enable = "avx512fp16")]
20261    unsafe fn test_mm512_mask_fmadd_round_ph() {
20262        let a = _mm512_set1_ph(1.0);
20263        let b = _mm512_set1_ph(2.0);
20264        let c = _mm512_set1_ph(3.0);
20265        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20266            a,
20267            0b01010101010101010101010101010101,
20268            b,
20269            c,
20270        );
20271        let e = _mm512_set_ph(
20272            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20273            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20274        );
20275        assert_eq_m512h(r, e);
20276    }
20277
20278    #[simd_test(enable = "avx512fp16")]
20279    unsafe fn test_mm512_mask3_fmadd_round_ph() {
20280        let a = _mm512_set1_ph(1.0);
20281        let b = _mm512_set1_ph(2.0);
20282        let c = _mm512_set1_ph(3.0);
20283        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20284            a,
20285            b,
20286            c,
20287            0b01010101010101010101010101010101,
20288        );
20289        let e = _mm512_set_ph(
20290            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20291            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20292        );
20293        assert_eq_m512h(r, e);
20294    }
20295
20296    #[simd_test(enable = "avx512fp16")]
20297    unsafe fn test_mm512_maskz_fmadd_round_ph() {
20298        let a = _mm512_set1_ph(1.0);
20299        let b = _mm512_set1_ph(2.0);
20300        let c = _mm512_set1_ph(3.0);
20301        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20302            0b01010101010101010101010101010101,
20303            a,
20304            b,
20305            c,
20306        );
20307        let e = _mm512_set_ph(
20308            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20309            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20310        );
20311        assert_eq_m512h(r, e);
20312    }
20313
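    // Scalar real FMA (`fmadd_sh`): element 0 becomes 1.0 * 2.0 + 3.0 = 5.0; the upper
    // seven elements pass through from a (or from c for the mask3 variants).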
20314    #[simd_test(enable = "avx512fp16")]
20315    unsafe fn test_mm_fmadd_sh() {
20316        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20317        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20318        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20319        let r = _mm_fmadd_sh(a, b, c);
20320        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20321        assert_eq_m128h(r, e);
20322    }
20323
20324    #[simd_test(enable = "avx512fp16")]
20325    unsafe fn test_mm_mask_fmadd_sh() {
20326        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20327        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20328        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20329        let r = _mm_mask_fmadd_sh(a, 0, b, c);
20330        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20331        assert_eq_m128h(r, e);
20332        let r = _mm_mask_fmadd_sh(a, 1, b, c);
20333        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20334        assert_eq_m128h(r, e);
20335    }
20336
20337    #[simd_test(enable = "avx512fp16")]
20338    unsafe fn test_mm_mask3_fmadd_sh() {
20339        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20340        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20341        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20342        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
20343        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20344        assert_eq_m128h(r, e);
20345        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
20346        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20347        assert_eq_m128h(r, e);
20348    }
20349
20350    #[simd_test(enable = "avx512fp16")]
20351    unsafe fn test_mm_maskz_fmadd_sh() {
20352        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20353        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20354        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20355        let r = _mm_maskz_fmadd_sh(0, a, b, c);
20356        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20357        assert_eq_m128h(r, e);
20358        let r = _mm_maskz_fmadd_sh(1, a, b, c);
20359        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20360        assert_eq_m128h(r, e);
20361    }
20362
20363    #[simd_test(enable = "avx512fp16")]
20364    unsafe fn test_mm_fmadd_round_sh() {
20365        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20366        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20367        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20368        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20369        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20370        assert_eq_m128h(r, e);
20371    }
20372
20373    #[simd_test(enable = "avx512fp16")]
20374    unsafe fn test_mm_mask_fmadd_round_sh() {
20375        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20376        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20377        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20378        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20379            a, 0, b, c,
20380        );
20381        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20382        assert_eq_m128h(r, e);
20383        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20384            a, 1, b, c,
20385        );
20386        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20387        assert_eq_m128h(r, e);
20388    }
20389
20390    #[simd_test(enable = "avx512fp16")]
20391    unsafe fn test_mm_mask3_fmadd_round_sh() {
20392        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20393        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20394        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20395        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20396            a, b, c, 0,
20397        );
20398        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20399        assert_eq_m128h(r, e);
20400        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20401            a, b, c, 1,
20402        );
20403        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20404        assert_eq_m128h(r, e);
20405    }
20406
20407    #[simd_test(enable = "avx512fp16")]
20408    unsafe fn test_mm_maskz_fmadd_round_sh() {
20409        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20410        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20411        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20412        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20413            0, a, b, c,
20414        );
20415        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20416        assert_eq_m128h(r, e);
20417        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20418            1, a, b, c,
20419        );
20420        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20421        assert_eq_m128h(r, e);
20422    }
20423
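    // Fused multiply-subtract: 1.0 * 2.0 - 3.0 = -1.0 in every active lane.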
20424    #[simd_test(enable = "avx512fp16,avx512vl")]
20425    unsafe fn test_mm_fmsub_ph() {
20426        let a = _mm_set1_ph(1.0);
20427        let b = _mm_set1_ph(2.0);
20428        let c = _mm_set1_ph(3.0);
20429        let r = _mm_fmsub_ph(a, b, c);
20430        let e = _mm_set1_ph(-1.0);
20431        assert_eq_m128h(r, e);
20432    }
20433
20434    #[simd_test(enable = "avx512fp16,avx512vl")]
20435    unsafe fn test_mm_mask_fmsub_ph() {
20436        let a = _mm_set1_ph(1.0);
20437        let b = _mm_set1_ph(2.0);
20438        let c = _mm_set1_ph(3.0);
20439        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
20440        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
20441        assert_eq_m128h(r, e);
20442    }
20443
20444    #[simd_test(enable = "avx512fp16,avx512vl")]
20445    unsafe fn test_mm_mask3_fmsub_ph() {
20446        let a = _mm_set1_ph(1.0);
20447        let b = _mm_set1_ph(2.0);
20448        let c = _mm_set1_ph(3.0);
20449        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
20450        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
20451        assert_eq_m128h(r, e);
20452    }
20453
20454    #[simd_test(enable = "avx512fp16,avx512vl")]
20455    unsafe fn test_mm_maskz_fmsub_ph() {
20456        let a = _mm_set1_ph(1.0);
20457        let b = _mm_set1_ph(2.0);
20458        let c = _mm_set1_ph(3.0);
20459        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
20460        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
20461        assert_eq_m128h(r, e);
20462    }
20463
20464    #[simd_test(enable = "avx512fp16,avx512vl")]
20465    unsafe fn test_mm256_fmsub_ph() {
20466        let a = _mm256_set1_ph(1.0);
20467        let b = _mm256_set1_ph(2.0);
20468        let c = _mm256_set1_ph(3.0);
20469        let r = _mm256_fmsub_ph(a, b, c);
20470        let e = _mm256_set1_ph(-1.0);
20471        assert_eq_m256h(r, e);
20472    }
20473
20474    #[simd_test(enable = "avx512fp16,avx512vl")]
20475    unsafe fn test_mm256_mask_fmsub_ph() {
20476        let a = _mm256_set1_ph(1.0);
20477        let b = _mm256_set1_ph(2.0);
20478        let c = _mm256_set1_ph(3.0);
20479        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
20480        let e = _mm256_set_ph(
20481            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20482        );
20483        assert_eq_m256h(r, e);
20484    }
20485
20486    #[simd_test(enable = "avx512fp16,avx512vl")]
20487    unsafe fn test_mm256_mask3_fmsub_ph() {
20488        let a = _mm256_set1_ph(1.0);
20489        let b = _mm256_set1_ph(2.0);
20490        let c = _mm256_set1_ph(3.0);
20491        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
20492        let e = _mm256_set_ph(
20493            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20494        );
20495        assert_eq_m256h(r, e);
20496    }
20497
20498    #[simd_test(enable = "avx512fp16,avx512vl")]
20499    unsafe fn test_mm256_maskz_fmsub_ph() {
20500        let a = _mm256_set1_ph(1.0);
20501        let b = _mm256_set1_ph(2.0);
20502        let c = _mm256_set1_ph(3.0);
20503        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
20504        let e = _mm256_set_ph(
20505            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20506        );
20507        assert_eq_m256h(r, e);
20508    }
20509
20510    #[simd_test(enable = "avx512fp16")]
20511    unsafe fn test_mm512_fmsub_ph() {
20512        let a = _mm512_set1_ph(1.0);
20513        let b = _mm512_set1_ph(2.0);
20514        let c = _mm512_set1_ph(3.0);
20515        let r = _mm512_fmsub_ph(a, b, c);
20516        let e = _mm512_set1_ph(-1.0);
20517        assert_eq_m512h(r, e);
20518    }
20519
20520    #[simd_test(enable = "avx512fp16")]
20521    unsafe fn test_mm512_mask_fmsub_ph() {
20522        let a = _mm512_set1_ph(1.0);
20523        let b = _mm512_set1_ph(2.0);
20524        let c = _mm512_set1_ph(3.0);
20525        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20526        let e = _mm512_set_ph(
20527            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20528            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20529        );
20530        assert_eq_m512h(r, e);
20531    }
20532
20533    #[simd_test(enable = "avx512fp16")]
20534    unsafe fn test_mm512_mask3_fmsub_ph() {
20535        let a = _mm512_set1_ph(1.0);
20536        let b = _mm512_set1_ph(2.0);
20537        let c = _mm512_set1_ph(3.0);
20538        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20539        let e = _mm512_set_ph(
20540            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20541            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20542        );
20543        assert_eq_m512h(r, e);
20544    }
20545
20546    #[simd_test(enable = "avx512fp16")]
20547    unsafe fn test_mm512_maskz_fmsub_ph() {
20548        let a = _mm512_set1_ph(1.0);
20549        let b = _mm512_set1_ph(2.0);
20550        let c = _mm512_set1_ph(3.0);
20551        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20552        let e = _mm512_set_ph(
20553            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20554            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20555        );
20556        assert_eq_m512h(r, e);
20557    }
20558
20559    #[simd_test(enable = "avx512fp16")]
20560    unsafe fn test_mm512_fmsub_round_ph() {
20561        let a = _mm512_set1_ph(1.0);
20562        let b = _mm512_set1_ph(2.0);
20563        let c = _mm512_set1_ph(3.0);
20564        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20565        let e = _mm512_set1_ph(-1.0);
20566        assert_eq_m512h(r, e);
20567    }
20568
20569    #[simd_test(enable = "avx512fp16")]
20570    unsafe fn test_mm512_mask_fmsub_round_ph() {
20571        let a = _mm512_set1_ph(1.0);
20572        let b = _mm512_set1_ph(2.0);
20573        let c = _mm512_set1_ph(3.0);
20574        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20575            a,
20576            0b01010101010101010101010101010101,
20577            b,
20578            c,
20579        );
20580        let e = _mm512_set_ph(
20581            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20582            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20583        );
20584        assert_eq_m512h(r, e);
20585    }
20586
20587    #[simd_test(enable = "avx512fp16")]
20588    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20589        let a = _mm512_set1_ph(1.0);
20590        let b = _mm512_set1_ph(2.0);
20591        let c = _mm512_set1_ph(3.0);
20592        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20593            a,
20594            b,
20595            c,
20596            0b01010101010101010101010101010101,
20597        );
20598        let e = _mm512_set_ph(
20599            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20600            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20601        );
20602        assert_eq_m512h(r, e);
20603    }
20604
20605    #[simd_test(enable = "avx512fp16")]
20606    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20607        let a = _mm512_set1_ph(1.0);
20608        let b = _mm512_set1_ph(2.0);
20609        let c = _mm512_set1_ph(3.0);
20610        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20611            0b01010101010101010101010101010101,
20612            a,
20613            b,
20614            c,
20615        );
20616        let e = _mm512_set_ph(
20617            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20618            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20619        );
20620        assert_eq_m512h(r, e);
20621    }
20622
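    // Scalar fused multiply-subtract: element 0 becomes 1.0 * 2.0 - 3.0 = -1.0, with the
    // same pass-through behaviour as `fmadd_sh`.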
20623    #[simd_test(enable = "avx512fp16")]
20624    unsafe fn test_mm_fmsub_sh() {
20625        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20626        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20627        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20628        let r = _mm_fmsub_sh(a, b, c);
20629        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20630        assert_eq_m128h(r, e);
20631    }
20632
20633    #[simd_test(enable = "avx512fp16")]
20634    unsafe fn test_mm_mask_fmsub_sh() {
20635        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20636        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20637        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20638        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20639        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20640        assert_eq_m128h(r, e);
20641        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20642        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20643        assert_eq_m128h(r, e);
20644    }
20645
20646    #[simd_test(enable = "avx512fp16")]
20647    unsafe fn test_mm_mask3_fmsub_sh() {
20648        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20649        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20650        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20651        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20652        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20653        assert_eq_m128h(r, e);
20654        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20655        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20656        assert_eq_m128h(r, e);
20657    }
20658
20659    #[simd_test(enable = "avx512fp16")]
20660    unsafe fn test_mm_maskz_fmsub_sh() {
20661        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20662        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20663        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20664        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20665        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20666        assert_eq_m128h(r, e);
20667        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20668        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20669        assert_eq_m128h(r, e);
20670    }
20671
20672    #[simd_test(enable = "avx512fp16")]
20673    unsafe fn test_mm_fmsub_round_sh() {
20674        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20675        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20676        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20677        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20678        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20679        assert_eq_m128h(r, e);
20680    }
20681
20682    #[simd_test(enable = "avx512fp16")]
20683    unsafe fn test_mm_mask_fmsub_round_sh() {
20684        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20685        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20686        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20687        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20688            a, 0, b, c,
20689        );
20690        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20691        assert_eq_m128h(r, e);
20692        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20693            a, 1, b, c,
20694        );
20695        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20696        assert_eq_m128h(r, e);
20697    }
20698
20699    #[simd_test(enable = "avx512fp16")]
20700    unsafe fn test_mm_mask3_fmsub_round_sh() {
20701        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20702        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20703        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20704        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20705            a, b, c, 0,
20706        );
20707        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20708        assert_eq_m128h(r, e);
20709        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20710            a, b, c, 1,
20711        );
20712        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20713        assert_eq_m128h(r, e);
20714    }
20715
20716    #[simd_test(enable = "avx512fp16")]
20717    unsafe fn test_mm_maskz_fmsub_round_sh() {
20718        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20719        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20720        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20721        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20722            0, a, b, c,
20723        );
20724        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20725        assert_eq_m128h(r, e);
20726        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20727            1, a, b, c,
20728        );
20729        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20730        assert_eq_m128h(r, e);
20731    }
20732
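    // Negated fused multiply-add: -(1.0 * 2.0) + 3.0 = 1.0 in every active lane.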
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fnmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fnmadd_ph(a, b, c);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fnmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fnmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fnmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fnmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fnmadd_ph(a, b, c);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fnmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fnmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fnmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fnmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fnmadd_ph(a, b, c);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fnmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fnmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fnmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

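    // The *_round_* variants take the rounding/SAE control as a const generic; these tests pin
    // it to _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, so the expected values are the same
    // as for the non-round forms above.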
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fnmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fnmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

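    // The scalar _sh tests operate on element 0 only; the remaining lanes of the result are
    // copied from a (or from c for the mask3 forms), which is why the upper lanes of e mirror
    // the corresponding input vector.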
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fnmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmadd_sh(a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fnmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fnmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fnmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fnmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fnmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fnmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fnmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

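    // fnmsub computes -(a * b) - c, so with a = 1.0, b = 2.0 and c = 3.0 each selected lane
    // below is -(1.0 * 2.0) - 3.0 = -5.0.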
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fnmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fnmsub_ph(a, b, c);
        let e = _mm_set1_ph(-5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fnmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fnmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fnmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fnmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fnmsub_ph(a, b, c);
        let e = _mm256_set1_ph(-5.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fnmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fnmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fnmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fnmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fnmsub_ph(a, b, c);
        let e = _mm512_set1_ph(-5.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fnmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fnmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fnmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fnmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(-5.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fnmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fnmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmsub_sh(a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fnmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fnmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fnmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

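    // fmaddsub alternates per lane: odd-indexed lanes compute a * b + c and even-indexed lanes
    // compute a * b - c, giving 5.0 and -1.0 respectively for a = 1.0, b = 2.0, c = 3.0. Note
    // that _mm_set_ph lists its arguments from the highest lane down to lane 0.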
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmaddsub_ph(a, b, c);
        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmaddsub_ph(a, b, c);
        let e = _mm256_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmaddsub_ph(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

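    // fmsubadd is the mirror image of fmaddsub: even-indexed lanes compute a * b + c (5.0) and
    // odd-indexed lanes compute a * b - c (-1.0).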
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmsubadd_ph(a, b, c);
        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmsubadd_ph(a, b, c);
        let e = _mm256_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsubadd_ph(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

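    // rcp produces an approximate reciprocal; the tests use the power-of-two input 2.0, for
    // which the result is exactly 0.5, as the exact-equality assertions below rely on.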
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_rcp_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_rcp_ph(a);
        let e = _mm_set1_ph(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_rcp_ph() {
        let a = _mm_set1_ph(2.0);
        let src = _mm_set1_ph(1.0);
        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_rcp_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_maskz_rcp_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_rcp_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_rcp_ph(a);
        let e = _mm256_set1_ph(0.5);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_rcp_ph() {
        let a = _mm256_set1_ph(2.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_rcp_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_rcp_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_rcp_ph(a);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_rcp_ph() {
        let a = _mm512_set1_ph(2.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_rcp_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_rcp_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_rcp_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_rcp_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_rcp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_rcp_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

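    // rsqrt produces an approximate reciprocal square root; 4.0 is used as input so the
    // expected result is exactly 0.5 for the exact-equality assertions below.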
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let r = _mm_rsqrt_ph(a);
        let e = _mm_set1_ph(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_rsqrt_ph(a);
        let e = _mm256_set1_ph(0.5);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_rsqrt_ph(a);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_rsqrt_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_maskz_rsqrt_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_rsqrt_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

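    // Unlike rcp/rsqrt, sqrt is correctly rounded, so sqrt(4.0) compares exactly equal to 2.0.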
22007    #[simd_test(enable = "avx512fp16,avx512vl")]
22008    unsafe fn test_mm_sqrt_ph() {
22009        let a = _mm_set1_ph(4.0);
22010        let r = _mm_sqrt_ph(a);
22011        let e = _mm_set1_ph(2.0);
22012        assert_eq_m128h(r, e);
22013    }
22014
22015    #[simd_test(enable = "avx512fp16,avx512vl")]
22016    unsafe fn test_mm_mask_sqrt_ph() {
22017        let a = _mm_set1_ph(4.0);
22018        let src = _mm_set1_ph(1.0);
22019        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22020        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22021        assert_eq_m128h(r, e);
22022    }
22023
22024    #[simd_test(enable = "avx512fp16,avx512vl")]
22025    unsafe fn test_mm_maskz_sqrt_ph() {
22026        let a = _mm_set1_ph(4.0);
22027        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22028        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22029        assert_eq_m128h(r, e);
22030    }
22031
22032    #[simd_test(enable = "avx512fp16,avx512vl")]
22033    unsafe fn test_mm256_sqrt_ph() {
22034        let a = _mm256_set1_ph(4.0);
22035        let r = _mm256_sqrt_ph(a);
22036        let e = _mm256_set1_ph(2.0);
22037        assert_eq_m256h(r, e);
22038    }
22039
22040    #[simd_test(enable = "avx512fp16,avx512vl")]
22041    unsafe fn test_mm256_mask_sqrt_ph() {
22042        let a = _mm256_set1_ph(4.0);
22043        let src = _mm256_set1_ph(1.0);
22044        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22045        let e = _mm256_set_ph(
22046            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22047        );
22048        assert_eq_m256h(r, e);
22049    }
22050
22051    #[simd_test(enable = "avx512fp16,avx512vl")]
22052    unsafe fn test_mm256_maskz_sqrt_ph() {
22053        let a = _mm256_set1_ph(4.0);
22054        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22055        let e = _mm256_set_ph(
22056            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22057        );
22058        assert_eq_m256h(r, e);
22059    }
22060
22061    #[simd_test(enable = "avx512fp16")]
22062    unsafe fn test_mm512_sqrt_ph() {
22063        let a = _mm512_set1_ph(4.0);
22064        let r = _mm512_sqrt_ph(a);
22065        let e = _mm512_set1_ph(2.0);
22066        assert_eq_m512h(r, e);
22067    }
22068
22069    #[simd_test(enable = "avx512fp16")]
22070    unsafe fn test_mm512_mask_sqrt_ph() {
22071        let a = _mm512_set1_ph(4.0);
22072        let src = _mm512_set1_ph(1.0);
22073        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22074        let e = _mm512_set_ph(
22075            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22076            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22077        );
22078        assert_eq_m512h(r, e);
22079    }
22080
22081    #[simd_test(enable = "avx512fp16")]
22082    unsafe fn test_mm512_maskz_sqrt_ph() {
22083        let a = _mm512_set1_ph(4.0);
22084        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22085        let e = _mm512_set_ph(
22086            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22087            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22088        );
22089        assert_eq_m512h(r, e);
22090    }
22091
22092    #[simd_test(enable = "avx512fp16")]
22093    unsafe fn test_mm512_sqrt_round_ph() {
22094        let a = _mm512_set1_ph(4.0);
22095        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22096        let e = _mm512_set1_ph(2.0);
22097        assert_eq_m512h(r, e);
22098    }
22099
22100    #[simd_test(enable = "avx512fp16")]
22101    unsafe fn test_mm512_mask_sqrt_round_ph() {
22102        let a = _mm512_set1_ph(4.0);
22103        let src = _mm512_set1_ph(1.0);
22104        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22105            src,
22106            0b01010101010101010101010101010101,
22107            a,
22108        );
22109        let e = _mm512_set_ph(
22110            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22111            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22112        );
22113        assert_eq_m512h(r, e);
22114    }
22115
22116    #[simd_test(enable = "avx512fp16")]
22117    unsafe fn test_mm512_maskz_sqrt_round_ph() {
22118        let a = _mm512_set1_ph(4.0);
22119        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22120            0b01010101010101010101010101010101,
22121            a,
22122        );
22123        let e = _mm512_set_ph(
22124            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22125            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22126        );
22127        assert_eq_m512h(r, e);
22128    }
22129
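    // The scalar `_sh` forms operate on lane 0 only: `_mm_sqrt_sh(a, b)` stores
    // sqrt(b[0]) in lane 0 and copies lanes 1..=7 from `a`; the mask/maskz forms
    // substitute `src[0]` or 0.0 for lane 0 when the mask bit is clear.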
22130    #[simd_test(enable = "avx512fp16")]
22131    unsafe fn test_mm_sqrt_sh() {
22132        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22133        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22134        let r = _mm_sqrt_sh(a, b);
22135        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22136        assert_eq_m128h(r, e);
22137    }
22138
22139    #[simd_test(enable = "avx512fp16")]
22140    unsafe fn test_mm_mask_sqrt_sh() {
22141        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22142        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22143        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22144        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22145        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22146        assert_eq_m128h(r, e);
22147        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22148        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22149        assert_eq_m128h(r, e);
22150    }
22151
22152    #[simd_test(enable = "avx512fp16")]
22153    unsafe fn test_mm_maskz_sqrt_sh() {
22154        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22155        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22156        let r = _mm_maskz_sqrt_sh(0, a, b);
22157        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22158        assert_eq_m128h(r, e);
22159        let r = _mm_maskz_sqrt_sh(1, a, b);
22160        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22161        assert_eq_m128h(r, e);
22162    }
22163
22164    #[simd_test(enable = "avx512fp16")]
22165    unsafe fn test_mm_sqrt_round_sh() {
22166        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22167        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22168        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22169        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22170        assert_eq_m128h(r, e);
22171    }
22172
22173    #[simd_test(enable = "avx512fp16")]
22174    unsafe fn test_mm_mask_sqrt_round_sh() {
22175        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22176        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22177        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22178        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22179            src, 0, a, b,
22180        );
22181        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22182        assert_eq_m128h(r, e);
22183        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22184            src, 1, a, b,
22185        );
22186        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22187        assert_eq_m128h(r, e);
22188    }
22189
22190    #[simd_test(enable = "avx512fp16")]
22191    unsafe fn test_mm_maskz_sqrt_round_sh() {
22192        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22193        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22194        let r =
22195            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22196        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22197        assert_eq_m128h(r, e);
22198        let r =
22199            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22200        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22201        assert_eq_m128h(r, e);
22202    }
22203
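    // `max_ph` computes the element-wise maximum; the masked forms merge with `src`
    // (`mask_`) or zero (`maskz_`) in lanes whose mask bit is clear.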
22204    #[simd_test(enable = "avx512fp16,avx512vl")]
22205    unsafe fn test_mm_max_ph() {
22206        let a = _mm_set1_ph(2.0);
22207        let b = _mm_set1_ph(1.0);
22208        let r = _mm_max_ph(a, b);
22209        let e = _mm_set1_ph(2.0);
22210        assert_eq_m128h(r, e);
22211    }
22212
22213    #[simd_test(enable = "avx512fp16,avx512vl")]
22214    unsafe fn test_mm_mask_max_ph() {
22215        let a = _mm_set1_ph(2.0);
22216        let b = _mm_set1_ph(1.0);
22217        let src = _mm_set1_ph(3.0);
22218        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22219        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22220        assert_eq_m128h(r, e);
22221    }
22222
22223    #[simd_test(enable = "avx512fp16,avx512vl")]
22224    unsafe fn test_mm_maskz_max_ph() {
22225        let a = _mm_set1_ph(2.0);
22226        let b = _mm_set1_ph(1.0);
22227        let r = _mm_maskz_max_ph(0b01010101, a, b);
22228        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22229        assert_eq_m128h(r, e);
22230    }
22231
22232    #[simd_test(enable = "avx512fp16,avx512vl")]
22233    unsafe fn test_mm256_max_ph() {
22234        let a = _mm256_set1_ph(2.0);
22235        let b = _mm256_set1_ph(1.0);
22236        let r = _mm256_max_ph(a, b);
22237        let e = _mm256_set1_ph(2.0);
22238        assert_eq_m256h(r, e);
22239    }
22240
22241    #[simd_test(enable = "avx512fp16,avx512vl")]
22242    unsafe fn test_mm256_mask_max_ph() {
22243        let a = _mm256_set1_ph(2.0);
22244        let b = _mm256_set1_ph(1.0);
22245        let src = _mm256_set1_ph(3.0);
22246        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22247        let e = _mm256_set_ph(
22248            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22249        );
22250        assert_eq_m256h(r, e);
22251    }
22252
22253    #[simd_test(enable = "avx512fp16,avx512vl")]
22254    unsafe fn test_mm256_maskz_max_ph() {
22255        let a = _mm256_set1_ph(2.0);
22256        let b = _mm256_set1_ph(1.0);
22257        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22258        let e = _mm256_set_ph(
22259            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22260        );
22261        assert_eq_m256h(r, e);
22262    }
22263
22264    #[simd_test(enable = "avx512fp16")]
22265    unsafe fn test_mm512_max_ph() {
22266        let a = _mm512_set1_ph(2.0);
22267        let b = _mm512_set1_ph(1.0);
22268        let r = _mm512_max_ph(a, b);
22269        let e = _mm512_set1_ph(2.0);
22270        assert_eq_m512h(r, e);
22271    }
22272
22273    #[simd_test(enable = "avx512fp16")]
22274    unsafe fn test_mm512_mask_max_ph() {
22275        let a = _mm512_set1_ph(2.0);
22276        let b = _mm512_set1_ph(1.0);
22277        let src = _mm512_set1_ph(3.0);
22278        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22279        let e = _mm512_set_ph(
22280            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22281            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22282        );
22283        assert_eq_m512h(r, e);
22284    }
22285
22286    #[simd_test(enable = "avx512fp16")]
22287    unsafe fn test_mm512_maskz_max_ph() {
22288        let a = _mm512_set1_ph(2.0);
22289        let b = _mm512_set1_ph(1.0);
22290        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22291        let e = _mm512_set_ph(
22292            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22293            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22294        );
22295        assert_eq_m512h(r, e);
22296    }
22297
22298    #[simd_test(enable = "avx512fp16")]
22299    unsafe fn test_mm512_max_round_ph() {
22300        let a = _mm512_set1_ph(2.0);
22301        let b = _mm512_set1_ph(1.0);
22302        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22303        let e = _mm512_set1_ph(2.0);
22304        assert_eq_m512h(r, e);
22305    }
22306
22307    #[simd_test(enable = "avx512fp16")]
22308    unsafe fn test_mm512_mask_max_round_ph() {
22309        let a = _mm512_set1_ph(2.0);
22310        let b = _mm512_set1_ph(1.0);
22311        let src = _mm512_set1_ph(3.0);
22312        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22313            src,
22314            0b01010101010101010101010101010101,
22315            a,
22316            b,
22317        );
22318        let e = _mm512_set_ph(
22319            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22320            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22321        );
22322        assert_eq_m512h(r, e);
22323    }
22324
22325    #[simd_test(enable = "avx512fp16")]
22326    unsafe fn test_mm512_maskz_max_round_ph() {
22327        let a = _mm512_set1_ph(2.0);
22328        let b = _mm512_set1_ph(1.0);
22329        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22330            0b01010101010101010101010101010101,
22331            a,
22332            b,
22333        );
22334        let e = _mm512_set_ph(
22335            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22336            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22337        );
22338        assert_eq_m512h(r, e);
22339    }
22340
22341    #[simd_test(enable = "avx512fp16")]
22342    unsafe fn test_mm_max_sh() {
22343        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22344        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22345        let r = _mm_max_sh(a, b);
22346        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22347        assert_eq_m128h(r, e);
22348    }
22349
22350    #[simd_test(enable = "avx512fp16")]
22351    unsafe fn test_mm_mask_max_sh() {
22352        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22353        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22354        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22355        let r = _mm_mask_max_sh(src, 0, a, b);
22356        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22357        assert_eq_m128h(r, e);
22358        let r = _mm_mask_max_sh(src, 1, a, b);
22359        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22360        assert_eq_m128h(r, e);
22361    }
22362
22363    #[simd_test(enable = "avx512fp16")]
22364    unsafe fn test_mm_maskz_max_sh() {
22365        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22366        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22367        let r = _mm_maskz_max_sh(0, a, b);
22368        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22369        assert_eq_m128h(r, e);
22370        let r = _mm_maskz_max_sh(1, a, b);
22371        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22372        assert_eq_m128h(r, e);
22373    }
22374
22375    #[simd_test(enable = "avx512fp16")]
22376    unsafe fn test_mm_max_round_sh() {
22377        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22378        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22379        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22380        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22381        assert_eq_m128h(r, e);
22382    }
22383
22384    #[simd_test(enable = "avx512fp16")]
22385    unsafe fn test_mm_mask_max_round_sh() {
22386        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22387        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22388        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22389        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22390            src, 0, a, b,
22391        );
22392        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22393        assert_eq_m128h(r, e);
22394        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22395            src, 1, a, b,
22396        );
22397        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22398        assert_eq_m128h(r, e);
22399    }
22400
22401    #[simd_test(enable = "avx512fp16")]
22402    unsafe fn test_mm_maskz_max_round_sh() {
22403        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22404        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22405        let r =
22406            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22407        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22408        assert_eq_m128h(r, e);
22409        let r =
22410            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22411        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22412        assert_eq_m128h(r, e);
22413    }
22414
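    // `min_ph` mirrors the `max_ph` tests above with the element-wise minimum.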
22415    #[simd_test(enable = "avx512fp16,avx512vl")]
22416    unsafe fn test_mm_min_ph() {
22417        let a = _mm_set1_ph(2.0);
22418        let b = _mm_set1_ph(1.0);
22419        let r = _mm_min_ph(a, b);
22420        let e = _mm_set1_ph(1.0);
22421        assert_eq_m128h(r, e);
22422    }
22423
22424    #[simd_test(enable = "avx512fp16,avx512vl")]
22425    unsafe fn test_mm_mask_min_ph() {
22426        let a = _mm_set1_ph(2.0);
22427        let b = _mm_set1_ph(1.0);
22428        let src = _mm_set1_ph(3.0);
22429        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22430        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22431        assert_eq_m128h(r, e);
22432    }
22433
22434    #[simd_test(enable = "avx512fp16,avx512vl")]
22435    unsafe fn test_mm_maskz_min_ph() {
22436        let a = _mm_set1_ph(2.0);
22437        let b = _mm_set1_ph(1.0);
22438        let r = _mm_maskz_min_ph(0b01010101, a, b);
22439        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22440        assert_eq_m128h(r, e);
22441    }
22442
22443    #[simd_test(enable = "avx512fp16,avx512vl")]
22444    unsafe fn test_mm256_min_ph() {
22445        let a = _mm256_set1_ph(2.0);
22446        let b = _mm256_set1_ph(1.0);
22447        let r = _mm256_min_ph(a, b);
22448        let e = _mm256_set1_ph(1.0);
22449        assert_eq_m256h(r, e);
22450    }
22451
22452    #[simd_test(enable = "avx512fp16,avx512vl")]
22453    unsafe fn test_mm256_mask_min_ph() {
22454        let a = _mm256_set1_ph(2.0);
22455        let b = _mm256_set1_ph(1.0);
22456        let src = _mm256_set1_ph(3.0);
22457        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22458        let e = _mm256_set_ph(
22459            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22460        );
22461        assert_eq_m256h(r, e);
22462    }
22463
22464    #[simd_test(enable = "avx512fp16,avx512vl")]
22465    unsafe fn test_mm256_maskz_min_ph() {
22466        let a = _mm256_set1_ph(2.0);
22467        let b = _mm256_set1_ph(1.0);
22468        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22469        let e = _mm256_set_ph(
22470            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22471        );
22472        assert_eq_m256h(r, e);
22473    }
22474
22475    #[simd_test(enable = "avx512fp16")]
22476    unsafe fn test_mm512_min_ph() {
22477        let a = _mm512_set1_ph(2.0);
22478        let b = _mm512_set1_ph(1.0);
22479        let r = _mm512_min_ph(a, b);
22480        let e = _mm512_set1_ph(1.0);
22481        assert_eq_m512h(r, e);
22482    }
22483
22484    #[simd_test(enable = "avx512fp16")]
22485    unsafe fn test_mm512_mask_min_ph() {
22486        let a = _mm512_set1_ph(2.0);
22487        let b = _mm512_set1_ph(1.0);
22488        let src = _mm512_set1_ph(3.0);
22489        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22490        let e = _mm512_set_ph(
22491            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22492            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22493        );
22494        assert_eq_m512h(r, e);
22495    }
22496
22497    #[simd_test(enable = "avx512fp16")]
22498    unsafe fn test_mm512_maskz_min_ph() {
22499        let a = _mm512_set1_ph(2.0);
22500        let b = _mm512_set1_ph(1.0);
22501        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22502        let e = _mm512_set_ph(
22503            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22504            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22505        );
22506        assert_eq_m512h(r, e);
22507    }
22508
22509    #[simd_test(enable = "avx512fp16")]
22510    unsafe fn test_mm512_min_round_ph() {
22511        let a = _mm512_set1_ph(2.0);
22512        let b = _mm512_set1_ph(1.0);
22513        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22514        let e = _mm512_set1_ph(1.0);
22515        assert_eq_m512h(r, e);
22516    }
22517
22518    #[simd_test(enable = "avx512fp16")]
22519    unsafe fn test_mm512_mask_min_round_ph() {
22520        let a = _mm512_set1_ph(2.0);
22521        let b = _mm512_set1_ph(1.0);
22522        let src = _mm512_set1_ph(3.0);
22523        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22524            src,
22525            0b01010101010101010101010101010101,
22526            a,
22527            b,
22528        );
22529        let e = _mm512_set_ph(
22530            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22531            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22532        );
22533        assert_eq_m512h(r, e);
22534    }
22535
22536    #[simd_test(enable = "avx512fp16")]
22537    unsafe fn test_mm512_maskz_min_round_ph() {
22538        let a = _mm512_set1_ph(2.0);
22539        let b = _mm512_set1_ph(1.0);
22540        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22541            0b01010101010101010101010101010101,
22542            a,
22543            b,
22544        );
22545        let e = _mm512_set_ph(
22546            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22547            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22548        );
22549        assert_eq_m512h(r, e);
22550    }
22551
22552    #[simd_test(enable = "avx512fp16")]
22553    unsafe fn test_mm_min_sh() {
22554        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22555        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22556        let r = _mm_min_sh(a, b);
22557        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22558        assert_eq_m128h(r, e);
22559    }
22560
22561    #[simd_test(enable = "avx512fp16")]
22562    unsafe fn test_mm_mask_min_sh() {
22563        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22564        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22565        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22566        let r = _mm_mask_min_sh(src, 0, a, b);
22567        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22568        assert_eq_m128h(r, e);
22569        let r = _mm_mask_min_sh(src, 1, a, b);
22570        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22571        assert_eq_m128h(r, e);
22572    }
22573
22574    #[simd_test(enable = "avx512fp16")]
22575    unsafe fn test_mm_maskz_min_sh() {
22576        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22577        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22578        let r = _mm_maskz_min_sh(0, a, b);
22579        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22580        assert_eq_m128h(r, e);
22581        let r = _mm_maskz_min_sh(1, a, b);
22582        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22583        assert_eq_m128h(r, e);
22584    }
22585
22586    #[simd_test(enable = "avx512fp16")]
22587    unsafe fn test_mm_min_round_sh() {
22588        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22589        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22590        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22591        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22592        assert_eq_m128h(r, e);
22593    }
22594
22595    #[simd_test(enable = "avx512fp16")]
22596    unsafe fn test_mm_mask_min_round_sh() {
22597        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22598        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22599        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22600        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22601            src, 0, a, b,
22602        );
22603        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22604        assert_eq_m128h(r, e);
22605        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22606            src, 1, a, b,
22607        );
22608        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22609        assert_eq_m128h(r, e);
22610    }
22611
22612    #[simd_test(enable = "avx512fp16")]
22613    unsafe fn test_mm_maskz_min_round_sh() {
22614        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22615        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22616        let r =
22617            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22618        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22619        assert_eq_m128h(r, e);
22620        let r =
22621            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22622        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22623        assert_eq_m128h(r, e);
22624    }
22625
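    // `getexp_ph` extracts each element's unbiased exponent as an f16 value,
    // i.e. floor(log2(|x|)), so getexp(3.0) == 1.0.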
22626    #[simd_test(enable = "avx512fp16,avx512vl")]
22627    unsafe fn test_mm_getexp_ph() {
22628        let a = _mm_set1_ph(3.0);
22629        let r = _mm_getexp_ph(a);
22630        let e = _mm_set1_ph(1.0);
22631        assert_eq_m128h(r, e);
22632    }
22633
22634    #[simd_test(enable = "avx512fp16,avx512vl")]
22635    unsafe fn test_mm_mask_getexp_ph() {
22636        let a = _mm_set1_ph(3.0);
22637        let src = _mm_set1_ph(4.0);
22638        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22639        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22640        assert_eq_m128h(r, e);
22641    }
22642
22643    #[simd_test(enable = "avx512fp16,avx512vl")]
22644    unsafe fn test_mm_maskz_getexp_ph() {
22645        let a = _mm_set1_ph(3.0);
22646        let r = _mm_maskz_getexp_ph(0b01010101, a);
22647        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22648        assert_eq_m128h(r, e);
22649    }
22650
22651    #[simd_test(enable = "avx512fp16,avx512vl")]
22652    unsafe fn test_mm256_getexp_ph() {
22653        let a = _mm256_set1_ph(3.0);
22654        let r = _mm256_getexp_ph(a);
22655        let e = _mm256_set1_ph(1.0);
22656        assert_eq_m256h(r, e);
22657    }
22658
22659    #[simd_test(enable = "avx512fp16,avx512vl")]
22660    unsafe fn test_mm256_mask_getexp_ph() {
22661        let a = _mm256_set1_ph(3.0);
22662        let src = _mm256_set1_ph(4.0);
22663        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22664        let e = _mm256_set_ph(
22665            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22666        );
22667        assert_eq_m256h(r, e);
22668    }
22669
22670    #[simd_test(enable = "avx512fp16,avx512vl")]
22671    unsafe fn test_mm256_maskz_getexp_ph() {
22672        let a = _mm256_set1_ph(3.0);
22673        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22674        let e = _mm256_set_ph(
22675            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22676        );
22677        assert_eq_m256h(r, e);
22678    }
22679
22680    #[simd_test(enable = "avx512fp16")]
22681    unsafe fn test_mm512_getexp_ph() {
22682        let a = _mm512_set1_ph(3.0);
22683        let r = _mm512_getexp_ph(a);
22684        let e = _mm512_set1_ph(1.0);
22685        assert_eq_m512h(r, e);
22686    }
22687
22688    #[simd_test(enable = "avx512fp16")]
22689    unsafe fn test_mm512_mask_getexp_ph() {
22690        let a = _mm512_set1_ph(3.0);
22691        let src = _mm512_set1_ph(4.0);
22692        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22693        let e = _mm512_set_ph(
22694            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22695            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22696        );
22697        assert_eq_m512h(r, e);
22698    }
22699
22700    #[simd_test(enable = "avx512fp16")]
22701    unsafe fn test_mm512_maskz_getexp_ph() {
22702        let a = _mm512_set1_ph(3.0);
22703        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22704        let e = _mm512_set_ph(
22705            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22706            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22707        );
22708        assert_eq_m512h(r, e);
22709    }
22710
22711    #[simd_test(enable = "avx512fp16")]
22712    unsafe fn test_mm512_getexp_round_ph() {
22713        let a = _mm512_set1_ph(3.0);
22714        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22715        let e = _mm512_set1_ph(1.0);
22716        assert_eq_m512h(r, e);
22717    }
22718
22719    #[simd_test(enable = "avx512fp16")]
22720    unsafe fn test_mm512_mask_getexp_round_ph() {
22721        let a = _mm512_set1_ph(3.0);
22722        let src = _mm512_set1_ph(4.0);
22723        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22724            src,
22725            0b01010101010101010101010101010101,
22726            a,
22727        );
22728        let e = _mm512_set_ph(
22729            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22730            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22731        );
22732        assert_eq_m512h(r, e);
22733    }
22734
22735    #[simd_test(enable = "avx512fp16")]
22736    unsafe fn test_mm512_maskz_getexp_round_ph() {
22737        let a = _mm512_set1_ph(3.0);
22738        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22739            0b01010101010101010101010101010101,
22740            a,
22741        );
22742        let e = _mm512_set_ph(
22743            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22744            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22745        );
22746        assert_eq_m512h(r, e);
22747    }
22748
22749    #[simd_test(enable = "avx512fp16")]
22750    unsafe fn test_mm_getexp_sh() {
22751        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22752        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22753        let r = _mm_getexp_sh(a, b);
22754        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22755        assert_eq_m128h(r, e);
22756    }
22757
22758    #[simd_test(enable = "avx512fp16")]
22759    unsafe fn test_mm_mask_getexp_sh() {
22760        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22761        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22762        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22763        let r = _mm_mask_getexp_sh(src, 0, a, b);
22764        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22765        assert_eq_m128h(r, e);
22766        let r = _mm_mask_getexp_sh(src, 1, a, b);
22767        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22768        assert_eq_m128h(r, e);
22769    }
22770
22771    #[simd_test(enable = "avx512fp16")]
22772    unsafe fn test_mm_maskz_getexp_sh() {
22773        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22774        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22775        let r = _mm_maskz_getexp_sh(0, a, b);
22776        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22777        assert_eq_m128h(r, e);
22778        let r = _mm_maskz_getexp_sh(1, a, b);
22779        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22780        assert_eq_m128h(r, e);
22781    }
22782
22783    #[simd_test(enable = "avx512fp16")]
22784    unsafe fn test_mm_getexp_round_sh() {
22785        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22786        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22787        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
22788        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22789        assert_eq_m128h(r, e);
22790    }
22791
22792    #[simd_test(enable = "avx512fp16")]
22793    unsafe fn test_mm_mask_getexp_round_sh() {
22794        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22795        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22796        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22797        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
22798        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22799        assert_eq_m128h(r, e);
22800        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
22801        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22802        assert_eq_m128h(r, e);
22803    }
22804
22805    #[simd_test(enable = "avx512fp16")]
22806    unsafe fn test_mm_maskz_getexp_round_sh() {
22807        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22808        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22809        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
22810        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22811        assert_eq_m128h(r, e);
22812        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
22813        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22814        assert_eq_m128h(r, e);
22815    }
22816
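    // `getmant_ph` normalizes each element's mantissa into the interval chosen by the
    // first const generic; with `_MM_MANT_NORM_P75_1P5` the result lies in [0.75, 1.5),
    // so 10.0 == 1.25 * 2^3 yields 1.25.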
22817    #[simd_test(enable = "avx512fp16,avx512vl")]
22818    unsafe fn test_mm_getmant_ph() {
22819        let a = _mm_set1_ph(10.0);
22820        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22821        let e = _mm_set1_ph(1.25);
22822        assert_eq_m128h(r, e);
22823    }
22824
22825    #[simd_test(enable = "avx512fp16,avx512vl")]
22826    unsafe fn test_mm_mask_getmant_ph() {
22827        let a = _mm_set1_ph(10.0);
22828        let src = _mm_set1_ph(20.0);
22829        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
22830        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
22831        assert_eq_m128h(r, e);
22832    }
22833
22834    #[simd_test(enable = "avx512fp16,avx512vl")]
22835    unsafe fn test_mm_maskz_getmant_ph() {
22836        let a = _mm_set1_ph(10.0);
22837        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
22838        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
22839        assert_eq_m128h(r, e);
22840    }
22841
22842    #[simd_test(enable = "avx512fp16,avx512vl")]
22843    unsafe fn test_mm256_getmant_ph() {
22844        let a = _mm256_set1_ph(10.0);
22845        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22846        let e = _mm256_set1_ph(1.25);
22847        assert_eq_m256h(r, e);
22848    }
22849
22850    #[simd_test(enable = "avx512fp16,avx512vl")]
22851    unsafe fn test_mm256_mask_getmant_ph() {
22852        let a = _mm256_set1_ph(10.0);
22853        let src = _mm256_set1_ph(20.0);
22854        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22855            src,
22856            0b0101010101010101,
22857            a,
22858        );
22859        let e = _mm256_set_ph(
22860            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22861            20.0, 1.25,
22862        );
22863        assert_eq_m256h(r, e);
22864    }
22865
22866    #[simd_test(enable = "avx512fp16,avx512vl")]
22867    unsafe fn test_mm256_maskz_getmant_ph() {
22868        let a = _mm256_set1_ph(10.0);
22869        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22870            0b0101010101010101,
22871            a,
22872        );
22873        let e = _mm256_set_ph(
22874            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22875        );
22876        assert_eq_m256h(r, e);
22877    }
22878
22879    #[simd_test(enable = "avx512fp16")]
22880    unsafe fn test_mm512_getmant_ph() {
22881        let a = _mm512_set1_ph(10.0);
22882        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22883        let e = _mm512_set1_ph(1.25);
22884        assert_eq_m512h(r, e);
22885    }
22886
22887    #[simd_test(enable = "avx512fp16")]
22888    unsafe fn test_mm512_mask_getmant_ph() {
22889        let a = _mm512_set1_ph(10.0);
22890        let src = _mm512_set1_ph(20.0);
22891        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22892            src,
22893            0b01010101010101010101010101010101,
22894            a,
22895        );
22896        let e = _mm512_set_ph(
22897            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22898            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22899            20.0, 1.25, 20.0, 1.25,
22900        );
22901        assert_eq_m512h(r, e);
22902    }
22903
22904    #[simd_test(enable = "avx512fp16")]
22905    unsafe fn test_mm512_maskz_getmant_ph() {
22906        let a = _mm512_set1_ph(10.0);
22907        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22908            0b01010101010101010101010101010101,
22909            a,
22910        );
22911        let e = _mm512_set_ph(
22912            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22913            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22914        );
22915        assert_eq_m512h(r, e);
22916    }
22917
22918    #[simd_test(enable = "avx512fp16")]
22919    unsafe fn test_mm512_getmant_round_ph() {
22920        let a = _mm512_set1_ph(10.0);
22921        let r =
22922            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
22923                a,
22924            );
22925        let e = _mm512_set1_ph(1.25);
22926        assert_eq_m512h(r, e);
22927    }
22928
22929    #[simd_test(enable = "avx512fp16")]
22930    unsafe fn test_mm512_mask_getmant_round_ph() {
22931        let a = _mm512_set1_ph(10.0);
22932        let src = _mm512_set1_ph(20.0);
22933        let r = _mm512_mask_getmant_round_ph::<
22934            _MM_MANT_NORM_P75_1P5,
22935            _MM_MANT_SIGN_NAN,
22936            _MM_FROUND_NO_EXC,
22937        >(src, 0b01010101010101010101010101010101, a);
22938        let e = _mm512_set_ph(
22939            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22940            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22941            20.0, 1.25, 20.0, 1.25,
22942        );
22943        assert_eq_m512h(r, e);
22944    }
22945
22946    #[simd_test(enable = "avx512fp16")]
22947    unsafe fn test_mm512_maskz_getmant_round_ph() {
22948        let a = _mm512_set1_ph(10.0);
22949        let r = _mm512_maskz_getmant_round_ph::<
22950            _MM_MANT_NORM_P75_1P5,
22951            _MM_MANT_SIGN_NAN,
22952            _MM_FROUND_NO_EXC,
22953        >(0b01010101010101010101010101010101, a);
22954        let e = _mm512_set_ph(
22955            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22956            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22957        );
22958        assert_eq_m512h(r, e);
22959    }
22960
22961    #[simd_test(enable = "avx512fp16")]
22962    unsafe fn test_mm_getmant_sh() {
22963        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22964        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22965        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
22966        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22967        assert_eq_m128h(r, e);
22968    }
22969
22970    #[simd_test(enable = "avx512fp16")]
22971    unsafe fn test_mm_mask_getmant_sh() {
22972        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22973        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22974        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
22975        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
22976        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
22977        assert_eq_m128h(r, e);
22978        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
22979        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22980        assert_eq_m128h(r, e);
22981    }
22982
22983    #[simd_test(enable = "avx512fp16")]
22984    unsafe fn test_mm_maskz_getmant_sh() {
22985        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22986        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22987        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
22988        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22989        assert_eq_m128h(r, e);
22990        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
22991        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22992        assert_eq_m128h(r, e);
22993    }
22994
22995    #[simd_test(enable = "avx512fp16")]
22996    unsafe fn test_mm_getmant_round_sh() {
22997        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22998        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22999        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23000            a, b,
23001        );
23002        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23003        assert_eq_m128h(r, e);
23004    }
23005
23006    #[simd_test(enable = "avx512fp16")]
23007    unsafe fn test_mm_mask_getmant_round_sh() {
23008        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23009        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23010        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23011        let r = _mm_mask_getmant_round_sh::<
23012            _MM_MANT_NORM_P75_1P5,
23013            _MM_MANT_SIGN_NAN,
23014            _MM_FROUND_NO_EXC,
23015        >(src, 0, a, b);
23016        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23017        assert_eq_m128h(r, e);
23018        let r = _mm_mask_getmant_round_sh::<
23019            _MM_MANT_NORM_P75_1P5,
23020            _MM_MANT_SIGN_NAN,
23021            _MM_FROUND_NO_EXC,
23022        >(src, 1, a, b);
23023        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23024        assert_eq_m128h(r, e);
23025    }
23026
23027    #[simd_test(enable = "avx512fp16")]
23028    unsafe fn test_mm_maskz_getmant_round_sh() {
23029        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23030        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23031        let r = _mm_maskz_getmant_round_sh::<
23032            _MM_MANT_NORM_P75_1P5,
23033            _MM_MANT_SIGN_NAN,
23034            _MM_FROUND_NO_EXC,
23035        >(0, a, b);
23036        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23037        assert_eq_m128h(r, e);
23038        let r = _mm_maskz_getmant_round_sh::<
23039            _MM_MANT_NORM_P75_1P5,
23040            _MM_MANT_SIGN_NAN,
23041            _MM_FROUND_NO_EXC,
23042        >(1, a, b);
23043        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23044        assert_eq_m128h(r, e);
23045    }
23046
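    // `roundscale_ph::<IMM8>` rounds each element to 2^-M fractional bits, where M is
    // the upper four bits of IMM8; with IMM8 == 0 this rounds to the nearest integer,
    // so 1.1 becomes 1.0.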
23047    #[simd_test(enable = "avx512fp16,avx512vl")]
23048    unsafe fn test_mm_roundscale_ph() {
23049        let a = _mm_set1_ph(1.1);
23050        let r = _mm_roundscale_ph::<0>(a);
23051        let e = _mm_set1_ph(1.0);
23052        assert_eq_m128h(r, e);
23053    }
23054
23055    #[simd_test(enable = "avx512fp16,avx512vl")]
23056    unsafe fn test_mm_mask_roundscale_ph() {
23057        let a = _mm_set1_ph(1.1);
23058        let src = _mm_set1_ph(2.0);
23059        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
23060        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
23061        assert_eq_m128h(r, e);
23062    }
23063
23064    #[simd_test(enable = "avx512fp16,avx512vl")]
23065    unsafe fn test_mm_maskz_roundscale_ph() {
23066        let a = _mm_set1_ph(1.1);
23067        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
23068        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
23069        assert_eq_m128h(r, e);
23070    }
23071
23072    #[simd_test(enable = "avx512fp16,avx512vl")]
23073    unsafe fn test_mm256_roundscale_ph() {
23074        let a = _mm256_set1_ph(1.1);
23075        let r = _mm256_roundscale_ph::<0>(a);
23076        let e = _mm256_set1_ph(1.0);
23077        assert_eq_m256h(r, e);
23078    }
23079
23080    #[simd_test(enable = "avx512fp16,avx512vl")]
23081    unsafe fn test_mm256_mask_roundscale_ph() {
23082        let a = _mm256_set1_ph(1.1);
23083        let src = _mm256_set1_ph(2.0);
23084        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
23085        let e = _mm256_set_ph(
23086            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23087        );
23088        assert_eq_m256h(r, e);
23089    }
23090
23091    #[simd_test(enable = "avx512fp16,avx512vl")]
23092    unsafe fn test_mm256_maskz_roundscale_ph() {
23093        let a = _mm256_set1_ph(1.1);
23094        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
23095        let e = _mm256_set_ph(
23096            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23097        );
23098        assert_eq_m256h(r, e);
23099    }
23100
23101    #[simd_test(enable = "avx512fp16")]
23102    unsafe fn test_mm512_roundscale_ph() {
23103        let a = _mm512_set1_ph(1.1);
23104        let r = _mm512_roundscale_ph::<0>(a);
23105        let e = _mm512_set1_ph(1.0);
23106        assert_eq_m512h(r, e);
23107    }
23108
23109    #[simd_test(enable = "avx512fp16")]
23110    unsafe fn test_mm512_mask_roundscale_ph() {
23111        let a = _mm512_set1_ph(1.1);
23112        let src = _mm512_set1_ph(2.0);
23113        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
23114        let e = _mm512_set_ph(
23115            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23116            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23117        );
23118        assert_eq_m512h(r, e);
23119    }
23120
23121    #[simd_test(enable = "avx512fp16")]
23122    unsafe fn test_mm512_maskz_roundscale_ph() {
23123        let a = _mm512_set1_ph(1.1);
23124        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
23125        let e = _mm512_set_ph(
23126            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23127            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23128        );
23129        assert_eq_m512h(r, e);
23130    }
23131
23132    #[simd_test(enable = "avx512fp16")]
23133    unsafe fn test_mm512_roundscale_round_ph() {
23134        let a = _mm512_set1_ph(1.1);
23135        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23136        let e = _mm512_set1_ph(1.0);
23137        assert_eq_m512h(r, e);
23138    }
23139
23140    #[simd_test(enable = "avx512fp16")]
23141    unsafe fn test_mm512_mask_roundscale_round_ph() {
23142        let a = _mm512_set1_ph(1.1);
23143        let src = _mm512_set1_ph(2.0);
23144        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23145            src,
23146            0b01010101010101010101010101010101,
23147            a,
23148        );
23149        let e = _mm512_set_ph(
23150            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23151            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23152        );
23153        assert_eq_m512h(r, e);
23154    }
23155
23156    #[simd_test(enable = "avx512fp16")]
23157    unsafe fn test_mm512_maskz_roundscale_round_ph() {
23158        let a = _mm512_set1_ph(1.1);
23159        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23160            0b01010101010101010101010101010101,
23161            a,
23162        );
23163        let e = _mm512_set_ph(
23164            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23165            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23166        );
23167        assert_eq_m512h(r, e);
23168    }
23169
23170    #[simd_test(enable = "avx512fp16")]
23171    unsafe fn test_mm_roundscale_sh() {
23172        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23173        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23174        let r = _mm_roundscale_sh::<0>(a, b);
23175        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23176        assert_eq_m128h(r, e);
23177    }
23178
23179    #[simd_test(enable = "avx512fp16")]
23180    unsafe fn test_mm_mask_roundscale_sh() {
23181        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23182        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23183        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23184        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
23185        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23186        assert_eq_m128h(r, e);
23187        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
23188        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23189        assert_eq_m128h(r, e);
23190    }
23191
23192    #[simd_test(enable = "avx512fp16")]
23193    unsafe fn test_mm_maskz_roundscale_sh() {
23194        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23195        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23196        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
23197        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23198        assert_eq_m128h(r, e);
23199        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
23200        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23201        assert_eq_m128h(r, e);
23202    }
23203
23204    #[simd_test(enable = "avx512fp16")]
23205    unsafe fn test_mm_roundscale_round_sh() {
23206        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23207        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23208        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23209        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23210        assert_eq_m128h(r, e);
23211    }
23212
23213    #[simd_test(enable = "avx512fp16")]
23214    unsafe fn test_mm_mask_roundscale_round_sh() {
23215        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23216        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23217        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23218        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
23219        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23220        assert_eq_m128h(r, e);
23221        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
23222        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23223        assert_eq_m128h(r, e);
23224    }
23225
23226    #[simd_test(enable = "avx512fp16")]
23227    unsafe fn test_mm_maskz_roundscale_round_sh() {
23228        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23229        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23230        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
23231        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23232        assert_eq_m128h(r, e);
23233        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
23234        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23235        assert_eq_m128h(r, e);
23236    }
23237
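    // `scalef_ph(a, b)` computes a * 2^floor(b) element-wise: scalef(1.0, 3.0) == 8.0.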
23238    #[simd_test(enable = "avx512fp16,avx512vl")]
23239    unsafe fn test_mm_scalef_ph() {
23240        let a = _mm_set1_ph(1.);
23241        let b = _mm_set1_ph(3.);
23242        let r = _mm_scalef_ph(a, b);
23243        let e = _mm_set1_ph(8.0);
23244        assert_eq_m128h(r, e);
23245    }
23246
23247    #[simd_test(enable = "avx512fp16,avx512vl")]
23248    unsafe fn test_mm_mask_scalef_ph() {
23249        let a = _mm_set1_ph(1.);
23250        let b = _mm_set1_ph(3.);
23251        let src = _mm_set1_ph(2.);
23252        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
23253        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
23254        assert_eq_m128h(r, e);
23255    }
23256
23257    #[simd_test(enable = "avx512fp16,avx512vl")]
23258    unsafe fn test_mm_maskz_scalef_ph() {
23259        let a = _mm_set1_ph(1.);
23260        let b = _mm_set1_ph(3.);
23261        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
23262        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
23263        assert_eq_m128h(r, e);
23264    }
23265
23266    #[simd_test(enable = "avx512fp16,avx512vl")]
23267    unsafe fn test_mm256_scalef_ph() {
23268        let a = _mm256_set1_ph(1.);
23269        let b = _mm256_set1_ph(3.);
23270        let r = _mm256_scalef_ph(a, b);
23271        let e = _mm256_set1_ph(8.0);
23272        assert_eq_m256h(r, e);
23273    }
23274
23275    #[simd_test(enable = "avx512fp16,avx512vl")]
23276    unsafe fn test_mm256_mask_scalef_ph() {
23277        let a = _mm256_set1_ph(1.);
23278        let b = _mm256_set1_ph(3.);
23279        let src = _mm256_set1_ph(2.);
23280        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
23281        let e = _mm256_set_ph(
23282            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23283        );
23284        assert_eq_m256h(r, e);
23285    }
23286
23287    #[simd_test(enable = "avx512fp16,avx512vl")]
23288    unsafe fn test_mm256_maskz_scalef_ph() {
23289        let a = _mm256_set1_ph(1.);
23290        let b = _mm256_set1_ph(3.);
23291        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
23292        let e = _mm256_set_ph(
23293            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23294        );
23295        assert_eq_m256h(r, e);
23296    }
23297
23298    #[simd_test(enable = "avx512fp16")]
23299    unsafe fn test_mm512_scalef_ph() {
23300        let a = _mm512_set1_ph(1.);
23301        let b = _mm512_set1_ph(3.);
23302        let r = _mm512_scalef_ph(a, b);
23303        let e = _mm512_set1_ph(8.0);
23304        assert_eq_m512h(r, e);
23305    }
23306
23307    #[simd_test(enable = "avx512fp16")]
23308    unsafe fn test_mm512_mask_scalef_ph() {
23309        let a = _mm512_set1_ph(1.);
23310        let b = _mm512_set1_ph(3.);
23311        let src = _mm512_set1_ph(2.);
23312        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
23313        let e = _mm512_set_ph(
23314            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23315            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23316        );
23317        assert_eq_m512h(r, e);
23318    }
23319
23320    #[simd_test(enable = "avx512fp16")]
23321    unsafe fn test_mm512_maskz_scalef_ph() {
23322        let a = _mm512_set1_ph(1.);
23323        let b = _mm512_set1_ph(3.);
23324        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23325        let e = _mm512_set_ph(
23326            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23327            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23328        );
23329        assert_eq_m512h(r, e);
23330    }
23331
23332    #[simd_test(enable = "avx512fp16")]
23333    unsafe fn test_mm512_scalef_round_ph() {
23334        let a = _mm512_set1_ph(1.);
23335        let b = _mm512_set1_ph(3.);
23336        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23337        let e = _mm512_set1_ph(8.0);
23338        assert_eq_m512h(r, e);
23339    }
23340
23341    #[simd_test(enable = "avx512fp16")]
23342    unsafe fn test_mm512_mask_scalef_round_ph() {
23343        let a = _mm512_set1_ph(1.);
23344        let b = _mm512_set1_ph(3.);
23345        let src = _mm512_set1_ph(2.);
23346        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23347            src,
23348            0b01010101010101010101010101010101,
23349            a,
23350            b,
23351        );
23352        let e = _mm512_set_ph(
23353            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23354            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23355        );
23356        assert_eq_m512h(r, e);
23357    }
23358
23359    #[simd_test(enable = "avx512fp16")]
23360    unsafe fn test_mm512_maskz_scalef_round_ph() {
23361        let a = _mm512_set1_ph(1.);
23362        let b = _mm512_set1_ph(3.);
23363        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23364            0b01010101010101010101010101010101,
23365            a,
23366            b,
23367        );
23368        let e = _mm512_set_ph(
23369            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23370            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23371        );
23372        assert_eq_m512h(r, e);
23373    }
23374
23375    #[simd_test(enable = "avx512fp16")]
23376    unsafe fn test_mm_scalef_sh() {
23377        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23378        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23379        let r = _mm_scalef_sh(a, b);
23380        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23381        assert_eq_m128h(r, e);
23382    }
23383
23384    #[simd_test(enable = "avx512fp16")]
23385    unsafe fn test_mm_mask_scalef_sh() {
23386        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23387        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23388        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23389        let r = _mm_mask_scalef_sh(src, 0, a, b);
23390        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23391        assert_eq_m128h(r, e);
23392        let r = _mm_mask_scalef_sh(src, 1, a, b);
23393        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23394        assert_eq_m128h(r, e);
23395    }
23396
23397    #[simd_test(enable = "avx512fp16")]
23398    unsafe fn test_mm_maskz_scalef_sh() {
23399        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23400        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23401        let r = _mm_maskz_scalef_sh(0, a, b);
23402        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23403        assert_eq_m128h(r, e);
23404        let r = _mm_maskz_scalef_sh(1, a, b);
23405        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23406        assert_eq_m128h(r, e);
23407    }
23408
23409    #[simd_test(enable = "avx512fp16")]
23410    unsafe fn test_mm_scalef_round_sh() {
23411        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23412        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23413        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23414        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23415        assert_eq_m128h(r, e);
23416    }
23417
23418    #[simd_test(enable = "avx512fp16")]
23419    unsafe fn test_mm_mask_scalef_round_sh() {
23420        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23421        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23422        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23423        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23424            src, 0, a, b,
23425        );
23426        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23427        assert_eq_m128h(r, e);
23428        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23429            src, 1, a, b,
23430        );
23431        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23432        assert_eq_m128h(r, e);
23433    }
23434
23435    #[simd_test(enable = "avx512fp16")]
23436    unsafe fn test_mm_maskz_scalef_round_sh() {
23437        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23438        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23439        let r =
23440            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23441        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23442        assert_eq_m128h(r, e);
23443        let r =
23444            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23445        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23446        assert_eq_m128h(r, e);
23447    }
23448
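    // The reduce tests below use IMM8 = 16 | _MM_FROUND_TO_ZERO: bits 7:4 give the number
    // of fraction bits to keep (here 1) and the low bits the rounding mode, so
    // reduce(1.25) = 1.25 - trunc(1.25 * 2) / 2 = 1.25 - 1.0 = 0.25.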
23449    #[simd_test(enable = "avx512fp16,avx512vl")]
23450    unsafe fn test_mm_reduce_ph() {
23451        let a = _mm_set1_ph(1.25);
23452        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23453        let e = _mm_set1_ph(0.25);
23454        assert_eq_m128h(r, e);
23455    }
23456
23457    #[simd_test(enable = "avx512fp16,avx512vl")]
23458    unsafe fn test_mm_mask_reduce_ph() {
23459        let a = _mm_set1_ph(1.25);
23460        let src = _mm_set1_ph(2.0);
23461        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23462        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23463        assert_eq_m128h(r, e);
23464    }
23465
23466    #[simd_test(enable = "avx512fp16,avx512vl")]
23467    unsafe fn test_mm_maskz_reduce_ph() {
23468        let a = _mm_set1_ph(1.25);
23469        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23470        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23471        assert_eq_m128h(r, e);
23472    }
23473
23474    #[simd_test(enable = "avx512fp16,avx512vl")]
23475    unsafe fn test_mm256_reduce_ph() {
23476        let a = _mm256_set1_ph(1.25);
23477        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23478        let e = _mm256_set1_ph(0.25);
23479        assert_eq_m256h(r, e);
23480    }
23481
23482    #[simd_test(enable = "avx512fp16,avx512vl")]
23483    unsafe fn test_mm256_mask_reduce_ph() {
23484        let a = _mm256_set1_ph(1.25);
23485        let src = _mm256_set1_ph(2.0);
23486        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
23487        let e = _mm256_set_ph(
23488            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23489        );
23490        assert_eq_m256h(r, e);
23491    }
23492
23493    #[simd_test(enable = "avx512fp16,avx512vl")]
23494    unsafe fn test_mm256_maskz_reduce_ph() {
23495        let a = _mm256_set1_ph(1.25);
23496        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
23497        let e = _mm256_set_ph(
23498            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23499        );
23500        assert_eq_m256h(r, e);
23501    }
23502
23503    #[simd_test(enable = "avx512fp16")]
23504    unsafe fn test_mm512_reduce_ph() {
23505        let a = _mm512_set1_ph(1.25);
23506        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23507        let e = _mm512_set1_ph(0.25);
23508        assert_eq_m512h(r, e);
23509    }
23510
23511    #[simd_test(enable = "avx512fp16")]
23512    unsafe fn test_mm512_mask_reduce_ph() {
23513        let a = _mm512_set1_ph(1.25);
23514        let src = _mm512_set1_ph(2.0);
23515        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23516            src,
23517            0b01010101010101010101010101010101,
23518            a,
23519        );
23520        let e = _mm512_set_ph(
23521            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23522            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23523        );
23524        assert_eq_m512h(r, e);
23525    }
23526
23527    #[simd_test(enable = "avx512fp16")]
23528    unsafe fn test_mm512_maskz_reduce_ph() {
23529        let a = _mm512_set1_ph(1.25);
23530        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23531            0b01010101010101010101010101010101,
23532            a,
23533        );
23534        let e = _mm512_set_ph(
23535            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23536            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23537        );
23538        assert_eq_m512h(r, e);
23539    }
23540
23541    #[simd_test(enable = "avx512fp16")]
23542    unsafe fn test_mm512_reduce_round_ph() {
23543        let a = _mm512_set1_ph(1.25);
23544        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23545        let e = _mm512_set1_ph(0.25);
23546        assert_eq_m512h(r, e);
23547    }
23548
23549    #[simd_test(enable = "avx512fp16")]
23550    unsafe fn test_mm512_mask_reduce_round_ph() {
23551        let a = _mm512_set1_ph(1.25);
23552        let src = _mm512_set1_ph(2.0);
23553        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23554            src,
23555            0b01010101010101010101010101010101,
23556            a,
23557        );
23558        let e = _mm512_set_ph(
23559            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23560            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23561        );
23562        assert_eq_m512h(r, e);
23563    }
23564
23565    #[simd_test(enable = "avx512fp16")]
23566    unsafe fn test_mm512_maskz_reduce_round_ph() {
23567        let a = _mm512_set1_ph(1.25);
23568        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23569            0b01010101010101010101010101010101,
23570            a,
23571        );
23572        let e = _mm512_set_ph(
23573            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23574            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23575        );
23576        assert_eq_m512h(r, e);
23577    }
23578
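    // Scalar (_sh) reduce variants compute the reduction of the lowest element of b and
    // copy the remaining seven elements from a, which is why lanes 1..7 keep 10.0..16.0.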
23579    #[simd_test(enable = "avx512fp16")]
23580    unsafe fn test_mm_reduce_sh() {
23581        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23582        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23583        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23584        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23585        assert_eq_m128h(r, e);
23586    }
23587
23588    #[simd_test(enable = "avx512fp16")]
23589    unsafe fn test_mm_mask_reduce_sh() {
23590        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23591        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23592        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23593        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23594        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23595        assert_eq_m128h(r, e);
23596        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23597        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23598        assert_eq_m128h(r, e);
23599    }
23600
23601    #[simd_test(enable = "avx512fp16")]
23602    unsafe fn test_mm_maskz_reduce_sh() {
23603        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23604        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23605        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23606        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23607        assert_eq_m128h(r, e);
23608        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23609        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23610        assert_eq_m128h(r, e);
23611    }
23612
23613    #[simd_test(enable = "avx512fp16")]
23614    unsafe fn test_mm_reduce_round_sh() {
23615        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23616        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23617        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23618        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23619        assert_eq_m128h(r, e);
23620    }
23621
23622    #[simd_test(enable = "avx512fp16")]
23623    unsafe fn test_mm_mask_reduce_round_sh() {
23624        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23625        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23626        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23627        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23628            src, 0, a, b,
23629        );
23630        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23631        assert_eq_m128h(r, e);
23632        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23633            src, 1, a, b,
23634        );
23635        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23636        assert_eq_m128h(r, e);
23637    }
23638
23639    #[simd_test(enable = "avx512fp16")]
23640    unsafe fn test_mm_maskz_reduce_round_sh() {
23641        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23642        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23643        let r =
23644            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23645        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23646        assert_eq_m128h(r, e);
23647        let r =
23648            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23649        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23650        assert_eq_m128h(r, e);
23651    }
23652
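    // Horizontal reductions: reduce_add/mul/max/min fold all lanes into a single f16,
    // e.g. eight lanes of 2.0 sum to 16.0 and multiply to 2^8 = 256.0.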
23653    #[simd_test(enable = "avx512fp16,avx512vl")]
23654    unsafe fn test_mm_reduce_add_ph() {
23655        let a = _mm_set1_ph(2.0);
23656        let r = _mm_reduce_add_ph(a);
23657        assert_eq!(r, 16.0);
23658    }
23659
23660    #[simd_test(enable = "avx512fp16,avx512vl")]
23661    unsafe fn test_mm256_reduce_add_ph() {
23662        let a = _mm256_set1_ph(2.0);
23663        let r = _mm256_reduce_add_ph(a);
23664        assert_eq!(r, 32.0);
23665    }
23666
23667    #[simd_test(enable = "avx512fp16")]
23668    unsafe fn test_mm512_reduce_add_ph() {
23669        let a = _mm512_set1_ph(2.0);
23670        let r = _mm512_reduce_add_ph(a);
23671        assert_eq!(r, 64.0);
23672    }
23673
23674    #[simd_test(enable = "avx512fp16,avx512vl")]
23675    unsafe fn test_mm_reduce_mul_ph() {
23676        let a = _mm_set1_ph(2.0);
23677        let r = _mm_reduce_mul_ph(a);
23678        assert_eq!(r, 256.0);
23679    }
23680
23681    #[simd_test(enable = "avx512fp16,avx512vl")]
23682    unsafe fn test_mm256_reduce_mul_ph() {
23683        let a = _mm256_set1_ph(2.0);
23684        let r = _mm256_reduce_mul_ph(a);
23685        assert_eq!(r, 65536.0);
23686    }
23687
23688    #[simd_test(enable = "avx512fp16")]
23689    unsafe fn test_mm512_reduce_mul_ph() {
23690        let a = _mm512_set1_ph(2.0);
23691        let r = _mm512_reduce_mul_ph(a);
23692        assert_eq!(r, 16777216.0);
23693    }
23694
23695    #[simd_test(enable = "avx512fp16,avx512vl")]
23696    unsafe fn test_mm_reduce_max_ph() {
23697        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23698        let r = _mm_reduce_max_ph(a);
23699        assert_eq!(r, 8.0);
23700    }
23701
23702    #[simd_test(enable = "avx512fp16,avx512vl")]
23703    unsafe fn test_mm256_reduce_max_ph() {
23704        let a = _mm256_set_ph(
23705            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23706        );
23707        let r = _mm256_reduce_max_ph(a);
23708        assert_eq!(r, 16.0);
23709    }
23710
23711    #[simd_test(enable = "avx512fp16")]
23712    unsafe fn test_mm512_reduce_max_ph() {
23713        let a = _mm512_set_ph(
23714            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23715            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23716            31.0, 32.0,
23717        );
23718        let r = _mm512_reduce_max_ph(a);
23719        assert_eq!(r, 32.0);
23720    }
23721
23722    #[simd_test(enable = "avx512fp16,avx512vl")]
23723    unsafe fn test_mm_reduce_min_ph() {
23724        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23725        let r = _mm_reduce_min_ph(a);
23726        assert_eq!(r, 1.0);
23727    }
23728
23729    #[simd_test(enable = "avx512fp16,avx512vl")]
23730    unsafe fn test_mm256_reduce_min_ph() {
23731        let a = _mm256_set_ph(
23732            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23733        );
23734        let r = _mm256_reduce_min_ph(a);
23735        assert_eq!(r, 1.0);
23736    }
23737
23738    #[simd_test(enable = "avx512fp16")]
23739    unsafe fn test_mm512_reduce_min_ph() {
23740        let a = _mm512_set_ph(
23741            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23742            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23743            31.0, 32.0,
23744        );
23745        let r = _mm512_reduce_min_ph(a);
23746        assert_eq!(r, 1.0);
23747    }
23748
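    // The VFPCLASS category mask 0x18 selects positive infinity (0x08) and negative
    // infinity (0x10), so only the two infinity lanes set their mask bits below.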
23749    #[simd_test(enable = "avx512fp16,avx512vl")]
23750    unsafe fn test_mm_fpclass_ph_mask() {
23751        let a = _mm_set_ph(
23752            1.,
23753            f16::INFINITY,
23754            f16::NEG_INFINITY,
23755            0.0,
23756            -0.0,
23757            -2.0,
23758            f16::NAN,
23759            5.9e-8, // Denormal
23760        );
23761        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
23762        assert_eq!(r, 0b01100000);
23763    }
23764
23765    #[simd_test(enable = "avx512fp16,avx512vl")]
23766    unsafe fn test_mm_mask_fpclass_ph_mask() {
23767        let a = _mm_set_ph(
23768            1.,
23769            f16::INFINITY,
23770            f16::NEG_INFINITY,
23771            0.0,
23772            -0.0,
23773            -2.0,
23774            f16::NAN,
23775            5.9e-8, // Denormal
23776        );
23777        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
23778        assert_eq!(r, 0b01000000);
23779    }
23780
23781    #[simd_test(enable = "avx512fp16,avx512vl")]
23782    unsafe fn test_mm256_fpclass_ph_mask() {
23783        let a = _mm256_set_ph(
23784            1.,
23785            f16::INFINITY,
23786            f16::NEG_INFINITY,
23787            0.0,
23788            -0.0,
23789            -2.0,
23790            f16::NAN,
23791            5.9e-8, // Denormal
23792            1.,
23793            f16::INFINITY,
23794            f16::NEG_INFINITY,
23795            0.0,
23796            -0.0,
23797            -2.0,
23798            f16::NAN,
23799            5.9e-8, // Denormal
23800        );
23801        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
23802        assert_eq!(r, 0b0110000001100000);
23803    }
23804
23805    #[simd_test(enable = "avx512fp16,avx512vl")]
23806    unsafe fn test_mm256_mask_fpclass_ph_mask() {
23807        let a = _mm256_set_ph(
23808            1.,
23809            f16::INFINITY,
23810            f16::NEG_INFINITY,
23811            0.0,
23812            -0.0,
23813            -2.0,
23814            f16::NAN,
23815            5.9e-8, // Denormal
23816            1.,
23817            f16::INFINITY,
23818            f16::NEG_INFINITY,
23819            0.0,
23820            -0.0,
23821            -2.0,
23822            f16::NAN,
23823            5.9e-8, // Denormal
23824        );
23825        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
23826        assert_eq!(r, 0b0100000001000000);
23827    }
23828
23829    #[simd_test(enable = "avx512fp16")]
23830    unsafe fn test_mm512_fpclass_ph_mask() {
23831        let a = _mm512_set_ph(
23832            1.,
23833            f16::INFINITY,
23834            f16::NEG_INFINITY,
23835            0.0,
23836            -0.0,
23837            -2.0,
23838            f16::NAN,
23839            5.9e-8, // Denormal
23840            1.,
23841            f16::INFINITY,
23842            f16::NEG_INFINITY,
23843            0.0,
23844            -0.0,
23845            -2.0,
23846            f16::NAN,
23847            5.9e-8, // Denormal
23848            1.,
23849            f16::INFINITY,
23850            f16::NEG_INFINITY,
23851            0.0,
23852            -0.0,
23853            -2.0,
23854            f16::NAN,
23855            5.9e-8, // Denormal
23856            1.,
23857            f16::INFINITY,
23858            f16::NEG_INFINITY,
23859            0.0,
23860            -0.0,
23861            -2.0,
23862            f16::NAN,
23863            5.9e-8, // Denormal
23864        );
23865        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
23866        assert_eq!(r, 0b01100000011000000110000001100000);
23867    }
23868
23869    #[simd_test(enable = "avx512fp16")]
23870    unsafe fn test_mm512_mask_fpclass_ph_mask() {
23871        let a = _mm512_set_ph(
23872            1.,
23873            f16::INFINITY,
23874            f16::NEG_INFINITY,
23875            0.0,
23876            -0.0,
23877            -2.0,
23878            f16::NAN,
23879            5.9e-8, // Denormal
23880            1.,
23881            f16::INFINITY,
23882            f16::NEG_INFINITY,
23883            0.0,
23884            -0.0,
23885            -2.0,
23886            f16::NAN,
23887            5.9e-8, // Denormal
23888            1.,
23889            f16::INFINITY,
23890            f16::NEG_INFINITY,
23891            0.0,
23892            -0.0,
23893            -2.0,
23894            f16::NAN,
23895            5.9e-8, // Denormal
23896            1.,
23897            f16::INFINITY,
23898            f16::NEG_INFINITY,
23899            0.0,
23900            -0.0,
23901            -2.0,
23902            f16::NAN,
23903            5.9e-8, // Denormal
23904        );
23905        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
23906        assert_eq!(r, 0b01000000010000000100000001000000);
23907    }
23908
23909    #[simd_test(enable = "avx512fp16")]
23910    unsafe fn test_mm_fpclass_sh_mask() {
23911        let a = _mm_set_sh(f16::INFINITY);
23912        let r = _mm_fpclass_sh_mask::<0x18>(a);
23913        assert_eq!(r, 1);
23914    }
23915
23916    #[simd_test(enable = "avx512fp16")]
23917    unsafe fn test_mm_mask_fpclass_sh_mask() {
23918        let a = _mm_set_sh(f16::INFINITY);
23919        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
23920        assert_eq!(r, 0);
23921        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
23922        assert_eq!(r, 1);
23923    }
23924
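    // mask_blend picks the element from b where the mask bit is set and from a where it
    // is clear, giving the alternating sign pattern checked below.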
23925    #[simd_test(enable = "avx512fp16,avx512vl")]
23926    unsafe fn test_mm_mask_blend_ph() {
23927        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23928        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
23929        let r = _mm_mask_blend_ph(0b01010101, a, b);
23930        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
23931        assert_eq_m128h(r, e);
23932    }
23933
23934    #[simd_test(enable = "avx512fp16,avx512vl")]
23935    unsafe fn test_mm256_mask_blend_ph() {
23936        let a = _mm256_set_ph(
23937            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23938        );
23939        let b = _mm256_set_ph(
23940            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23941            -14.0, -15.0, -16.0,
23942        );
23943        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
23944        let e = _mm256_set_ph(
23945            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23946            -16.0,
23947        );
23948        assert_eq_m256h(r, e);
23949    }
23950
23951    #[simd_test(enable = "avx512fp16")]
23952    unsafe fn test_mm512_mask_blend_ph() {
23953        let a = _mm512_set_ph(
23954            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23955            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23956            31.0, 32.0,
23957        );
23958        let b = _mm512_set_ph(
23959            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23960            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
23961            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
23962        );
23963        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
23964        let e = _mm512_set_ph(
23965            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23966            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
23967            29.0, -30.0, 31.0, -32.0,
23968        );
23969        assert_eq_m512h(r, e);
23970    }
23971
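    // permutex2var treats a and b as one concatenated table: index i selects a[i] for
    // i < N and b[i - N] otherwise, so the even indices gather every other element.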
23972    #[simd_test(enable = "avx512fp16,avx512vl")]
23973    unsafe fn test_mm_permutex2var_ph() {
23974        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23975        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
23976        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
23977        let r = _mm_permutex2var_ph(a, idx, b);
23978        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
23979        assert_eq_m128h(r, e);
23980    }
23981
23982    #[simd_test(enable = "avx512fp16,avx512vl")]
23983    unsafe fn test_mm256_permutex2var_ph() {
23984        let a = _mm256_setr_ph(
23985            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23986        );
23987        let b = _mm256_setr_ph(
23988            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23989            31.0, 32.0,
23990        );
23991        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
23992        let r = _mm256_permutex2var_ph(a, idx, b);
23993        let e = _mm256_setr_ph(
23994            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
23995            31.0,
23996        );
23997        assert_eq_m256h(r, e);
23998    }
23999
24000    #[simd_test(enable = "avx512fp16")]
24001    unsafe fn test_mm512_permutex2var_ph() {
24002        let a = _mm512_setr_ph(
24003            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24004            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24005            31.0, 32.0,
24006        );
24007        let b = _mm512_setr_ph(
24008            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
24009            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
24010            61.0, 62.0, 63.0, 64.0,
24011        );
24012        let idx = _mm512_set_epi16(
24013            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
24014            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
24015        );
24016        let r = _mm512_permutex2var_ph(a, idx, b);
24017        let e = _mm512_setr_ph(
24018            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24019            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
24020            59.0, 61.0, 63.0,
24021        );
24022        assert_eq_m512h(r, e);
24023    }
24024
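    // permutexvar shuffles the lanes of a single vector according to the 16-bit indices
    // in idx; each result lane i is a[idx[i]].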
24025    #[simd_test(enable = "avx512fp16,avx512vl")]
24026    unsafe fn test_mm_permutexvar_ph() {
24027        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24028        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
24029        let r = _mm_permutexvar_ph(idx, a);
24030        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
24031        assert_eq_m128h(r, e);
24032    }
24033
24034    #[simd_test(enable = "avx512fp16,avx512vl")]
24035    unsafe fn test_mm256_permutexvar_ph() {
24036        let a = _mm256_set_ph(
24037            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24038        );
24039        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
24040        let r = _mm256_permutexvar_ph(idx, a);
24041        let e = _mm256_setr_ph(
24042            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
24043        );
24044        assert_eq_m256h(r, e);
24045    }
24046
24047    #[simd_test(enable = "avx512fp16")]
24048    unsafe fn test_mm512_permutexvar_ph() {
24049        let a = _mm512_set_ph(
24050            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24051            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24052            31.0, 32.0,
24053        );
24054        let idx = _mm512_set_epi16(
24055            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
24056            17, 19, 21, 23, 25, 27, 29, 31,
24057        );
24058        let r = _mm512_permutexvar_ph(idx, a);
24059        let e = _mm512_setr_ph(
24060            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24061            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
24062            30.0, 32.0,
24063        );
24064        assert_eq_m512h(r, e);
24065    }
24066
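    // cvtepi16_ph converts packed signed 16-bit integers to half precision lane for
    // lane; the mask/maskz variants fill unselected lanes from src or with zero.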
24067    #[simd_test(enable = "avx512fp16,avx512vl")]
24068    unsafe fn test_mm_cvtepi16_ph() {
24069        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24070        let r = _mm_cvtepi16_ph(a);
24071        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24072        assert_eq_m128h(r, e);
24073    }
24074
24075    #[simd_test(enable = "avx512fp16,avx512vl")]
24076    unsafe fn test_mm_mask_cvtepi16_ph() {
24077        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24078        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24079        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
24080        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24081        assert_eq_m128h(r, e);
24082    }
24083
24084    #[simd_test(enable = "avx512fp16,avx512vl")]
24085    unsafe fn test_mm_maskz_cvtepi16_ph() {
24086        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24087        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
24088        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24089        assert_eq_m128h(r, e);
24090    }
24091
24092    #[simd_test(enable = "avx512fp16,avx512vl")]
24093    unsafe fn test_mm256_cvtepi16_ph() {
24094        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24095        let r = _mm256_cvtepi16_ph(a);
24096        let e = _mm256_set_ph(
24097            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24098        );
24099        assert_eq_m256h(r, e);
24100    }
24101
24102    #[simd_test(enable = "avx512fp16,avx512vl")]
24103    unsafe fn test_mm256_mask_cvtepi16_ph() {
24104        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24105        let src = _mm256_set_ph(
24106            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24107        );
24108        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
24109        let e = _mm256_set_ph(
24110            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24111        );
24112        assert_eq_m256h(r, e);
24113    }
24114
24115    #[simd_test(enable = "avx512fp16,avx512vl")]
24116    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24117        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24118        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24119        let e = _mm256_set_ph(
24120            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24121        );
24122        assert_eq_m256h(r, e);
24123    }
24124
24125    #[simd_test(enable = "avx512fp16")]
24126    unsafe fn test_mm512_cvtepi16_ph() {
24127        let a = _mm512_set_epi16(
24128            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24129            25, 26, 27, 28, 29, 30, 31, 32,
24130        );
24131        let r = _mm512_cvtepi16_ph(a);
24132        let e = _mm512_set_ph(
24133            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24134            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24135            31.0, 32.0,
24136        );
24137        assert_eq_m512h(r, e);
24138    }
24139
24140    #[simd_test(enable = "avx512fp16")]
24141    unsafe fn test_mm512_mask_cvtepi16_ph() {
24142        let a = _mm512_set_epi16(
24143            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24144            25, 26, 27, 28, 29, 30, 31, 32,
24145        );
24146        let src = _mm512_set_ph(
24147            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24148            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24149        );
24150        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24151        let e = _mm512_set_ph(
24152            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24153            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24154        );
24155        assert_eq_m512h(r, e);
24156    }
24157
24158    #[simd_test(enable = "avx512fp16")]
24159    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24160        let a = _mm512_set_epi16(
24161            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24162            25, 26, 27, 28, 29, 30, 31, 32,
24163        );
24164        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24165        let e = _mm512_set_ph(
24166            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24167            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24168        );
24169        assert_eq_m512h(r, e);
24170    }
24171
24172    #[simd_test(enable = "avx512fp16")]
24173    unsafe fn test_mm512_cvt_roundepi16_ph() {
24174        let a = _mm512_set_epi16(
24175            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24176            25, 26, 27, 28, 29, 30, 31, 32,
24177        );
24178        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24179        let e = _mm512_set_ph(
24180            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24181            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24182            31.0, 32.0,
24183        );
24184        assert_eq_m512h(r, e);
24185    }
24186
24187    #[simd_test(enable = "avx512fp16")]
24188    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24189        let a = _mm512_set_epi16(
24190            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24191            25, 26, 27, 28, 29, 30, 31, 32,
24192        );
24193        let src = _mm512_set_ph(
24194            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24195            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24196        );
24197        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24198            src,
24199            0b01010101010101010101010101010101,
24200            a,
24201        );
24202        let e = _mm512_set_ph(
24203            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24204            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24205        );
24206        assert_eq_m512h(r, e);
24207    }
24208
24209    #[simd_test(enable = "avx512fp16")]
24210    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24211        let a = _mm512_set_epi16(
24212            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24213            25, 26, 27, 28, 29, 30, 31, 32,
24214        );
24215        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24216            0b01010101010101010101010101010101,
24217            a,
24218        );
24219        let e = _mm512_set_ph(
24220            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24221            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24222        );
24223        assert_eq_m512h(r, e);
24224    }
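    // cvtepu16_ph is the unsigned counterpart; for these small positive inputs the
    // expected results match the signed conversions above.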
24225
24226    #[simd_test(enable = "avx512fp16,avx512vl")]
24227    unsafe fn test_mm_cvtepu16_ph() {
24228        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24229        let r = _mm_cvtepu16_ph(a);
24230        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24231        assert_eq_m128h(r, e);
24232    }
24233
24234    #[simd_test(enable = "avx512fp16,avx512vl")]
24235    unsafe fn test_mm_mask_cvtepu16_ph() {
24236        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24237        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24238        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24239        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24240        assert_eq_m128h(r, e);
24241    }
24242
24243    #[simd_test(enable = "avx512fp16,avx512vl")]
24244    unsafe fn test_mm_maskz_cvtepu16_ph() {
24245        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24246        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24247        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24248        assert_eq_m128h(r, e);
24249    }
24250
24251    #[simd_test(enable = "avx512fp16,avx512vl")]
24252    unsafe fn test_mm256_cvtepu16_ph() {
24253        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24254        let r = _mm256_cvtepu16_ph(a);
24255        let e = _mm256_set_ph(
24256            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24257        );
24258        assert_eq_m256h(r, e);
24259    }
24260
24261    #[simd_test(enable = "avx512fp16,avx512vl")]
24262    unsafe fn test_mm256_mask_cvtepu16_ph() {
24263        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24264        let src = _mm256_set_ph(
24265            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24266        );
24267        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24268        let e = _mm256_set_ph(
24269            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24270        );
24271        assert_eq_m256h(r, e);
24272    }
24273
24274    #[simd_test(enable = "avx512fp16,avx512vl")]
24275    unsafe fn test_mm256_maskz_cvtepu16_ph() {
24276        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24277        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24278        let e = _mm256_set_ph(
24279            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24280        );
24281        assert_eq_m256h(r, e);
24282    }
24283
24284    #[simd_test(enable = "avx512fp16")]
24285    unsafe fn test_mm512_cvtepu16_ph() {
24286        let a = _mm512_set_epi16(
24287            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24288            25, 26, 27, 28, 29, 30, 31, 32,
24289        );
24290        let r = _mm512_cvtepu16_ph(a);
24291        let e = _mm512_set_ph(
24292            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24293            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24294            31.0, 32.0,
24295        );
24296        assert_eq_m512h(r, e);
24297    }
24298
24299    #[simd_test(enable = "avx512fp16")]
24300    unsafe fn test_mm512_mask_cvtepu16_ph() {
24301        let a = _mm512_set_epi16(
24302            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24303            25, 26, 27, 28, 29, 30, 31, 32,
24304        );
24305        let src = _mm512_set_ph(
24306            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24307            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24308        );
24309        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24310        let e = _mm512_set_ph(
24311            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24312            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24313        );
24314        assert_eq_m512h(r, e);
24315    }
24316
24317    #[simd_test(enable = "avx512fp16")]
24318    unsafe fn test_mm512_maskz_cvtepu16_ph() {
24319        let a = _mm512_set_epi16(
24320            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24321            25, 26, 27, 28, 29, 30, 31, 32,
24322        );
24323        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24324        let e = _mm512_set_ph(
24325            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24326            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24327        );
24328        assert_eq_m512h(r, e);
24329    }
24330
24331    #[simd_test(enable = "avx512fp16")]
24332    unsafe fn test_mm512_cvt_roundepu16_ph() {
24333        let a = _mm512_set_epi16(
24334            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24335            25, 26, 27, 28, 29, 30, 31, 32,
24336        );
24337        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24338        let e = _mm512_set_ph(
24339            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24340            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24341            31.0, 32.0,
24342        );
24343        assert_eq_m512h(r, e);
24344    }
24345
24346    #[simd_test(enable = "avx512fp16")]
24347    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
24348        let a = _mm512_set_epi16(
24349            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24350            25, 26, 27, 28, 29, 30, 31, 32,
24351        );
24352        let src = _mm512_set_ph(
24353            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24354            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24355        );
24356        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24357            src,
24358            0b01010101010101010101010101010101,
24359            a,
24360        );
24361        let e = _mm512_set_ph(
24362            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24363            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24364        );
24365        assert_eq_m512h(r, e);
24366    }
24367
24368    #[simd_test(enable = "avx512fp16")]
24369    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
24370        let a = _mm512_set_epi16(
24371            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24372            25, 26, 27, 28, 29, 30, 31, 32,
24373        );
24374        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24375            0b01010101010101010101010101010101,
24376            a,
24377        );
24378        let e = _mm512_set_ph(
24379            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24380            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24381        );
24382        assert_eq_m512h(r, e);
24383    }
24384
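    // 32-bit integer conversions narrow to a half-width f16 vector: a 128-bit source
    // fills only the low four lanes of the __m128h result (upper lanes are zeroed),
    // while the 256- and 512-bit sources fill a full __m128h / __m256h.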
24385    #[simd_test(enable = "avx512fp16,avx512vl")]
24386    unsafe fn test_mm_cvtepi32_ph() {
24387        let a = _mm_set_epi32(1, 2, 3, 4);
24388        let r = _mm_cvtepi32_ph(a);
24389        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24390        assert_eq_m128h(r, e);
24391    }
24392
24393    #[simd_test(enable = "avx512fp16,avx512vl")]
24394    unsafe fn test_mm_mask_cvtepi32_ph() {
24395        let a = _mm_set_epi32(1, 2, 3, 4);
24396        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24397        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24398        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24399        assert_eq_m128h(r, e);
24400    }
24401
24402    #[simd_test(enable = "avx512fp16,avx512vl")]
24403    unsafe fn test_mm_maskz_cvtepi32_ph() {
24404        let a = _mm_set_epi32(1, 2, 3, 4);
24405        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24406        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24407        assert_eq_m128h(r, e);
24408    }
24409
24410    #[simd_test(enable = "avx512fp16,avx512vl")]
24411    unsafe fn test_mm256_cvtepi32_ph() {
24412        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24413        let r = _mm256_cvtepi32_ph(a);
24414        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24415        assert_eq_m128h(r, e);
24416    }
24417
24418    #[simd_test(enable = "avx512fp16,avx512vl")]
24419    unsafe fn test_mm256_mask_cvtepi32_ph() {
24420        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24421        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24422        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24423        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24424        assert_eq_m128h(r, e);
24425    }
24426
24427    #[simd_test(enable = "avx512fp16,avx512vl")]
24428    unsafe fn test_mm256_maskz_cvtepi32_ph() {
24429        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24430        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24431        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24432        assert_eq_m128h(r, e);
24433    }
24434
24435    #[simd_test(enable = "avx512fp16")]
24436    unsafe fn test_mm512_cvtepi32_ph() {
24437        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24438        let r = _mm512_cvtepi32_ph(a);
24439        let e = _mm256_set_ph(
24440            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24441        );
24442        assert_eq_m256h(r, e);
24443    }
24444
24445    #[simd_test(enable = "avx512fp16")]
24446    unsafe fn test_mm512_mask_cvtepi32_ph() {
24447        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24448        let src = _mm256_set_ph(
24449            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24450        );
24451        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24452        let e = _mm256_set_ph(
24453            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24454        );
24455        assert_eq_m256h(r, e);
24456    }
24457
24458    #[simd_test(enable = "avx512fp16")]
24459    unsafe fn test_mm512_maskz_cvtepi32_ph() {
24460        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24461        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24462        let e = _mm256_set_ph(
24463            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24464        );
24465        assert_eq_m256h(r, e);
24466    }
24467
24468    #[simd_test(enable = "avx512fp16")]
24469    unsafe fn test_mm512_cvt_roundepi32_ph() {
24470        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24471        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24472        let e = _mm256_set_ph(
24473            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24474        );
24475        assert_eq_m256h(r, e);
24476    }
24477
24478    #[simd_test(enable = "avx512fp16")]
24479    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
24480        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24481        let src = _mm256_set_ph(
24482            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24483        );
24484        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24485            src,
24486            0b0101010101010101,
24487            a,
24488        );
24489        let e = _mm256_set_ph(
24490            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24491        );
24492        assert_eq_m256h(r, e);
24493    }
24494
24495    #[simd_test(enable = "avx512fp16")]
24496    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
24497        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24498        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24499            0b0101010101010101,
24500            a,
24501        );
24502        let e = _mm256_set_ph(
24503            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24504        );
24505        assert_eq_m256h(r, e);
24506    }
24507
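    // cvti32_sh converts a single signed 32-bit integer into the lowest f16 lane and
    // copies the upper seven lanes from a.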
24508    #[simd_test(enable = "avx512fp16")]
24509    unsafe fn test_mm_cvti32_sh() {
24510        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24511        let r = _mm_cvti32_sh(a, 10);
24512        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24513        assert_eq_m128h(r, e);
24514    }
24515
24516    #[simd_test(enable = "avx512fp16")]
24517    unsafe fn test_mm_cvt_roundi32_sh() {
24518        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24519        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24520        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24521        assert_eq_m128h(r, e);
24522    }
24523
24524    #[simd_test(enable = "avx512fp16,avx512vl")]
24525    unsafe fn test_mm_cvtepu32_ph() {
24526        let a = _mm_set_epi32(1, 2, 3, 4);
24527        let r = _mm_cvtepu32_ph(a);
24528        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24529        assert_eq_m128h(r, e);
24530    }
24531
24532    #[simd_test(enable = "avx512fp16,avx512vl")]
24533    unsafe fn test_mm_mask_cvtepu32_ph() {
24534        let a = _mm_set_epi32(1, 2, 3, 4);
24535        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24536        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24537        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24538        assert_eq_m128h(r, e);
24539    }
24540
24541    #[simd_test(enable = "avx512fp16,avx512vl")]
24542    unsafe fn test_mm_maskz_cvtepu32_ph() {
24543        let a = _mm_set_epi32(1, 2, 3, 4);
24544        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24545        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24546        assert_eq_m128h(r, e);
24547    }
24548
24549    #[simd_test(enable = "avx512fp16,avx512vl")]
24550    unsafe fn test_mm256_cvtepu32_ph() {
24551        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24552        let r = _mm256_cvtepu32_ph(a);
24553        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24554        assert_eq_m128h(r, e);
24555    }
24556
24557    #[simd_test(enable = "avx512fp16,avx512vl")]
24558    unsafe fn test_mm256_mask_cvtepu32_ph() {
24559        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24560        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24561        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24562        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24563        assert_eq_m128h(r, e);
24564    }
24565
24566    #[simd_test(enable = "avx512fp16,avx512vl")]
24567    unsafe fn test_mm256_maskz_cvtepu32_ph() {
24568        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24569        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24570        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24571        assert_eq_m128h(r, e);
24572    }
24573
24574    #[simd_test(enable = "avx512fp16")]
24575    unsafe fn test_mm512_cvtepu32_ph() {
24576        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24577        let r = _mm512_cvtepu32_ph(a);
24578        let e = _mm256_set_ph(
24579            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24580        );
24581        assert_eq_m256h(r, e);
24582    }
24583
24584    #[simd_test(enable = "avx512fp16")]
24585    unsafe fn test_mm512_mask_cvtepu32_ph() {
24586        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24587        let src = _mm256_set_ph(
24588            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24589        );
24590        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24591        let e = _mm256_set_ph(
24592            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24593        );
24594        assert_eq_m256h(r, e);
24595    }
24596
24597    #[simd_test(enable = "avx512fp16")]
24598    unsafe fn test_mm512_maskz_cvtepu32_ph() {
24599        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24600        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24601        let e = _mm256_set_ph(
24602            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24603        );
24604        assert_eq_m256h(r, e);
24605    }
24606
24607    #[simd_test(enable = "avx512fp16")]
24608    unsafe fn test_mm512_cvt_roundepu32_ph() {
24609        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24610        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24611        let e = _mm256_set_ph(
24612            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24613        );
24614        assert_eq_m256h(r, e);
24615    }
24616
24617    #[simd_test(enable = "avx512fp16")]
24618    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
24619        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24620        let src = _mm256_set_ph(
24621            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24622        );
24623        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24624            src,
24625            0b0101010101010101,
24626            a,
24627        );
24628        let e = _mm256_set_ph(
24629            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24630            16.0,
24631        );
24632        assert_eq_m256h(r, e);
24633    }
24634
24635    #[simd_test(enable = "avx512fp16")]
24636    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
24637        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24638        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24639            0b0101010101010101,
24640            a,
24641        );
24642        let e = _mm256_set_ph(
24643            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24644        );
24645        assert_eq_m256h(r, e);
24646    }
24647
24648    #[simd_test(enable = "avx512fp16")]
24649    unsafe fn test_mm_cvtu32_sh() {
24650        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24651        let r = _mm_cvtu32_sh(a, 10);
24652        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24653        assert_eq_m128h(r, e);
24654    }
24655
24656    #[simd_test(enable = "avx512fp16")]
24657    unsafe fn test_mm_cvt_roundu32_sh() {
24658        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24659        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24660        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24661        assert_eq_m128h(r, e);
24662    }
24663
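    // The 128-bit and 256-bit i64 sources below hold only 2 or 4 elements, so the f16
    // results land in the low lanes of the __m128h destination and the remaining lanes
    // are zero.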
24664    #[simd_test(enable = "avx512fp16,avx512vl")]
24665    unsafe fn test_mm_cvtepi64_ph() {
24666        let a = _mm_set_epi64x(1, 2);
24667        let r = _mm_cvtepi64_ph(a);
24668        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24669        assert_eq_m128h(r, e);
24670    }
24671
24672    #[simd_test(enable = "avx512fp16,avx512vl")]
24673    unsafe fn test_mm_mask_cvtepi64_ph() {
24674        let a = _mm_set_epi64x(1, 2);
24675        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24676        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24677        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24678        assert_eq_m128h(r, e);
24679    }
24680
24681    #[simd_test(enable = "avx512fp16,avx512vl")]
24682    unsafe fn test_mm_maskz_cvtepi64_ph() {
24683        let a = _mm_set_epi64x(1, 2);
24684        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24685        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24686        assert_eq_m128h(r, e);
24687    }
24688
24689    #[simd_test(enable = "avx512fp16,avx512vl")]
24690    unsafe fn test_mm256_cvtepi64_ph() {
24691        let a = _mm256_set_epi64x(1, 2, 3, 4);
24692        let r = _mm256_cvtepi64_ph(a);
24693        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24694        assert_eq_m128h(r, e);
24695    }
24696
24697    #[simd_test(enable = "avx512fp16,avx512vl")]
24698    unsafe fn test_mm256_mask_cvtepi64_ph() {
24699        let a = _mm256_set_epi64x(1, 2, 3, 4);
24700        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24701        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24702        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24703        assert_eq_m128h(r, e);
24704    }
24705
24706    #[simd_test(enable = "avx512fp16,avx512vl")]
24707    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24708        let a = _mm256_set_epi64x(1, 2, 3, 4);
24709        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24710        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24711        assert_eq_m128h(r, e);
24712    }
24713
24714    #[simd_test(enable = "avx512fp16")]
24715    unsafe fn test_mm512_cvtepi64_ph() {
24716        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24717        let r = _mm512_cvtepi64_ph(a);
24718        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24719        assert_eq_m128h(r, e);
24720    }
24721
24722    #[simd_test(enable = "avx512fp16")]
24723    unsafe fn test_mm512_mask_cvtepi64_ph() {
24724        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24725        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24726        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24727        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24728        assert_eq_m128h(r, e);
24729    }
24730
24731    #[simd_test(enable = "avx512fp16")]
24732    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24733        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24734        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24735        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24736        assert_eq_m128h(r, e);
24737    }
24738
24739    #[simd_test(enable = "avx512fp16")]
24740    unsafe fn test_mm512_cvt_roundepi64_ph() {
24741        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24742        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24743        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24744        assert_eq_m128h(r, e);
24745    }
24746
24747    #[simd_test(enable = "avx512fp16")]
24748    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
24749        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24750        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24751        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24752            src, 0b01010101, a,
24753        );
24754        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24755        assert_eq_m128h(r, e);
24756    }
24757
24758    #[simd_test(enable = "avx512fp16")]
24759    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
24760        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24761        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24762            0b01010101, a,
24763        );
24764        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24765        assert_eq_m128h(r, e);
24766    }
24767
24768    #[simd_test(enable = "avx512fp16,avx512vl")]
24769    unsafe fn test_mm_cvtepu64_ph() {
24770        let a = _mm_set_epi64x(1, 2);
24771        let r = _mm_cvtepu64_ph(a);
24772        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24773        assert_eq_m128h(r, e);
24774    }
24775
24776    #[simd_test(enable = "avx512fp16,avx512vl")]
24777    unsafe fn test_mm_mask_cvtepu64_ph() {
24778        let a = _mm_set_epi64x(1, 2);
24779        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24780        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
24781        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24782        assert_eq_m128h(r, e);
24783    }
24784
24785    #[simd_test(enable = "avx512fp16,avx512vl")]
24786    unsafe fn test_mm_maskz_cvtepu64_ph() {
24787        let a = _mm_set_epi64x(1, 2);
24788        let r = _mm_maskz_cvtepu64_ph(0b01, a);
24789        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
24790        assert_eq_m128h(r, e);
24791    }
24792
24793    #[simd_test(enable = "avx512fp16,avx512vl")]
24794    unsafe fn test_mm256_cvtepu64_ph() {
24795        let a = _mm256_set_epi64x(1, 2, 3, 4);
24796        let r = _mm256_cvtepu64_ph(a);
24797        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24798        assert_eq_m128h(r, e);
24799    }
24800
24801    #[simd_test(enable = "avx512fp16,avx512vl")]
24802    unsafe fn test_mm256_mask_cvtepu64_ph() {
24803        let a = _mm256_set_epi64x(1, 2, 3, 4);
24804        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24805        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
24806        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24807        assert_eq_m128h(r, e);
24808    }
24809
24810    #[simd_test(enable = "avx512fp16,avx512vl")]
24811    unsafe fn test_mm256_maskz_cvtepu64_ph() {
24812        let a = _mm256_set_epi64x(1, 2, 3, 4);
24813        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
24814        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24815        assert_eq_m128h(r, e);
24816    }
24817
24818    #[simd_test(enable = "avx512fp16")]
24819    unsafe fn test_mm512_cvtepu64_ph() {
24820        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24821        let r = _mm512_cvtepu64_ph(a);
24822        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24823        assert_eq_m128h(r, e);
24824    }
24825
24826    #[simd_test(enable = "avx512fp16")]
24827    unsafe fn test_mm512_mask_cvtepu64_ph() {
24828        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24829        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24830        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
24831        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24832        assert_eq_m128h(r, e);
24833    }
24834
24835    #[simd_test(enable = "avx512fp16")]
24836    unsafe fn test_mm512_maskz_cvtepu64_ph() {
24837        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24838        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
24839        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24840        assert_eq_m128h(r, e);
24841    }
24842
24843    #[simd_test(enable = "avx512fp16")]
24844    unsafe fn test_mm512_cvt_roundepu64_ph() {
24845        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24846        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24847        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24848        assert_eq_m128h(r, e);
24849    }
24850
24851    #[simd_test(enable = "avx512fp16")]
24852    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
24853        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24854        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24855        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24856            src, 0b01010101, a,
24857        );
24858        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24859        assert_eq_m128h(r, e);
24860    }
24861
24862    #[simd_test(enable = "avx512fp16")]
24863    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
24864        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24865        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24866            0b01010101, a,
24867        );
24868        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24869        assert_eq_m128h(r, e);
24870    }
24871
24872    #[simd_test(enable = "avx512fp16,avx512vl")]
24873    unsafe fn test_mm_cvtxps_ph() {
24874        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24875        let r = _mm_cvtxps_ph(a);
24876        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24877        assert_eq_m128h(r, e);
24878    }
24879
24880    #[simd_test(enable = "avx512fp16,avx512vl")]
24881    unsafe fn test_mm_mask_cvtxps_ph() {
24882        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24883        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24884        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
24885        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
24886        assert_eq_m128h(r, e);
24887    }
24888
24889    #[simd_test(enable = "avx512fp16,avx512vl")]
24890    unsafe fn test_mm_maskz_cvtxps_ph() {
24891        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24892        let r = _mm_maskz_cvtxps_ph(0b0101, a);
24893        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24894        assert_eq_m128h(r, e);
24895    }
24896
24897    #[simd_test(enable = "avx512fp16,avx512vl")]
24898    unsafe fn test_mm256_cvtxps_ph() {
24899        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24900        let r = _mm256_cvtxps_ph(a);
24901        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24902        assert_eq_m128h(r, e);
24903    }
24904
24905    #[simd_test(enable = "avx512fp16,avx512vl")]
24906    unsafe fn test_mm256_mask_cvtxps_ph() {
24907        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24908        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24909        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
24910        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24911        assert_eq_m128h(r, e);
24912    }
24913
24914    #[simd_test(enable = "avx512fp16,avx512vl")]
24915    unsafe fn test_mm256_maskz_cvtxps_ph() {
24916        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24917        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
24918        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24919        assert_eq_m128h(r, e);
24920    }
24921
24922    #[simd_test(enable = "avx512fp16")]
24923    unsafe fn test_mm512_cvtxps_ph() {
24924        let a = _mm512_set_ps(
24925            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24926        );
24927        let r = _mm512_cvtxps_ph(a);
24928        let e = _mm256_set_ph(
24929            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24930        );
24931        assert_eq_m256h(r, e);
24932    }
24933
24934    #[simd_test(enable = "avx512fp16")]
24935    unsafe fn test_mm512_mask_cvtxps_ph() {
24936        let a = _mm512_set_ps(
24937            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24938        );
24939        let src = _mm256_set_ph(
24940            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24941        );
24942        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
24943        let e = _mm256_set_ph(
24944            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24945        );
24946        assert_eq_m256h(r, e);
24947    }
24948
24949    #[simd_test(enable = "avx512fp16")]
24950    unsafe fn test_mm512_maskz_cvtxps_ph() {
24951        let a = _mm512_set_ps(
24952            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24953        );
24954        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
24955        let e = _mm256_set_ph(
24956            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24957        );
24958        assert_eq_m256h(r, e);
24959    }
24960
24961    #[simd_test(enable = "avx512fp16")]
24962    unsafe fn test_mm512_cvtx_roundps_ph() {
24963        let a = _mm512_set_ps(
24964            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24965        );
24966        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24967        let e = _mm256_set_ph(
24968            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24969        );
24970        assert_eq_m256h(r, e);
24971    }
24972
24973    #[simd_test(enable = "avx512fp16")]
24974    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
24975        let a = _mm512_set_ps(
24976            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24977        );
24978        let src = _mm256_set_ph(
24979            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24980        );
24981        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24982            src,
24983            0b0101010101010101,
24984            a,
24985        );
24986        let e = _mm256_set_ph(
24987            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24988            16.0,
24989        );
24990        assert_eq_m256h(r, e);
24991    }
24992
24993    #[simd_test(enable = "avx512fp16")]
24994    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
24995        let a = _mm512_set_ps(
24996            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24997        );
24998        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24999            0b0101010101010101,
25000            a,
25001        );
25002        let e = _mm256_set_ph(
25003            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25004        );
25005        assert_eq_m256h(r, e);
25006    }
25007
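    // The scalar `_sh` conversions below write only element 0 of the result; the upper
    // seven elements are copied from `a`.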
25008    #[simd_test(enable = "avx512fp16")]
25009    unsafe fn test_mm_cvtss_sh() {
25010        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25011        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25012        let r = _mm_cvtss_sh(a, b);
25013        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25014        assert_eq_m128h(r, e);
25015    }
25016
25017    #[simd_test(enable = "avx512fp16")]
25018    unsafe fn test_mm_mask_cvtss_sh() {
25019        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25020        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25021        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25022        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25023        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25024        assert_eq_m128h(r, e);
25025        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25026        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25027        assert_eq_m128h(r, e);
25028    }
25029
25030    #[simd_test(enable = "avx512fp16")]
25031    unsafe fn test_mm_maskz_cvtss_sh() {
25032        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25033        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25034        let r = _mm_maskz_cvtss_sh(0, a, b);
25035        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25036        assert_eq_m128h(r, e);
25037        let r = _mm_maskz_cvtss_sh(1, a, b);
25038        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25039        assert_eq_m128h(r, e);
25040    }
25041
25042    #[simd_test(enable = "avx512fp16")]
25043    unsafe fn test_mm_cvt_roundss_sh() {
25044        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25045        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25046        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25047        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25048        assert_eq_m128h(r, e);
25049    }
25050
25051    #[simd_test(enable = "avx512fp16")]
25052    unsafe fn test_mm_mask_cvt_roundss_sh() {
25053        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25054        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25055        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25056        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25057            src, 0, a, b,
25058        );
25059        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25060        assert_eq_m128h(r, e);
25061        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25062            src, 1, a, b,
25063        );
25064        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25065        assert_eq_m128h(r, e);
25066    }
25067
25068    #[simd_test(enable = "avx512fp16")]
25069    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25070        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25071        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25072        let r =
25073            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25074        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25075        assert_eq_m128h(r, e);
25076        let r =
25077            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25078        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25079        assert_eq_m128h(r, e);
25080    }
25081
25082    #[simd_test(enable = "avx512fp16,avx512vl")]
25083    unsafe fn test_mm_cvtpd_ph() {
25084        let a = _mm_set_pd(1.0, 2.0);
25085        let r = _mm_cvtpd_ph(a);
25086        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25087        assert_eq_m128h(r, e);
25088    }
25089
25090    #[simd_test(enable = "avx512fp16,avx512vl")]
25091    unsafe fn test_mm_mask_cvtpd_ph() {
25092        let a = _mm_set_pd(1.0, 2.0);
25093        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25094        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25095        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25096        assert_eq_m128h(r, e);
25097    }
25098
25099    #[simd_test(enable = "avx512fp16,avx512vl")]
25100    unsafe fn test_mm_maskz_cvtpd_ph() {
25101        let a = _mm_set_pd(1.0, 2.0);
25102        let r = _mm_maskz_cvtpd_ph(0b01, a);
25103        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25104        assert_eq_m128h(r, e);
25105    }
25106
25107    #[simd_test(enable = "avx512fp16,avx512vl")]
25108    unsafe fn test_mm256_cvtpd_ph() {
25109        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25110        let r = _mm256_cvtpd_ph(a);
25111        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25112        assert_eq_m128h(r, e);
25113    }
25114
25115    #[simd_test(enable = "avx512fp16,avx512vl")]
25116    unsafe fn test_mm256_mask_cvtpd_ph() {
25117        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25118        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25119        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25120        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25121        assert_eq_m128h(r, e);
25122    }
25123
25124    #[simd_test(enable = "avx512fp16,avx512vl")]
25125    unsafe fn test_mm256_maskz_cvtpd_ph() {
25126        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25127        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25128        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25129        assert_eq_m128h(r, e);
25130    }
25131
25132    #[simd_test(enable = "avx512fp16")]
25133    unsafe fn test_mm512_cvtpd_ph() {
25134        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25135        let r = _mm512_cvtpd_ph(a);
25136        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25137        assert_eq_m128h(r, e);
25138    }
25139
25140    #[simd_test(enable = "avx512fp16")]
25141    unsafe fn test_mm512_mask_cvtpd_ph() {
25142        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25143        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25144        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25145        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25146        assert_eq_m128h(r, e);
25147    }
25148
25149    #[simd_test(enable = "avx512fp16")]
25150    unsafe fn test_mm512_maskz_cvtpd_ph() {
25151        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25152        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25153        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25154        assert_eq_m128h(r, e);
25155    }
25156
25157    #[simd_test(enable = "avx512fp16")]
25158    unsafe fn test_mm512_cvt_roundpd_ph() {
25159        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25160        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25161        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25162        assert_eq_m128h(r, e);
25163    }
25164
25165    #[simd_test(enable = "avx512fp16")]
25166    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25167        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25168        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25169        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25170            src, 0b01010101, a,
25171        );
25172        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25173        assert_eq_m128h(r, e);
25174    }
25175
25176    #[simd_test(enable = "avx512fp16")]
25177    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25178        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25179        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25180            0b01010101, a,
25181        );
25182        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25183        assert_eq_m128h(r, e);
25184    }
25185
25186    #[simd_test(enable = "avx512fp16")]
25187    unsafe fn test_mm_cvtsd_sh() {
25188        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25189        let b = _mm_setr_pd(1.0, 2.0);
25190        let r = _mm_cvtsd_sh(a, b);
25191        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25192        assert_eq_m128h(r, e);
25193    }
25194
25195    #[simd_test(enable = "avx512fp16")]
25196    unsafe fn test_mm_mask_cvtsd_sh() {
25197        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25198        let b = _mm_setr_pd(1.0, 2.0);
25199        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25200        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25201        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25202        assert_eq_m128h(r, e);
25203        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25204        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25205        assert_eq_m128h(r, e);
25206    }
25207
25208    #[simd_test(enable = "avx512fp16")]
25209    unsafe fn test_mm_maskz_cvtsd_sh() {
25210        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25211        let b = _mm_setr_pd(1.0, 2.0);
25212        let r = _mm_maskz_cvtsd_sh(0, a, b);
25213        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25214        assert_eq_m128h(r, e);
25215        let r = _mm_maskz_cvtsd_sh(1, a, b);
25216        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25217        assert_eq_m128h(r, e);
25218    }
25219
25220    #[simd_test(enable = "avx512fp16")]
25221    unsafe fn test_mm_cvt_roundsd_sh() {
25222        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25223        let b = _mm_setr_pd(1.0, 2.0);
25224        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25225        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25226        assert_eq_m128h(r, e);
25227    }
25228
25229    #[simd_test(enable = "avx512fp16")]
25230    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25231        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25232        let b = _mm_setr_pd(1.0, 2.0);
25233        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25234        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25235            src, 0, a, b,
25236        );
25237        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25238        assert_eq_m128h(r, e);
25239        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25240            src, 1, a, b,
25241        );
25242        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25243        assert_eq_m128h(r, e);
25244    }
25245
25246    #[simd_test(enable = "avx512fp16")]
25247    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25248        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25249        let b = _mm_setr_pd(1.0, 2.0);
25250        let r =
25251            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25252        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25253        assert_eq_m128h(r, e);
25254        let r =
25255            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25256        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25257        assert_eq_m128h(r, e);
25258    }
25259
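    // The f16 -> integer conversions below round using the current rounding mode, while
    // the `cvtt*` variants further down truncate toward zero; with the integral inputs
    // used here both families produce the same values.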
25260    #[simd_test(enable = "avx512fp16,avx512vl")]
25261    unsafe fn test_mm_cvtph_epi16() {
25262        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25263        let r = _mm_cvtph_epi16(a);
25264        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25265        assert_eq_m128i(r, e);
25266    }
25267
25268    #[simd_test(enable = "avx512fp16,avx512vl")]
25269    unsafe fn test_mm_mask_cvtph_epi16() {
25270        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25271        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25272        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
25273        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25274        assert_eq_m128i(r, e);
25275    }
25276
25277    #[simd_test(enable = "avx512fp16,avx512vl")]
25278    unsafe fn test_mm_maskz_cvtph_epi16() {
25279        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25280        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
25281        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25282        assert_eq_m128i(r, e);
25283    }
25284
25285    #[simd_test(enable = "avx512fp16,avx512vl")]
25286    unsafe fn test_mm256_cvtph_epi16() {
25287        let a = _mm256_set_ph(
25288            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25289        );
25290        let r = _mm256_cvtph_epi16(a);
25291        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25292        assert_eq_m256i(r, e);
25293    }
25294
25295    #[simd_test(enable = "avx512fp16,avx512vl")]
25296    unsafe fn test_mm256_mask_cvtph_epi16() {
25297        let a = _mm256_set_ph(
25298            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25299        );
25300        let src = _mm256_set_epi16(
25301            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25302        );
25303        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
25304        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25305        assert_eq_m256i(r, e);
25306    }
25307
25308    #[simd_test(enable = "avx512fp16,avx512vl")]
25309    unsafe fn test_mm256_maskz_cvtph_epi16() {
25310        let a = _mm256_set_ph(
25311            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25312        );
25313        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
25314        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25315        assert_eq_m256i(r, e);
25316    }
25317
25318    #[simd_test(enable = "avx512fp16")]
25319    unsafe fn test_mm512_cvtph_epi16() {
25320        let a = _mm512_set_ph(
25321            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25322            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25323            31.0, 32.0,
25324        );
25325        let r = _mm512_cvtph_epi16(a);
25326        let e = _mm512_set_epi16(
25327            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25328            25, 26, 27, 28, 29, 30, 31, 32,
25329        );
25330        assert_eq_m512i(r, e);
25331    }
25332
25333    #[simd_test(enable = "avx512fp16")]
25334    unsafe fn test_mm512_mask_cvtph_epi16() {
25335        let a = _mm512_set_ph(
25336            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25337            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25338            31.0, 32.0,
25339        );
25340        let src = _mm512_set_epi16(
25341            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25342            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25343        );
25344        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
25345        let e = _mm512_set_epi16(
25346            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25347            24, 34, 26, 36, 28, 38, 30, 40, 32,
25348        );
25349        assert_eq_m512i(r, e);
25350    }
25351
25352    #[simd_test(enable = "avx512fp16")]
25353    unsafe fn test_mm512_maskz_cvtph_epi16() {
25354        let a = _mm512_set_ph(
25355            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25356            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25357            31.0, 32.0,
25358        );
25359        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
25360        let e = _mm512_set_epi16(
25361            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25362            0, 28, 0, 30, 0, 32,
25363        );
25364        assert_eq_m512i(r, e);
25365    }
25366
25367    #[simd_test(enable = "avx512fp16")]
25368    unsafe fn test_mm512_cvt_roundph_epi16() {
25369        let a = _mm512_set_ph(
25370            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25371            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25372            31.0, 32.0,
25373        );
25374        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25375        let e = _mm512_set_epi16(
25376            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25377            25, 26, 27, 28, 29, 30, 31, 32,
25378        );
25379        assert_eq_m512i(r, e);
25380    }
25381
25382    #[simd_test(enable = "avx512fp16")]
25383    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25384        let a = _mm512_set_ph(
25385            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25386            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25387            31.0, 32.0,
25388        );
25389        let src = _mm512_set_epi16(
25390            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25391            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25392        );
25393        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25394            src,
25395            0b01010101010101010101010101010101,
25396            a,
25397        );
25398        let e = _mm512_set_epi16(
25399            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25400            24, 34, 26, 36, 28, 38, 30, 40, 32,
25401        );
25402        assert_eq_m512i(r, e);
25403    }
25404
25405    #[simd_test(enable = "avx512fp16")]
25406    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25407        let a = _mm512_set_ph(
25408            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25409            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25410            31.0, 32.0,
25411        );
25412        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25413            0b01010101010101010101010101010101,
25414            a,
25415        );
25416        let e = _mm512_set_epi16(
25417            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25418            0, 28, 0, 30, 0, 32,
25419        );
25420        assert_eq_m512i(r, e);
25421    }
25422
25423    #[simd_test(enable = "avx512fp16,avx512vl")]
25424    unsafe fn test_mm_cvtph_epu16() {
25425        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25426        let r = _mm_cvtph_epu16(a);
25427        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25428        assert_eq_m128i(r, e);
25429    }
25430
25431    #[simd_test(enable = "avx512fp16,avx512vl")]
25432    unsafe fn test_mm_mask_cvtph_epu16() {
25433        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25434        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25435        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
25436        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25437        assert_eq_m128i(r, e);
25438    }
25439
25440    #[simd_test(enable = "avx512fp16,avx512vl")]
25441    unsafe fn test_mm_maskz_cvtph_epu16() {
25442        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25443        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
25444        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25445        assert_eq_m128i(r, e);
25446    }
25447
25448    #[simd_test(enable = "avx512fp16,avx512vl")]
25449    unsafe fn test_mm256_cvtph_epu16() {
25450        let a = _mm256_set_ph(
25451            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25452        );
25453        let r = _mm256_cvtph_epu16(a);
25454        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25455        assert_eq_m256i(r, e);
25456    }
25457
25458    #[simd_test(enable = "avx512fp16,avx512vl")]
25459    unsafe fn test_mm256_mask_cvtph_epu16() {
25460        let a = _mm256_set_ph(
25461            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25462        );
25463        let src = _mm256_set_epi16(
25464            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25465        );
25466        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
25467        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25468        assert_eq_m256i(r, e);
25469    }
25470
25471    #[simd_test(enable = "avx512fp16,avx512vl")]
25472    unsafe fn test_mm256_maskz_cvtph_epu16() {
25473        let a = _mm256_set_ph(
25474            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25475        );
25476        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
25477        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25478        assert_eq_m256i(r, e);
25479    }
25480
25481    #[simd_test(enable = "avx512fp16")]
25482    unsafe fn test_mm512_cvtph_epu16() {
25483        let a = _mm512_set_ph(
25484            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25485            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25486            31.0, 32.0,
25487        );
25488        let r = _mm512_cvtph_epu16(a);
25489        let e = _mm512_set_epi16(
25490            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25491            25, 26, 27, 28, 29, 30, 31, 32,
25492        );
25493        assert_eq_m512i(r, e);
25494    }
25495
25496    #[simd_test(enable = "avx512fp16")]
25497    unsafe fn test_mm512_mask_cvtph_epu16() {
25498        let a = _mm512_set_ph(
25499            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25500            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25501            31.0, 32.0,
25502        );
25503        let src = _mm512_set_epi16(
25504            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25505            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25506        );
25507        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
25508        let e = _mm512_set_epi16(
25509            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25510            24, 34, 26, 36, 28, 38, 30, 40, 32,
25511        );
25512        assert_eq_m512i(r, e);
25513    }
25514
25515    #[simd_test(enable = "avx512fp16")]
25516    unsafe fn test_mm512_maskz_cvtph_epu16() {
25517        let a = _mm512_set_ph(
25518            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25519            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25520            31.0, 32.0,
25521        );
25522        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
25523        let e = _mm512_set_epi16(
25524            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25525            0, 28, 0, 30, 0, 32,
25526        );
25527        assert_eq_m512i(r, e);
25528    }
25529
25530    #[simd_test(enable = "avx512fp16")]
25531    unsafe fn test_mm512_cvt_roundph_epu16() {
25532        let a = _mm512_set_ph(
25533            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25534            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25535            31.0, 32.0,
25536        );
25537        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25538        let e = _mm512_set_epi16(
25539            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25540            25, 26, 27, 28, 29, 30, 31, 32,
25541        );
25542        assert_eq_m512i(r, e);
25543    }
25544
25545    #[simd_test(enable = "avx512fp16")]
25546    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
25547        let a = _mm512_set_ph(
25548            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25549            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25550            31.0, 32.0,
25551        );
25552        let src = _mm512_set_epi16(
25553            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25554            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25555        );
25556        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25557            src,
25558            0b01010101010101010101010101010101,
25559            a,
25560        );
25561        let e = _mm512_set_epi16(
25562            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25563            24, 34, 26, 36, 28, 38, 30, 40, 32,
25564        );
25565        assert_eq_m512i(r, e);
25566    }
25567
25568    #[simd_test(enable = "avx512fp16")]
25569    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
25570        let a = _mm512_set_ph(
25571            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25572            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25573            31.0, 32.0,
25574        );
25575        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25576            0b01010101010101010101010101010101,
25577            a,
25578        );
25579        let e = _mm512_set_epi16(
25580            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25581            0, 28, 0, 30, 0, 32,
25582        );
25583        assert_eq_m512i(r, e);
25584    }
25585
25586    #[simd_test(enable = "avx512fp16,avx512vl")]
25587    unsafe fn test_mm_cvttph_epi16() {
25588        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25589        let r = _mm_cvttph_epi16(a);
25590        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25591        assert_eq_m128i(r, e);
25592    }
25593
25594    #[simd_test(enable = "avx512fp16,avx512vl")]
25595    unsafe fn test_mm_mask_cvttph_epi16() {
25596        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25597        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25598        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25599        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25600        assert_eq_m128i(r, e);
25601    }
25602
25603    #[simd_test(enable = "avx512fp16,avx512vl")]
25604    unsafe fn test_mm_maskz_cvttph_epi16() {
25605        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25606        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25607        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25608        assert_eq_m128i(r, e);
25609    }
25610
25611    #[simd_test(enable = "avx512fp16,avx512vl")]
25612    unsafe fn test_mm256_cvttph_epi16() {
25613        let a = _mm256_set_ph(
25614            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25615        );
25616        let r = _mm256_cvttph_epi16(a);
25617        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25618        assert_eq_m256i(r, e);
25619    }
25620
25621    #[simd_test(enable = "avx512fp16,avx512vl")]
25622    unsafe fn test_mm256_mask_cvttph_epi16() {
25623        let a = _mm256_set_ph(
25624            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25625        );
25626        let src = _mm256_set_epi16(
25627            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25628        );
25629        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25630        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25631        assert_eq_m256i(r, e);
25632    }
25633
25634    #[simd_test(enable = "avx512fp16,avx512vl")]
25635    unsafe fn test_mm256_maskz_cvttph_epi16() {
25636        let a = _mm256_set_ph(
25637            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25638        );
25639        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25640        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25641        assert_eq_m256i(r, e);
25642    }
25643
25644    #[simd_test(enable = "avx512fp16")]
25645    unsafe fn test_mm512_cvttph_epi16() {
25646        let a = _mm512_set_ph(
25647            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25648            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25649            31.0, 32.0,
25650        );
25651        let r = _mm512_cvttph_epi16(a);
25652        let e = _mm512_set_epi16(
25653            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25654            25, 26, 27, 28, 29, 30, 31, 32,
25655        );
25656        assert_eq_m512i(r, e);
25657    }
25658
25659    #[simd_test(enable = "avx512fp16")]
25660    unsafe fn test_mm512_mask_cvttph_epi16() {
25661        let a = _mm512_set_ph(
25662            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25663            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25664            31.0, 32.0,
25665        );
25666        let src = _mm512_set_epi16(
25667            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25668            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25669        );
25670        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25671        let e = _mm512_set_epi16(
25672            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25673            24, 34, 26, 36, 28, 38, 30, 40, 32,
25674        );
25675        assert_eq_m512i(r, e);
25676    }
25677
25678    #[simd_test(enable = "avx512fp16")]
25679    unsafe fn test_mm512_maskz_cvttph_epi16() {
25680        let a = _mm512_set_ph(
25681            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25682            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25683            31.0, 32.0,
25684        );
25685        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25686        let e = _mm512_set_epi16(
25687            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25688            0, 28, 0, 30, 0, 32,
25689        );
25690        assert_eq_m512i(r, e);
25691    }
25692
25693    #[simd_test(enable = "avx512fp16")]
25694    unsafe fn test_mm512_cvtt_roundph_epi16() {
25695        let a = _mm512_set_ph(
25696            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25697            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25698            31.0, 32.0,
25699        );
25700        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25701        let e = _mm512_set_epi16(
25702            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25703            25, 26, 27, 28, 29, 30, 31, 32,
25704        );
25705        assert_eq_m512i(r, e);
25706    }
25707
25708    #[simd_test(enable = "avx512fp16")]
25709    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
25710        let a = _mm512_set_ph(
25711            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25712            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25713            31.0, 32.0,
25714        );
25715        let src = _mm512_set_epi16(
25716            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25717            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25718        );
25719        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25720            src,
25721            0b01010101010101010101010101010101,
25722            a,
25723        );
25724        let e = _mm512_set_epi16(
25725            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25726            24, 34, 26, 36, 28, 38, 30, 40, 32,
25727        );
25728        assert_eq_m512i(r, e);
25729    }
25730
25731    #[simd_test(enable = "avx512fp16")]
25732    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
25733        let a = _mm512_set_ph(
25734            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25735            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25736            31.0, 32.0,
25737        );
25738        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25739            0b01010101010101010101010101010101,
25740            a,
25741        );
25742        let e = _mm512_set_epi16(
25743            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25744            0, 28, 0, 30, 0, 32,
25745        );
25746        assert_eq_m512i(r, e);
25747    }
25748
25749    #[simd_test(enable = "avx512fp16,avx512vl")]
25750    unsafe fn test_mm_cvttph_epu16() {
25751        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25752        let r = _mm_cvttph_epu16(a);
25753        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25754        assert_eq_m128i(r, e);
25755    }
25756
25757    #[simd_test(enable = "avx512fp16,avx512vl")]
25758    unsafe fn test_mm_mask_cvttph_epu16() {
25759        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25760        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25761        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25762        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25763        assert_eq_m128i(r, e);
25764    }
25765
25766    #[simd_test(enable = "avx512fp16,avx512vl")]
25767    unsafe fn test_mm_maskz_cvttph_epu16() {
25768        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25769        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25770        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25771        assert_eq_m128i(r, e);
25772    }
25773
25774    #[simd_test(enable = "avx512fp16,avx512vl")]
25775    unsafe fn test_mm256_cvttph_epu16() {
25776        let a = _mm256_set_ph(
25777            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25778        );
25779        let r = _mm256_cvttph_epu16(a);
25780        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25781        assert_eq_m256i(r, e);
25782    }
25783
25784    #[simd_test(enable = "avx512fp16,avx512vl")]
25785    unsafe fn test_mm256_mask_cvttph_epu16() {
25786        let a = _mm256_set_ph(
25787            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25788        );
25789        let src = _mm256_set_epi16(
25790            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25791        );
25792        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25793        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25794        assert_eq_m256i(r, e);
25795    }
25796
25797    #[simd_test(enable = "avx512fp16,avx512vl")]
25798    unsafe fn test_mm256_maskz_cvttph_epu16() {
25799        let a = _mm256_set_ph(
25800            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25801        );
25802        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25803        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25804        assert_eq_m256i(r, e);
25805    }
25806
25807    #[simd_test(enable = "avx512fp16")]
25808    unsafe fn test_mm512_cvttph_epu16() {
25809        let a = _mm512_set_ph(
25810            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25811            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25812            31.0, 32.0,
25813        );
25814        let r = _mm512_cvttph_epu16(a);
25815        let e = _mm512_set_epi16(
25816            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25817            25, 26, 27, 28, 29, 30, 31, 32,
25818        );
25819        assert_eq_m512i(r, e);
25820    }
25821
25822    #[simd_test(enable = "avx512fp16")]
25823    unsafe fn test_mm512_mask_cvttph_epu16() {
25824        let a = _mm512_set_ph(
25825            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25826            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25827            31.0, 32.0,
25828        );
25829        let src = _mm512_set_epi16(
25830            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25831            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25832        );
25833        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25834        let e = _mm512_set_epi16(
25835            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25836            24, 34, 26, 36, 28, 38, 30, 40, 32,
25837        );
25838        assert_eq_m512i(r, e);
25839    }
25840
25841    #[simd_test(enable = "avx512fp16")]
25842    unsafe fn test_mm512_maskz_cvttph_epu16() {
25843        let a = _mm512_set_ph(
25844            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25845            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25846            31.0, 32.0,
25847        );
25848        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25849        let e = _mm512_set_epi16(
25850            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25851            0, 28, 0, 30, 0, 32,
25852        );
25853        assert_eq_m512i(r, e);
25854    }
25855
25856    #[simd_test(enable = "avx512fp16")]
25857    unsafe fn test_mm512_cvtt_roundph_epu16() {
25858        let a = _mm512_set_ph(
25859            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25860            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25861            31.0, 32.0,
25862        );
25863        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
25864        let e = _mm512_set_epi16(
25865            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25866            25, 26, 27, 28, 29, 30, 31, 32,
25867        );
25868        assert_eq_m512i(r, e);
25869    }
25870
25871    #[simd_test(enable = "avx512fp16")]
25872    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
25873        let a = _mm512_set_ph(
25874            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25875            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25876            31.0, 32.0,
25877        );
25878        let src = _mm512_set_epi16(
25879            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25880            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25881        );
25882        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25883            src,
25884            0b01010101010101010101010101010101,
25885            a,
25886        );
25887        let e = _mm512_set_epi16(
25888            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25889            24, 34, 26, 36, 28, 38, 30, 40, 32,
25890        );
25891        assert_eq_m512i(r, e);
25892    }
25893
25894    #[simd_test(enable = "avx512fp16")]
25895    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
25896        let a = _mm512_set_ph(
25897            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25898            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25899            31.0, 32.0,
25900        );
25901        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25902            0b01010101010101010101010101010101,
25903            a,
25904        );
25905        let e = _mm512_set_epi16(
25906            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25907            0, 28, 0, 30, 0, 32,
25908        );
25909        assert_eq_m512i(r, e);
25910    }
25911
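    // f16 -> i32 conversions at 128/256/512-bit widths. The inputs are whole values, so the
    // default rounding mode does not affect the results; mask variants merge from `src`,
    // maskz variants zero the masked-out lanes.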
25912    #[simd_test(enable = "avx512fp16,avx512vl")]
25913    unsafe fn test_mm_cvtph_epi32() {
25914        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25915        let r = _mm_cvtph_epi32(a);
25916        let e = _mm_set_epi32(1, 2, 3, 4);
25917        assert_eq_m128i(r, e);
25918    }
25919
25920    #[simd_test(enable = "avx512fp16,avx512vl")]
25921    unsafe fn test_mm_mask_cvtph_epi32() {
25922        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25923        let src = _mm_set_epi32(10, 11, 12, 13);
25924        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
25925        let e = _mm_set_epi32(10, 2, 12, 4);
25926        assert_eq_m128i(r, e);
25927    }
25928
25929    #[simd_test(enable = "avx512fp16,avx512vl")]
25930    unsafe fn test_mm_maskz_cvtph_epi32() {
25931        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25932        let r = _mm_maskz_cvtph_epi32(0b0101, a);
25933        let e = _mm_set_epi32(0, 2, 0, 4);
25934        assert_eq_m128i(r, e);
25935    }
25936
25937    #[simd_test(enable = "avx512fp16,avx512vl")]
25938    unsafe fn test_mm256_cvtph_epi32() {
25939        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25940        let r = _mm256_cvtph_epi32(a);
25941        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
25942        assert_eq_m256i(r, e);
25943    }
25944
25945    #[simd_test(enable = "avx512fp16,avx512vl")]
25946    unsafe fn test_mm256_mask_cvtph_epi32() {
25947        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25948        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
25949        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
25950        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
25951        assert_eq_m256i(r, e);
25952    }
25953
25954    #[simd_test(enable = "avx512fp16,avx512vl")]
25955    unsafe fn test_mm256_maskz_cvtph_epi32() {
25956        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25957        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
25958        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
25959        assert_eq_m256i(r, e);
25960    }
25961
25962    #[simd_test(enable = "avx512fp16")]
25963    unsafe fn test_mm512_cvtph_epi32() {
25964        let a = _mm256_set_ph(
25965            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25966        );
25967        let r = _mm512_cvtph_epi32(a);
25968        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25969        assert_eq_m512i(r, e);
25970    }
25971
25972    #[simd_test(enable = "avx512fp16")]
25973    unsafe fn test_mm512_mask_cvtph_epi32() {
25974        let a = _mm256_set_ph(
25975            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25976        );
25977        let src = _mm512_set_epi32(
25978            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25979        );
25980        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
25981        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25982        assert_eq_m512i(r, e);
25983    }
25984
25985    #[simd_test(enable = "avx512fp16")]
25986    unsafe fn test_mm512_maskz_cvtph_epi32() {
25987        let a = _mm256_set_ph(
25988            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25989        );
25990        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
25991        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25992        assert_eq_m512i(r, e);
25993    }
25994
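    // Same f16 -> i32 conversion with an explicit rounding/SAE constant
    // (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC).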
25995    #[simd_test(enable = "avx512fp16")]
25996    unsafe fn test_mm512_cvt_roundph_epi32() {
25997        let a = _mm256_set_ph(
25998            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25999        );
26000        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26001        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26002        assert_eq_m512i(r, e);
26003    }
26004
26005    #[simd_test(enable = "avx512fp16")]
26006    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
26007        let a = _mm256_set_ph(
26008            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26009        );
26010        let src = _mm512_set_epi32(
26011            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26012        );
26013        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26014            src,
26015            0b0101010101010101,
26016            a,
26017        );
26018        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26019        assert_eq_m512i(r, e);
26020    }
26021
26022    #[simd_test(enable = "avx512fp16")]
26023    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
26024        let a = _mm256_set_ph(
26025            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26026        );
26027        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26028            0b0101010101010101,
26029            a,
26030        );
26031        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26032        assert_eq_m512i(r, e);
26033    }
26034
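    // Scalar conversions: only the lowest f16 element is converted to an i32.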
26035    #[simd_test(enable = "avx512fp16")]
26036    unsafe fn test_mm_cvtsh_i32() {
26037        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26038        let r = _mm_cvtsh_i32(a);
26039        assert_eq!(r, 1);
26040    }
26041
26042    #[simd_test(enable = "avx512fp16")]
26043    unsafe fn test_mm_cvt_roundsh_i32() {
26044        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26045        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26046        assert_eq!(r, 1);
26047    }
26048
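    // f16 -> u32 conversions; same masking behaviour as the signed variants above.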
26049    #[simd_test(enable = "avx512fp16,avx512vl")]
26050    unsafe fn test_mm_cvtph_epu32() {
26051        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26052        let r = _mm_cvtph_epu32(a);
26053        let e = _mm_set_epi32(1, 2, 3, 4);
26054        assert_eq_m128i(r, e);
26055    }
26056
26057    #[simd_test(enable = "avx512fp16,avx512vl")]
26058    unsafe fn test_mm_mask_cvtph_epu32() {
26059        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26060        let src = _mm_set_epi32(10, 11, 12, 13);
26061        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26062        let e = _mm_set_epi32(10, 2, 12, 4);
26063        assert_eq_m128i(r, e);
26064    }
26065
26066    #[simd_test(enable = "avx512fp16,avx512vl")]
26067    unsafe fn test_mm_maskz_cvtph_epu32() {
26068        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26069        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26070        let e = _mm_set_epi32(0, 2, 0, 4);
26071        assert_eq_m128i(r, e);
26072    }
26073
26074    #[simd_test(enable = "avx512fp16,avx512vl")]
26075    unsafe fn test_mm256_cvtph_epu32() {
26076        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26077        let r = _mm256_cvtph_epu32(a);
26078        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26079        assert_eq_m256i(r, e);
26080    }
26081
26082    #[simd_test(enable = "avx512fp16,avx512vl")]
26083    unsafe fn test_mm256_mask_cvtph_epu32() {
26084        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26085        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26086        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26087        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26088        assert_eq_m256i(r, e);
26089    }
26090
26091    #[simd_test(enable = "avx512fp16,avx512vl")]
26092    unsafe fn test_mm256_maskz_cvtph_epu32() {
26093        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26094        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26095        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26096        assert_eq_m256i(r, e);
26097    }
26098
26099    #[simd_test(enable = "avx512fp16")]
26100    unsafe fn test_mm512_cvtph_epu32() {
26101        let a = _mm256_set_ph(
26102            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26103        );
26104        let r = _mm512_cvtph_epu32(a);
26105        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26106        assert_eq_m512i(r, e);
26107    }
26108
26109    #[simd_test(enable = "avx512fp16")]
26110    unsafe fn test_mm512_mask_cvtph_epu32() {
26111        let a = _mm256_set_ph(
26112            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26113        );
26114        let src = _mm512_set_epi32(
26115            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26116        );
26117        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26118        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26119        assert_eq_m512i(r, e);
26120    }
26121
26122    #[simd_test(enable = "avx512fp16")]
26123    unsafe fn test_mm512_maskz_cvtph_epu32() {
26124        let a = _mm256_set_ph(
26125            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26126        );
26127        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26128        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26129        assert_eq_m512i(r, e);
26130    }
26131
26132    #[simd_test(enable = "avx512fp16")]
26133    unsafe fn test_mm512_cvt_roundph_epu32() {
26134        let a = _mm256_set_ph(
26135            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26136        );
26137        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26138        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26139        assert_eq_m512i(r, e);
26140    }
26141
26142    #[simd_test(enable = "avx512fp16")]
26143    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
26144        let a = _mm256_set_ph(
26145            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26146        );
26147        let src = _mm512_set_epi32(
26148            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26149        );
26150        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26151            src,
26152            0b0101010101010101,
26153            a,
26154        );
26155        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26156        assert_eq_m512i(r, e);
26157    }
26158
26159    #[simd_test(enable = "avx512fp16")]
26160    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
26161        let a = _mm256_set_ph(
26162            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26163        );
26164        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26165            0b0101010101010101,
26166            a,
26167        );
26168        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26169        assert_eq_m512i(r, e);
26170    }
26171
26172    #[simd_test(enable = "avx512fp16")]
26173    unsafe fn test_mm_cvtsh_u32() {
26174        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26175        let r = _mm_cvtsh_u32(a);
26176        assert_eq!(r, 1);
26177    }
26178
26179    #[simd_test(enable = "avx512fp16")]
26180    unsafe fn test_mm_cvt_roundsh_u32() {
26181        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26182        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26183        assert_eq!(r, 1);
26184    }
26185
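    // Truncating (round-toward-zero) f16 -> i32 conversions; the whole-valued inputs make the
    // truncation a no-op in these tests.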
26186    #[simd_test(enable = "avx512fp16,avx512vl")]
26187    unsafe fn test_mm_cvttph_epi32() {
26188        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26189        let r = _mm_cvttph_epi32(a);
26190        let e = _mm_set_epi32(1, 2, 3, 4);
26191        assert_eq_m128i(r, e);
26192    }
26193
26194    #[simd_test(enable = "avx512fp16,avx512vl")]
26195    unsafe fn test_mm_mask_cvttph_epi32() {
26196        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26197        let src = _mm_set_epi32(10, 11, 12, 13);
26198        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26199        let e = _mm_set_epi32(10, 2, 12, 4);
26200        assert_eq_m128i(r, e);
26201    }
26202
26203    #[simd_test(enable = "avx512fp16,avx512vl")]
26204    unsafe fn test_mm_maskz_cvttph_epi32() {
26205        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26206        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26207        let e = _mm_set_epi32(0, 2, 0, 4);
26208        assert_eq_m128i(r, e);
26209    }
26210
26211    #[simd_test(enable = "avx512fp16,avx512vl")]
26212    unsafe fn test_mm256_cvttph_epi32() {
26213        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26214        let r = _mm256_cvttph_epi32(a);
26215        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26216        assert_eq_m256i(r, e);
26217    }
26218
26219    #[simd_test(enable = "avx512fp16,avx512vl")]
26220    unsafe fn test_mm256_mask_cvttph_epi32() {
26221        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26222        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26223        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26224        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26225        assert_eq_m256i(r, e);
26226    }
26227
26228    #[simd_test(enable = "avx512fp16,avx512vl")]
26229    unsafe fn test_mm256_maskz_cvttph_epi32() {
26230        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26231        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26232        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26233        assert_eq_m256i(r, e);
26234    }
26235
26236    #[simd_test(enable = "avx512fp16")]
26237    unsafe fn test_mm512_cvttph_epi32() {
26238        let a = _mm256_set_ph(
26239            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26240        );
26241        let r = _mm512_cvttph_epi32(a);
26242        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26243        assert_eq_m512i(r, e);
26244    }
26245
26246    #[simd_test(enable = "avx512fp16")]
26247    unsafe fn test_mm512_mask_cvttph_epi32() {
26248        let a = _mm256_set_ph(
26249            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26250        );
26251        let src = _mm512_set_epi32(
26252            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26253        );
26254        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26255        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26256        assert_eq_m512i(r, e);
26257    }
26258
26259    #[simd_test(enable = "avx512fp16")]
26260    unsafe fn test_mm512_maskz_cvttph_epi32() {
26261        let a = _mm256_set_ph(
26262            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26263        );
26264        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26265        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26266        assert_eq_m512i(r, e);
26267    }
26268
26269    #[simd_test(enable = "avx512fp16")]
26270    unsafe fn test_mm512_cvtt_roundph_epi32() {
26271        let a = _mm256_set_ph(
26272            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26273        );
26274        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26275        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26276        assert_eq_m512i(r, e);
26277    }
26278
26279    #[simd_test(enable = "avx512fp16")]
26280    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
26281        let a = _mm256_set_ph(
26282            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26283        );
26284        let src = _mm512_set_epi32(
26285            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26286        );
26287        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26288        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26289        assert_eq_m512i(r, e);
26290    }
26291
26292    #[simd_test(enable = "avx512fp16")]
26293    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
26294        let a = _mm256_set_ph(
26295            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26296        );
26297        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26298        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26299        assert_eq_m512i(r, e);
26300    }
26301
26302    #[simd_test(enable = "avx512fp16")]
26303    unsafe fn test_mm_cvttsh_i32() {
26304        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26305        let r = _mm_cvttsh_i32(a);
26306        assert_eq!(r, 1);
26307    }
26308
26309    #[simd_test(enable = "avx512fp16")]
26310    unsafe fn test_mm_cvtt_roundsh_i32() {
26311        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26312        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26313        assert_eq!(r, 1);
26314    }
26315
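    // Truncating f16 -> u32 conversions.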
26316    #[simd_test(enable = "avx512fp16,avx512vl")]
26317    unsafe fn test_mm_cvttph_epu32() {
26318        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26319        let r = _mm_cvttph_epu32(a);
26320        let e = _mm_set_epi32(1, 2, 3, 4);
26321        assert_eq_m128i(r, e);
26322    }
26323
26324    #[simd_test(enable = "avx512fp16,avx512vl")]
26325    unsafe fn test_mm_mask_cvttph_epu32() {
26326        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26327        let src = _mm_set_epi32(10, 11, 12, 13);
26328        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26329        let e = _mm_set_epi32(10, 2, 12, 4);
26330        assert_eq_m128i(r, e);
26331    }
26332
26333    #[simd_test(enable = "avx512fp16,avx512vl")]
26334    unsafe fn test_mm_maskz_cvttph_epu32() {
26335        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26336        let r = _mm_maskz_cvttph_epu32(0b0101, a);
26337        let e = _mm_set_epi32(0, 2, 0, 4);
26338        assert_eq_m128i(r, e);
26339    }
26340
26341    #[simd_test(enable = "avx512fp16,avx512vl")]
26342    unsafe fn test_mm256_cvttph_epu32() {
26343        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26344        let r = _mm256_cvttph_epu32(a);
26345        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26346        assert_eq_m256i(r, e);
26347    }
26348
26349    #[simd_test(enable = "avx512fp16,avx512vl")]
26350    unsafe fn test_mm256_mask_cvttph_epu32() {
26351        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26352        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26353        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
26354        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26355        assert_eq_m256i(r, e);
26356    }
26357
26358    #[simd_test(enable = "avx512fp16,avx512vl")]
26359    unsafe fn test_mm256_maskz_cvttph_epu32() {
26360        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26361        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
26362        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26363        assert_eq_m256i(r, e);
26364    }
26365
26366    #[simd_test(enable = "avx512fp16")]
26367    unsafe fn test_mm512_cvttph_epu32() {
26368        let a = _mm256_set_ph(
26369            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26370        );
26371        let r = _mm512_cvttph_epu32(a);
26372        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26373        assert_eq_m512i(r, e);
26374    }
26375
26376    #[simd_test(enable = "avx512fp16")]
26377    unsafe fn test_mm512_mask_cvttph_epu32() {
26378        let a = _mm256_set_ph(
26379            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26380        );
26381        let src = _mm512_set_epi32(
26382            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26383        );
26384        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
26385        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26386        assert_eq_m512i(r, e);
26387    }
26388
26389    #[simd_test(enable = "avx512fp16")]
26390    unsafe fn test_mm512_maskz_cvttph_epu32() {
26391        let a = _mm256_set_ph(
26392            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26393        );
26394        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
26395        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26396        assert_eq_m512i(r, e);
26397    }
26398
26399    #[simd_test(enable = "avx512fp16")]
26400    unsafe fn test_mm512_cvtt_roundph_epu32() {
26401        let a = _mm256_set_ph(
26402            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26403        );
26404        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
26405        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26406        assert_eq_m512i(r, e);
26407    }
26408
26409    #[simd_test(enable = "avx512fp16")]
26410    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
26411        let a = _mm256_set_ph(
26412            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26413        );
26414        let src = _mm512_set_epi32(
26415            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26416        );
26417        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26418        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26419        assert_eq_m512i(r, e);
26420    }
26421
26422    #[simd_test(enable = "avx512fp16")]
26423    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
26424        let a = _mm256_set_ph(
26425            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26426        );
26427        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26428        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26429        assert_eq_m512i(r, e);
26430    }
26431
26432    #[simd_test(enable = "avx512fp16")]
26433    unsafe fn test_mm_cvttsh_u32() {
26434        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26435        let r = _mm_cvttsh_u32(a);
26436        assert_eq!(r, 1);
26437    }
26438
26439    #[simd_test(enable = "avx512fp16")]
26440    unsafe fn test_mm_cvtt_roundsh_u32() {
26441        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26442        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26443        assert_eq!(r, 1);
26444    }
26445
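    // f16 -> i64 conversions: a __m128h source supplies 2, 4 or 8 lanes depending on the
    // destination width.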
26446    #[simd_test(enable = "avx512fp16,avx512vl")]
26447    unsafe fn test_mm_cvtph_epi64() {
26448        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26449        let r = _mm_cvtph_epi64(a);
26450        let e = _mm_set_epi64x(1, 2);
26451        assert_eq_m128i(r, e);
26452    }
26453
26454    #[simd_test(enable = "avx512fp16,avx512vl")]
26455    unsafe fn test_mm_mask_cvtph_epi64() {
26456        let src = _mm_set_epi64x(3, 4);
26457        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26458        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
26459        let e = _mm_set_epi64x(3, 2);
26460        assert_eq_m128i(r, e);
26461    }
26462
26463    #[simd_test(enable = "avx512fp16,avx512vl")]
26464    unsafe fn test_mm_maskz_cvtph_epi64() {
26465        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26466        let r = _mm_maskz_cvtph_epi64(0b01, a);
26467        let e = _mm_set_epi64x(0, 2);
26468        assert_eq_m128i(r, e);
26469    }
26470
26471    #[simd_test(enable = "avx512fp16,avx512vl")]
26472    unsafe fn test_mm256_cvtph_epi64() {
26473        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26474        let r = _mm256_cvtph_epi64(a);
26475        let e = _mm256_set_epi64x(1, 2, 3, 4);
26476        assert_eq_m256i(r, e);
26477    }
26478
26479    #[simd_test(enable = "avx512fp16,avx512vl")]
26480    unsafe fn test_mm256_mask_cvtph_epi64() {
26481        let src = _mm256_set_epi64x(5, 6, 7, 8);
26482        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26483        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
26484        let e = _mm256_set_epi64x(5, 2, 7, 4);
26485        assert_eq_m256i(r, e);
26486    }
26487
26488    #[simd_test(enable = "avx512fp16,avx512vl")]
26489    unsafe fn test_mm256_maskz_cvtph_epi64() {
26490        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26491        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
26492        let e = _mm256_set_epi64x(0, 2, 0, 4);
26493        assert_eq_m256i(r, e);
26494    }
26495
26496    #[simd_test(enable = "avx512fp16")]
26497    unsafe fn test_mm512_cvtph_epi64() {
26498        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26499        let r = _mm512_cvtph_epi64(a);
26500        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26501        assert_eq_m512i(r, e);
26502    }
26503
26504    #[simd_test(enable = "avx512fp16")]
26505    unsafe fn test_mm512_mask_cvtph_epi64() {
26506        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26507        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26508        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
26509        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26510        assert_eq_m512i(r, e);
26511    }
26512
26513    #[simd_test(enable = "avx512fp16")]
26514    unsafe fn test_mm512_maskz_cvtph_epi64() {
26515        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26516        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
26517        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26518        assert_eq_m512i(r, e);
26519    }
26520
26521    #[simd_test(enable = "avx512fp16")]
26522    unsafe fn test_mm512_cvt_roundph_epi64() {
26523        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26524        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26525        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26526        assert_eq_m512i(r, e);
26527    }
26528
26529    #[simd_test(enable = "avx512fp16")]
26530    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
26531        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26532        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26533        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26534            src, 0b01010101, a,
26535        );
26536        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26537        assert_eq_m512i(r, e);
26538    }
26539
26540    #[simd_test(enable = "avx512fp16")]
26541    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
26542        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26543        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26544            0b01010101, a,
26545        );
26546        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26547        assert_eq_m512i(r, e);
26548    }
26549
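    // f16 -> u64 conversions.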
26550    #[simd_test(enable = "avx512fp16,avx512vl")]
26551    unsafe fn test_mm_cvtph_epu64() {
26552        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26553        let r = _mm_cvtph_epu64(a);
26554        let e = _mm_set_epi64x(1, 2);
26555        assert_eq_m128i(r, e);
26556    }
26557
26558    #[simd_test(enable = "avx512fp16,avx512vl")]
26559    unsafe fn test_mm_mask_cvtph_epu64() {
26560        let src = _mm_set_epi64x(3, 4);
26561        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26562        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
26563        let e = _mm_set_epi64x(3, 2);
26564        assert_eq_m128i(r, e);
26565    }
26566
26567    #[simd_test(enable = "avx512fp16,avx512vl")]
26568    unsafe fn test_mm_maskz_cvtph_epu64() {
26569        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26570        let r = _mm_maskz_cvtph_epu64(0b01, a);
26571        let e = _mm_set_epi64x(0, 2);
26572        assert_eq_m128i(r, e);
26573    }
26574
26575    #[simd_test(enable = "avx512fp16,avx512vl")]
26576    unsafe fn test_mm256_cvtph_epu64() {
26577        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26578        let r = _mm256_cvtph_epu64(a);
26579        let e = _mm256_set_epi64x(1, 2, 3, 4);
26580        assert_eq_m256i(r, e);
26581    }
26582
26583    #[simd_test(enable = "avx512fp16,avx512vl")]
26584    unsafe fn test_mm256_mask_cvtph_epu64() {
26585        let src = _mm256_set_epi64x(5, 6, 7, 8);
26586        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26587        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
26588        let e = _mm256_set_epi64x(5, 2, 7, 4);
26589        assert_eq_m256i(r, e);
26590    }
26591
26592    #[simd_test(enable = "avx512fp16,avx512vl")]
26593    unsafe fn test_mm256_maskz_cvtph_epu64() {
26594        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26595        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
26596        let e = _mm256_set_epi64x(0, 2, 0, 4);
26597        assert_eq_m256i(r, e);
26598    }
26599
26600    #[simd_test(enable = "avx512fp16")]
26601    unsafe fn test_mm512_cvtph_epu64() {
26602        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26603        let r = _mm512_cvtph_epu64(a);
26604        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26605        assert_eq_m512i(r, e);
26606    }
26607
26608    #[simd_test(enable = "avx512fp16")]
26609    unsafe fn test_mm512_mask_cvtph_epu64() {
26610        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26611        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26612        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
26613        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26614        assert_eq_m512i(r, e);
26615    }
26616
26617    #[simd_test(enable = "avx512fp16")]
26618    unsafe fn test_mm512_maskz_cvtph_epu64() {
26619        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26620        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
26621        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26622        assert_eq_m512i(r, e);
26623    }
26624
26625    #[simd_test(enable = "avx512fp16")]
26626    unsafe fn test_mm512_cvt_roundph_epu64() {
26627        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26628        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26629        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26630        assert_eq_m512i(r, e);
26631    }
26632
26633    #[simd_test(enable = "avx512fp16")]
26634    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
26635        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26636        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26637        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26638            src, 0b01010101, a,
26639        );
26640        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26641        assert_eq_m512i(r, e);
26642    }
26643
26644    #[simd_test(enable = "avx512fp16")]
26645    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
26646        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26647        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26648            0b01010101, a,
26649        );
26650        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26651        assert_eq_m512i(r, e);
26652    }
26653
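    // Truncating f16 -> i64 conversions.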
26654    #[simd_test(enable = "avx512fp16,avx512vl")]
26655    unsafe fn test_mm_cvttph_epi64() {
26656        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26657        let r = _mm_cvttph_epi64(a);
26658        let e = _mm_set_epi64x(1, 2);
26659        assert_eq_m128i(r, e);
26660    }
26661
26662    #[simd_test(enable = "avx512fp16,avx512vl")]
26663    unsafe fn test_mm_mask_cvttph_epi64() {
26664        let src = _mm_set_epi64x(3, 4);
26665        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26666        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
26667        let e = _mm_set_epi64x(3, 2);
26668        assert_eq_m128i(r, e);
26669    }
26670
26671    #[simd_test(enable = "avx512fp16,avx512vl")]
26672    unsafe fn test_mm_maskz_cvttph_epi64() {
26673        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26674        let r = _mm_maskz_cvttph_epi64(0b01, a);
26675        let e = _mm_set_epi64x(0, 2);
26676        assert_eq_m128i(r, e);
26677    }
26678
26679    #[simd_test(enable = "avx512fp16,avx512vl")]
26680    unsafe fn test_mm256_cvttph_epi64() {
26681        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26682        let r = _mm256_cvttph_epi64(a);
26683        let e = _mm256_set_epi64x(1, 2, 3, 4);
26684        assert_eq_m256i(r, e);
26685    }
26686
26687    #[simd_test(enable = "avx512fp16,avx512vl")]
26688    unsafe fn test_mm256_mask_cvttph_epi64() {
26689        let src = _mm256_set_epi64x(5, 6, 7, 8);
26690        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26691        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
26692        let e = _mm256_set_epi64x(5, 2, 7, 4);
26693        assert_eq_m256i(r, e);
26694    }
26695
26696    #[simd_test(enable = "avx512fp16,avx512vl")]
26697    unsafe fn test_mm256_maskz_cvttph_epi64() {
26698        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26699        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
26700        let e = _mm256_set_epi64x(0, 2, 0, 4);
26701        assert_eq_m256i(r, e);
26702    }
26703
26704    #[simd_test(enable = "avx512fp16")]
26705    unsafe fn test_mm512_cvttph_epi64() {
26706        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26707        let r = _mm512_cvttph_epi64(a);
26708        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26709        assert_eq_m512i(r, e);
26710    }
26711
26712    #[simd_test(enable = "avx512fp16")]
26713    unsafe fn test_mm512_mask_cvttph_epi64() {
26714        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26715        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26716        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
26717        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26718        assert_eq_m512i(r, e);
26719    }
26720
26721    #[simd_test(enable = "avx512fp16")]
26722    unsafe fn test_mm512_maskz_cvttph_epi64() {
26723        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26724        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
26725        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26726        assert_eq_m512i(r, e);
26727    }
26728
26729    #[simd_test(enable = "avx512fp16")]
26730    unsafe fn test_mm512_cvtt_roundph_epi64() {
26731        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26732        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
26733        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26734        assert_eq_m512i(r, e);
26735    }
26736
26737    #[simd_test(enable = "avx512fp16")]
26738    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
26739        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26740        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26741        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26742        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26743        assert_eq_m512i(r, e);
26744    }
26745
26746    #[simd_test(enable = "avx512fp16")]
26747    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
26748        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26749        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26750        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26751        assert_eq_m512i(r, e);
26752    }
26753
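    // Truncating f16 -> u64 conversions.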
26754    #[simd_test(enable = "avx512fp16,avx512vl")]
26755    unsafe fn test_mm_cvttph_epu64() {
26756        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26757        let r = _mm_cvttph_epu64(a);
26758        let e = _mm_set_epi64x(1, 2);
26759        assert_eq_m128i(r, e);
26760    }
26761
26762    #[simd_test(enable = "avx512fp16,avx512vl")]
26763    unsafe fn test_mm_mask_cvttph_epu64() {
26764        let src = _mm_set_epi64x(3, 4);
26765        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26766        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
26767        let e = _mm_set_epi64x(3, 2);
26768        assert_eq_m128i(r, e);
26769    }
26770
26771    #[simd_test(enable = "avx512fp16,avx512vl")]
26772    unsafe fn test_mm_maskz_cvttph_epu64() {
26773        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26774        let r = _mm_maskz_cvttph_epu64(0b01, a);
26775        let e = _mm_set_epi64x(0, 2);
26776        assert_eq_m128i(r, e);
26777    }
26778
26779    #[simd_test(enable = "avx512fp16,avx512vl")]
26780    unsafe fn test_mm256_cvttph_epu64() {
26781        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26782        let r = _mm256_cvttph_epu64(a);
26783        let e = _mm256_set_epi64x(1, 2, 3, 4);
26784        assert_eq_m256i(r, e);
26785    }
26786
26787    #[simd_test(enable = "avx512fp16,avx512vl")]
26788    unsafe fn test_mm256_mask_cvttph_epu64() {
26789        let src = _mm256_set_epi64x(5, 6, 7, 8);
26790        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26791        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
26792        let e = _mm256_set_epi64x(5, 2, 7, 4);
26793        assert_eq_m256i(r, e);
26794    }
26795
26796    #[simd_test(enable = "avx512fp16,avx512vl")]
26797    unsafe fn test_mm256_maskz_cvttph_epu64() {
26798        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26799        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
26800        let e = _mm256_set_epi64x(0, 2, 0, 4);
26801        assert_eq_m256i(r, e);
26802    }
26803
26804    #[simd_test(enable = "avx512fp16")]
26805    unsafe fn test_mm512_cvttph_epu64() {
26806        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26807        let r = _mm512_cvttph_epu64(a);
26808        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26809        assert_eq_m512i(r, e);
26810    }
26811
26812    #[simd_test(enable = "avx512fp16")]
26813    unsafe fn test_mm512_mask_cvttph_epu64() {
26814        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26815        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26816        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
26817        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26818        assert_eq_m512i(r, e);
26819    }
26820
26821    #[simd_test(enable = "avx512fp16")]
26822    unsafe fn test_mm512_maskz_cvttph_epu64() {
26823        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26824        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
26825        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26826        assert_eq_m512i(r, e);
26827    }
26828
26829    #[simd_test(enable = "avx512fp16")]
26830    unsafe fn test_mm512_cvtt_roundph_epu64() {
26831        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26832        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
26833        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26834        assert_eq_m512i(r, e);
26835    }
26836
26837    #[simd_test(enable = "avx512fp16")]
26838    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
26839        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26840        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26841        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26842        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26843        assert_eq_m512i(r, e);
26844    }
26845
26846    #[simd_test(enable = "avx512fp16")]
26847    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
26848        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26849        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26850        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26851        assert_eq_m512i(r, e);
26852    }
26853
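    // f16 -> f32 widening conversions; masked lanes are taken from the f32 `src` vector.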
26854    #[simd_test(enable = "avx512fp16,avx512vl")]
26855    unsafe fn test_mm_cvtxph_ps() {
26856        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26857        let r = _mm_cvtxph_ps(a);
26858        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
26859        assert_eq_m128(r, e);
26860    }
26861
26862    #[simd_test(enable = "avx512fp16,avx512vl")]
26863    unsafe fn test_mm_mask_cvtxph_ps() {
26864        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
26865        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26866        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
26867        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
26868        assert_eq_m128(r, e);
26869    }
26870
26871    #[simd_test(enable = "avx512fp16,avx512vl")]
26872    unsafe fn test_mm_maskz_cvtxph_ps() {
26873        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26874        let r = _mm_maskz_cvtxph_ps(0b0101, a);
26875        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
26876        assert_eq_m128(r, e);
26877    }
26878
26879    #[simd_test(enable = "avx512fp16,avx512vl")]
26880    unsafe fn test_mm256_cvtxph_ps() {
26881        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26882        let r = _mm256_cvtxph_ps(a);
26883        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26884        assert_eq_m256(r, e);
26885    }
26886
26887    #[simd_test(enable = "avx512fp16,avx512vl")]
26888    unsafe fn test_mm256_mask_cvtxph_ps() {
26889        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
26890        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26891        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
26892        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
26893        assert_eq_m256(r, e);
26894    }
26895
26896    #[simd_test(enable = "avx512fp16,avx512vl")]
26897    unsafe fn test_mm256_maskz_cvtxph_ps() {
26898        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26899        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
26900        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
26901        assert_eq_m256(r, e);
26902    }
26903
26904    #[simd_test(enable = "avx512fp16")]
26905    unsafe fn test_mm512_cvtxph_ps() {
26906        let a = _mm256_set_ph(
26907            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26908        );
26909        let r = _mm512_cvtxph_ps(a);
26910        let e = _mm512_set_ps(
26911            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26912        );
26913        assert_eq_m512(r, e);
26914    }
26915
26916    #[simd_test(enable = "avx512fp16")]
26917    unsafe fn test_mm512_mask_cvtxph_ps() {
26918        let src = _mm512_set_ps(
26919            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26920            24.0, 25.0,
26921        );
26922        let a = _mm256_set_ph(
26923            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26924        );
26925        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
26926        let e = _mm512_set_ps(
26927            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26928            16.0,
26929        );
26930        assert_eq_m512(r, e);
26931    }
26932
26933    #[simd_test(enable = "avx512fp16")]
26934    unsafe fn test_mm512_maskz_cvtxph_ps() {
26935        let a = _mm256_set_ph(
26936            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26937        );
26938        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
26939        let e = _mm512_set_ps(
26940            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26941        );
26942        assert_eq_m512(r, e);
26943    }
26944
26945    #[simd_test(enable = "avx512fp16")]
26946    unsafe fn test_mm512_cvtx_roundph_ps() {
26947        let a = _mm256_set_ph(
26948            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26949        );
26950        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
26951        let e = _mm512_set_ps(
26952            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26953        );
26954        assert_eq_m512(r, e);
26955    }
26956
26957    #[simd_test(enable = "avx512fp16")]
26958    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
26959        let src = _mm512_set_ps(
26960            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26961            24.0, 25.0,
26962        );
26963        let a = _mm256_set_ph(
26964            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26965        );
26966        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26967        let e = _mm512_set_ps(
26968            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26969            16.0,
26970        );
26971        assert_eq_m512(r, e);
26972    }
26973
26974    #[simd_test(enable = "avx512fp16")]
26975    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
26976        let a = _mm256_set_ph(
26977            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26978        );
26979        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26980        let e = _mm512_set_ps(
26981            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26982        );
26983        assert_eq_m512(r, e);
26984    }
26985
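    // Scalar f16 -> f32: the lowest half of `b` is converted into the lowest f32 lane of the
    // result, the upper lanes of `a` are copied through, and mask bit 0 selects between the
    // converted value and `src` (or zero for the maskz variant).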
26986    #[simd_test(enable = "avx512fp16")]
26987    unsafe fn test_mm_cvtsh_ss() {
26988        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
26989        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
26990        let r = _mm_cvtsh_ss(a, b);
26991        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
26992        assert_eq_m128(r, e);
26993    }
26994
26995    #[simd_test(enable = "avx512fp16")]
26996    unsafe fn test_mm_mask_cvtsh_ss() {
26997        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
26998        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
26999        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27000        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27001        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27002        assert_eq_m128(r, e);
27003        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27004        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27005        assert_eq_m128(r, e);
27006    }
27007
27008    #[simd_test(enable = "avx512fp16")]
27009    unsafe fn test_mm_maskz_cvtsh_ss() {
27010        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27011        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27012        let r = _mm_maskz_cvtsh_ss(0, a, b);
27013        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27014        assert_eq_m128(r, e);
27015        let r = _mm_maskz_cvtsh_ss(1, a, b);
27016        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27017        assert_eq_m128(r, e);
27018    }
27019
27020    #[simd_test(enable = "avx512fp16")]
27021    unsafe fn test_mm_cvt_roundsh_ss() {
27022        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27023        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27024        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27025        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27026        assert_eq_m128(r, e);
27027    }
27028
27029    #[simd_test(enable = "avx512fp16")]
27030    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27031        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27032        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27033        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27034        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27035        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27036        assert_eq_m128(r, e);
27037        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27038        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27039        assert_eq_m128(r, e);
27040    }
27041
27042    #[simd_test(enable = "avx512fp16")]
27043    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27044        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27045        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27046        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27047        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27048        assert_eq_m128(r, e);
27049        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27050        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27051        assert_eq_m128(r, e);
27052    }
27053
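    // f16 -> f64 widening conversions (2/4/8 lanes).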
27054    #[simd_test(enable = "avx512fp16,avx512vl")]
27055    unsafe fn test_mm_cvtph_pd() {
27056        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27057        let r = _mm_cvtph_pd(a);
27058        let e = _mm_set_pd(1.0, 2.0);
27059        assert_eq_m128d(r, e);
27060    }
27061
27062    #[simd_test(enable = "avx512fp16,avx512vl")]
27063    unsafe fn test_mm_mask_cvtph_pd() {
27064        let src = _mm_set_pd(10.0, 11.0);
27065        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27066        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27067        let e = _mm_set_pd(10.0, 2.0);
27068        assert_eq_m128d(r, e);
27069    }
27070
27071    #[simd_test(enable = "avx512fp16,avx512vl")]
27072    unsafe fn test_mm_maskz_cvtph_pd() {
27073        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27074        let r = _mm_maskz_cvtph_pd(0b01, a);
27075        let e = _mm_set_pd(0.0, 2.0);
27076        assert_eq_m128d(r, e);
27077    }
27078
27079    #[simd_test(enable = "avx512fp16,avx512vl")]
27080    unsafe fn test_mm256_cvtph_pd() {
27081        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27082        let r = _mm256_cvtph_pd(a);
27083        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27084        assert_eq_m256d(r, e);
27085    }
27086
27087    #[simd_test(enable = "avx512fp16,avx512vl")]
27088    unsafe fn test_mm256_mask_cvtph_pd() {
27089        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27090        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27091        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27092        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27093        assert_eq_m256d(r, e);
27094    }
27095
27096    #[simd_test(enable = "avx512fp16,avx512vl")]
27097    unsafe fn test_mm256_maskz_cvtph_pd() {
27098        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27099        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27100        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27101        assert_eq_m256d(r, e);
27102    }
27103
27104    #[simd_test(enable = "avx512fp16")]
27105    unsafe fn test_mm512_cvtph_pd() {
27106        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27107        let r = _mm512_cvtph_pd(a);
27108        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27109        assert_eq_m512d(r, e);
27110    }
27111
27112    #[simd_test(enable = "avx512fp16")]
27113    unsafe fn test_mm512_mask_cvtph_pd() {
27114        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27115        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27116        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27117        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27118        assert_eq_m512d(r, e);
27119    }
27120
27121    #[simd_test(enable = "avx512fp16")]
27122    unsafe fn test_mm512_maskz_cvtph_pd() {
27123        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27124        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27125        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27126        assert_eq_m512d(r, e);
27127    }
27128
27129    #[simd_test(enable = "avx512fp16")]
27130    unsafe fn test_mm512_cvt_roundph_pd() {
27131        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27132        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
27133        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27134        assert_eq_m512d(r, e);
27135    }
27136
27137    #[simd_test(enable = "avx512fp16")]
27138    unsafe fn test_mm512_mask_cvt_roundph_pd() {
27139        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27140        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27141        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
27142        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27143        assert_eq_m512d(r, e);
27144    }
27145
27146    #[simd_test(enable = "avx512fp16")]
27147    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
27148        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27149        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
27150        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27151        assert_eq_m512d(r, e);
27152    }
27153
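    // Scalar f16 -> f64, analogous to the scalar f32 conversions above.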
27154    #[simd_test(enable = "avx512fp16")]
27155    unsafe fn test_mm_cvtsh_sd() {
27156        let a = _mm_setr_pd(2.0, 20.0);
27157        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27158        let r = _mm_cvtsh_sd(a, b);
27159        let e = _mm_setr_pd(1.0, 20.0);
27160        assert_eq_m128d(r, e);
27161    }
27162
27163    #[simd_test(enable = "avx512fp16")]
27164    unsafe fn test_mm_mask_cvtsh_sd() {
27165        let src = _mm_setr_pd(3.0, 11.0);
27166        let a = _mm_setr_pd(2.0, 20.0);
27167        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27168        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
27169        let e = _mm_setr_pd(3.0, 20.0);
27170        assert_eq_m128d(r, e);
27171        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
27172        let e = _mm_setr_pd(1.0, 20.0);
27173        assert_eq_m128d(r, e);
27174    }
27175
27176    #[simd_test(enable = "avx512fp16")]
27177    unsafe fn test_mm_maskz_cvtsh_sd() {
27178        let a = _mm_setr_pd(2.0, 20.0);
27179        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27180        let r = _mm_maskz_cvtsh_sd(0, a, b);
27181        let e = _mm_setr_pd(0.0, 20.0);
27182        assert_eq_m128d(r, e);
27183        let r = _mm_maskz_cvtsh_sd(1, a, b);
27184        let e = _mm_setr_pd(1.0, 20.0);
27185        assert_eq_m128d(r, e);
27186    }
27187
27188    #[simd_test(enable = "avx512fp16")]
27189    unsafe fn test_mm_cvt_roundsh_sd() {
27190        let a = _mm_setr_pd(2.0, 20.0);
27191        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27192        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
27193        let e = _mm_setr_pd(1.0, 20.0);
27194        assert_eq_m128d(r, e);
27195    }
27196
27197    #[simd_test(enable = "avx512fp16")]
27198    unsafe fn test_mm_mask_cvt_roundsh_sd() {
27199        let src = _mm_setr_pd(3.0, 11.0);
27200        let a = _mm_setr_pd(2.0, 20.0);
27201        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27202        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27203        let e = _mm_setr_pd(3.0, 20.0);
27204        assert_eq_m128d(r, e);
27205        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27206        let e = _mm_setr_pd(1.0, 20.0);
27207        assert_eq_m128d(r, e);
27208    }
27209
27210    #[simd_test(enable = "avx512fp16")]
27211    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
27212        let a = _mm_setr_pd(2.0, 20.0);
27213        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27214        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
27215        let e = _mm_setr_pd(0.0, 20.0);
27216        assert_eq_m128d(r, e);
27217        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
27218        let e = _mm_setr_pd(1.0, 20.0);
27219        assert_eq_m128d(r, e);
27220    }
27221
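    // Extracting the lowest f16 element; the 42.0 placed in lane 3 presumably guards against
    // lane mix-ups.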
27222    #[simd_test(enable = "avx512fp16")]
27223    unsafe fn test_mm_cvtsh_h() {
27224        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
27225        let r = _mm_cvtsh_h(a);
27226        assert_eq!(r, 1.0);
27227    }
27228
27229    #[simd_test(enable = "avx512fp16")]
27230    unsafe fn test_mm256_cvtsh_h() {
27231        let a = _mm256_setr_ph(
27232            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27233        );
27234        let r = _mm256_cvtsh_h(a);
27235        assert_eq!(r, 1.0);
27236    }
27237
27238    #[simd_test(enable = "avx512fp16")]
27239    unsafe fn test_mm512_cvtsh_h() {
27240        let a = _mm512_setr_ph(
27241            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27242            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
27243            31.0, 32.0,
27244        );
27245        let r = _mm512_cvtsh_h(a);
27246        assert_eq!(r, 1.0);
27247    }
27248
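    // Moving the lowest 16-bit integer lane between a vector and a scalar.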
27249    #[simd_test(enable = "avx512fp16")]
27250    unsafe fn test_mm_cvtsi128_si16() {
27251        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
27252        let r = _mm_cvtsi128_si16(a);
27253        assert_eq!(r, 1);
27254    }
27255
27256    #[simd_test(enable = "avx512fp16")]
27257    unsafe fn test_mm_cvtsi16_si128() {
27258        let a = 1;
27259        let r = _mm_cvtsi16_si128(a);
27260        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
27261        assert_eq_m128i(r, e);
27262    }
27263}