core\stdarch\crates\core_arch\src\x86/
vpclmulqdq.rs

1//! Vectorized Carry-less Multiplication (VCLMUL)
2//!
3//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
4//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
5//!
6//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
7
8use crate::core_arch::x86::__m256i;
9use crate::core_arch::x86::__m512i;
10
11#[cfg(test)]
12use stdarch_test::assert_instr;
13
14#[allow(improper_ctypes)]
15unsafe extern "C" {
16    #[link_name = "llvm.x86.pclmulqdq.256"]
17    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
18    #[link_name = "llvm.x86.pclmulqdq.512"]
19    fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
20}
21
// For some odd reason, on x86_64 we generate the correct long-name instructions,
// but on i686 we generate the short name + imm8,
// so we need to special-case on that...
25
26/// Performs a carry-less multiplication of two 64-bit polynomials over the
27/// finite field GF(2) - in each of the 4 128-bit lanes.
28///
29/// The immediate byte is used for determining which halves of each lane `a` and `b`
30/// should be used. Immediate bits other than 0 and 4 are ignored.
31/// All lanes share immediate byte.
32///
33/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128)
34#[inline]
35#[target_feature(enable = "vpclmulqdq,avx512f")]
36#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
37// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
38#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
39#[rustc_legacy_const_generics(2)]
40pub fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
41    static_assert_uimm_bits!(IMM8, 8);
42    unsafe { pclmulqdq_512(a, b, IMM8 as u8) }
43}
44
45/// Performs a carry-less multiplication of two 64-bit polynomials over the
46/// finite field GF(2) - in each of the 2 128-bit lanes.
47///
48/// The immediate byte is used for determining which halves of each lane `a` and `b`
49/// should be used. Immediate bits other than 0 and 4 are ignored.
50/// All lanes share immediate byte.
51///
52/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128)
53#[inline]
54#[target_feature(enable = "vpclmulqdq")]
55#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
56#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
57#[rustc_legacy_const_generics(2)]
58pub fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
59    static_assert_uimm_bits!(IMM8, 8);
60    unsafe { pclmulqdq_256(a, b, IMM8 as u8) }
61}
62
#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    // Known-answer test shared by both vector widths.
    //
    // * `$broadcast` copies a 128-bit vector into every lane of the wide vector.
    // * `$clmul` is the wide carry-less multiply intrinsic under test.
    // * `$assert` compares two wide vectors for equality.
    //
    // Because every lane holds the same operands, every lane of the result
    // must equal the 128-bit reference answer.
    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
            let a = $broadcast(a);
            let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
            let b = $broadcast(b);
            let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
            let r00 = $broadcast(r00);
            let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
            let r01 = $broadcast(r01);
            let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
            let r10 = $broadcast(r10);
            let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
            let r11 = $broadcast(r11);

            // One check per combination of the two selector bits (0 and 4).
            $assert($clmul::<0x00>(a, b), r00);
            $assert($clmul::<0x10>(a, b), r01);
            $assert($clmul::<0x01>(a, b), r10);
            $assert($clmul::<0x11>(a, b), r11);

            // Squaring a value that has only its top bit set in the low half.
            let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
            let a0 = $broadcast(a0);
            let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
            let r = $broadcast(r);
            $assert($clmul::<0x00>(a0, a0), r);
        };
    }

    // Expands a per-lane operation into one statement per lane index. This is
    // needed because the lane index of the extract intrinsics is a const
    // generic and therefore cannot come from a runtime loop variable.
    //
    // The `4` / `2` written at the call site only selects the arm (lane count);
    // the generated statements use indices `count - 1` down to `0`. The
    // 4-lane arms emit lanes 3 and 2 and then defer to the 2-lane arms for
    // lanes 1 and 0.
    macro_rules! unroll {
        // target[i] = op::<i>(source) for i in 0..4
        ($target:ident[4] = $op:ident::<4>($source:ident);) => {
            $target[3] = $op::<3>($source);
            $target[2] = $op::<2>($source);
            unroll! {$target[2] = $op::<2>($source);}
        };
        // target[i] = op::<i>(source) for i in 0..2
        ($target:ident[2] = $op:ident::<2>($source:ident);) => {
            $target[1] = $op::<1>($source);
            $target[0] = $op::<0>($source);
        };
        // assert_eq_m128i(op::<i>(vec_res), lin_res[i]) for i in 0..4
        (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
            assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
            assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
            unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
        };
        // assert_eq_m128i(op::<i>(vec_res), lin_res[i]) for i in 0..2
        (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
            assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
            assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
        };
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes
    //
    // `linear` is the 128-bit reference operation and `vectorized` the 512-bit
    // operation under test; both must have been instantiated with the same
    // immediate by the caller.
    #[target_feature(enable = "vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both operands into their four 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 4];
        unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 4];
        unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}

        let r = vectorized(a, b);

        // Expected result: the 128-bit operation applied lane by lane.
        let mut e_decomp = [_mm_setzero_si128(); 4];
        for i in 0..4 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes for the 256-bit version
    //
    // `_mm256_clmulepi64_epi128` is enabled by `vpclmulqdq` alone, so this
    // helper deliberately avoids AVX-512 intrinsics; AVX2 is only needed for
    // the 128-bit lane extraction used to compare against `linear`.
    #[target_feature(enable = "vpclmulqdq,avx2")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        // Same values as the low 256 bits of the vectors in `verify_512_helper`.
        let a = _mm256_set_epi64x(
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm256_set_epi64x(
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both operands into their two 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 2];
        unroll! {a_decomp[2] = _mm256_extracti128_si256::<2>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 2];
        unroll! {b_decomp[2] = _mm256_extracti128_si256::<2>(b);}

        let r = vectorized(a, b);

        // Expected result: the 128-bit operation applied lane by lane.
        let mut e_decomp = [_mm_setzero_si128(); 2];
        for i in 0..2 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
    }

    #[simd_test(enable = "vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        verify_kat_pclmul!(
            _mm512_broadcast_i32x4,
            _mm512_clmulepi64_epi128,
            assert_eq_m512i
        );

        // Cross-check every selector combination against the 128-bit intrinsic.
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
        );
    }

    // Gated on AVX2 rather than AVX-512VL so the test also runs on CPUs that
    // have VPCLMULQDQ without AVX-512.
    #[simd_test(enable = "vpclmulqdq,avx2")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        verify_kat_pclmul!(
            _mm256_broadcastsi128_si256,
            _mm256_clmulepi64_epi128,
            assert_eq_m256i
        );

        // Cross-check every selector combination against the 128-bit intrinsic.
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
        );
    }
}