// cryprot_core/block/gf128.rs

1use super::Block;
2
/// The irreducible polynomial for gf128 operations.
///
/// Encodes the low word `x^7 + x^2 + x + 1` of the full modulus
/// `x^128 + x^7 + x^2 + x + 1`; the implicit `x^128` term is handled by the
/// reduction routines.
const MOD: u64 = 0b10000111; // 0x87

// Runtime CPU-feature detection for pclmulqdq; generates the
// `target_feature_pclmulqdq` module with a cached `get()` check.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
cpufeatures::new!(target_feature_pclmulqdq, "pclmulqdq");
8
impl Block {
    /// Carryless multiplication of two Blocks as polynomials over GF(2).
    ///
    /// Returns (low, high) bits.
    ///
    /// Dispatches at runtime to the pclmulqdq implementation when available,
    /// otherwise falls back to the constant-time scalar implementation.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn clmul(&self, rhs: &Self) -> (Self, Self) {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe {
                let (low, high) = clmul::clmul128(self.into(), rhs.into());
                (low.into(), high.into())
            }
        } else {
            // Portable fallback for x86 CPUs without pclmulqdq.
            let (low, high) = scalar::clmul128(self.into(), rhs.into());
            (low.into(), high.into())
        }
    }

    /// Carryless multiplication of two Blocks as polynomials over GF(2).
    ///
    /// Returns (low, high) bits.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn clmul(&self, rhs: &Self) -> (Self, Self) {
        let (low, high) = scalar::clmul128(self.into(), rhs.into());
        (low.into(), high.into())
    }

    /// Multiplication over GF(2^128).
    ///
    /// Uses the irreducible polynomial `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn gf_mul(&self, rhs: &Self) -> Self {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe { clmul::gf128_mul(self.into(), rhs.into()).into() }
        } else {
            scalar::gf128_mul(self.into(), rhs.into()).into()
        }
    }

    /// Multiplication over GF(2^128).
    ///
    /// Uses the irreducible polynomial `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn gf_mul(&self, rhs: &Self) -> Self {
        scalar::gf128_mul(self.into(), rhs.into()).into()
    }

    /// Reduce polynomial over GF(2) by `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn gf_reduce(low: &Self, high: &Self) -> Self {
        if target_feature_pclmulqdq::get() {
            // SAFETY: pclmulqdq is available
            unsafe { clmul::gf128_reduce(low.into(), high.into()).into() }
        } else {
            scalar::gf128_reduce(low.into(), high.into()).into()
        }
    }

    /// Reduce polynomial over GF(2) by `x^128 + x^7 + x^2 + x + 1`.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    #[inline]
    pub fn gf_reduce(low: &Self, high: &Self) -> Self {
        scalar::gf128_reduce(low.into(), high.into()).into()
    }

    /// Exponentiation over GF(2^128) by square-and-multiply.
    ///
    /// Computes `self^exp` in GF(2^128); `gf_pow(0)` returns [`Block::ONE`].
    #[inline]
    pub fn gf_pow(&self, mut exp: u64) -> Block {
        let mut s = Block::ONE;
        let mut pow2 = *self;

        while exp != 0 {
            // Multiply in self^(2^i) for every set bit i of exp.
            if exp & 1 != 0 {
                s = s.gf_mul(&pow2);
            }
            // Square to advance to the next power of two.
            pow2 = pow2.gf_mul(&pow2);
            exp >>= 1;
        }
        s
    }
}
95
96#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
97mod clmul {
98    #[cfg(target_arch = "x86")]
99    use std::arch::x86::*;
100    #[cfg(target_arch = "x86_64")]
101    use std::arch::x86_64::*;
102
103    use super::MOD;
104
105    #[target_feature(enable = "pclmulqdq")]
106    #[inline]
107    pub unsafe fn gf128_mul(a: __m128i, b: __m128i) -> __m128i {
108        unsafe {
109            let (low, high) = clmul128(a, b);
110            gf128_reduce(low, high)
111        }
112    }
113
114    /// Carry-less multiply of two 128-bit numbers.
115    ///
116    /// Return (low, high) bits
117    #[target_feature(enable = "pclmulqdq")]
118    #[inline]
119    pub unsafe fn clmul128(a: __m128i, b: __m128i) -> (__m128i, __m128i) {
120        // This is currently needed because we run the nightly version of
121        // clippy where this is an unused unsafe because the used
122        // intrinsincs have been marked safe on nightly but not yet on
123        // stable.
124        #[allow(unused_unsafe)]
125        unsafe {
126            // NOTE: I tried using karatsuba but it was slightly slower than the naive
127            // multiplication
128            let ab_low = _mm_clmulepi64_si128::<0x00>(a, b);
129            let ab_high = _mm_clmulepi64_si128::<0x11>(a, b);
130            let ab_lohi1 = _mm_clmulepi64_si128::<0x01>(a, b);
131            let ab_lohi2 = _mm_clmulepi64_si128::<0x10>(a, b);
132            let ab_mid = _mm_xor_si128(ab_lohi1, ab_lohi2);
133            let low = _mm_xor_si128(ab_low, _mm_slli_si128::<8>(ab_mid));
134            let high = _mm_xor_si128(ab_high, _mm_srli_si128::<8>(ab_mid));
135            (low, high)
136        }
137    }
138
139    #[target_feature(enable = "pclmulqdq")]
140    #[inline]
141    pub unsafe fn gf128_reduce(mut low: __m128i, mut high: __m128i) -> __m128i {
142        // NOTE: I tried a sse shift based reduction but it was slower than the clmul
143        // implementation
144        unsafe {
145            let modulus = [MOD, 0];
146            let modulus = _mm_loadu_si64(modulus.as_ptr().cast());
147
148            let tmp = _mm_clmulepi64_si128::<0x01>(high, modulus);
149            let tmp_shifted = _mm_slli_si128::<8>(tmp);
150            low = _mm_xor_si128(low, tmp_shifted);
151            high = _mm_xor_si128(high, tmp_shifted);
152
153            // reduce overflow
154            let tmp = _mm_clmulepi64_si128::<0x01>(tmp, modulus);
155            low = _mm_xor_si128(low, tmp);
156
157            let tmp = _mm_clmulepi64_si128::<0x00>(high, modulus);
158            _mm_xor_si128(low, tmp)
159        }
160    }
161
162    #[cfg(all(test, target_feature = "pclmulqdq"))]
163    mod test {
164        use std::{arch::x86_64::__m128i, mem::transmute};
165
166        use crate::block::gf128::clmul::{clmul128, gf128_mul, gf128_reduce};
167
168        #[test]
169        fn test_gf128_mul_zero() {
170            unsafe {
171                let a = transmute(0x19831239123916248127031273012381_u128);
172                let b = transmute(0_u128);
173                let exp = 0_u128;
174                let mul = transmute(gf128_mul(a, b));
175                assert_eq!(exp, mul);
176            }
177        }
178
179        #[test]
180        fn test_gf128_mul_onw() {
181            unsafe {
182                let a = transmute(0x19831239123916248127031273012381_u128);
183                let b = transmute(0x1_u128);
184                let exp = 0x19831239123916248127031273012381_u128;
185                let mul = transmute(gf128_mul(a, b));
186                assert_eq!(exp, mul);
187            }
188        }
189
190        #[test]
191        fn test_gf128_mul() {
192            unsafe {
193                let a = transmute(0x19831239123916248127031273012381_u128);
194                let b = transmute(0xabcdef0123456789abcdef0123456789_u128);
195                let exp = 0x63a033d0ed643e85153c50f4268a7d9_u128;
196                let mul = transmute(gf128_mul(a, b));
197                assert_eq!(exp, mul);
198            }
199        }
200
201        #[test]
202        fn test_clmul128() {
203            unsafe {
204                let a: __m128i = transmute(0x19831239123916248127031273012381_u128);
205                let b: __m128i = transmute(0xabcdef0123456789abcdef0123456789_u128);
206                let (low, high) = clmul128(a, b);
207                let [low, high] = transmute([low, high]);
208                let exp_low: u128 = 0xa5de9b50e6db7b5147e92b99ee261809;
209                let exp_high: u128 = 0xf1d6d37d58114afed2addfedd7c77f7;
210                assert_eq!(exp_low, low);
211                assert_eq!(exp_high, high);
212            }
213        }
214
215        #[test]
216        fn test_gf128_reduce() {
217            unsafe {
218                // test vectors computed using sage
219                let low: __m128i = transmute(0x0123456789abcdef0123456789abcdef_u128);
220                let high: __m128i = transmute(0xabcdef0123456789abcdef0123456789_u128);
221                let exp = 0xb4b548f1c3c23f86b4b548f1c3c21572_u128;
222                let res: u128 = transmute(gf128_reduce(low, high));
223
224                println!("res: {res:b}");
225                println!("exp: {exp:b}");
226                assert_eq!(exp, res);
227            }
228        }
229    }
230
231    #[cfg(all(test, feature = "nightly", target_feature = "pclmulqdq"))]
232    mod benches {
233        extern crate test;
234
235        use std::mem::transmute;
236
237        use criterion::black_box;
238        use rand::{Rng, rng};
239        use test::Bencher;
240
241        #[bench]
242        fn bench_gf128_mul(b: &mut Bencher) {
243            let [low, high] = unsafe { transmute(rng().random::<[u128; 2]>()) };
244            b.iter(|| black_box(unsafe { super::gf128_mul(black_box(low), black_box(high)) }));
245        }
246
247        #[bench]
248        fn bench_gf128_reduce(b: &mut Bencher) {
249            let [low, high] = unsafe { transmute(rng().random::<[u128; 2]>()) };
250            b.iter(|| black_box(unsafe { super::gf128_reduce(black_box(low), black_box(high)) }));
251        }
252    }
253}
254
// used in tests, but if we're not compiling tests these will otherwise be
// flagged as unused
#[allow(dead_code)]
mod scalar {
    /// Multiplication over GF(2^128).
    ///
    /// Uses the irreducible polynomial `x^128 + x^7 + x^2 + x + 1`.
    #[inline]
    pub fn gf128_mul(a: u128, b: u128) -> u128 {
        let (low, high) = clmul128(a, b);
        gf128_reduce(low, high)
    }

    /// Carry-less multiply of two 128-bit numbers.
    ///
    /// Return (low, high) bits
    #[inline]
    pub fn clmul128(a: u128, b: u128) -> (u128, u128) {
        let (a_low, a_high) = (a as u64, (a >> 64) as u64);
        let (b_low, b_high) = (b as u64, (b >> 64) as u64);

        // Use karatsuba multiplication
        let ab_low = clmul64(a_low, b_low);
        let ab_high = clmul64(a_high, b_high);
        let ab_mid = clmul64(a_low ^ a_high, b_low ^ b_high) ^ ab_low ^ ab_high;
        let low = ab_low ^ (ab_mid << 64);
        let high = ab_high ^ (ab_mid >> 64);
        (low, high)
    }

    // Adapted from https://github.com/RustCrypto/universal-hashes/blob/802b40974a08bbd2663c63780fc87a23ee931868/polyval/src/backend/soft64.rs#L201C1-L227C2
    // Uses the technique described in https://www.bearssl.org/constanttime.html#ghash-for-gcm
    // but directly outputs the 128 bits without needing the Rev trick.
    // This method is constant time and significantly faster than iterating over the
    // bits of y and xoring shifted x.
    /// Multiplication in GF(2)[X] with “holes”
    /// (sequences of zeroes) to avoid carry spilling.
    ///
    /// When carries do occur, they wind up in a "hole" and are subsequently
    /// masked out of the result.
    #[inline]
    fn clmul64(x: u64, y: u64) -> u128 {
        let x0 = (x & 0x1111_1111_1111_1111) as u128;
        let x1 = (x & 0x2222_2222_2222_2222) as u128;
        let x2 = (x & 0x4444_4444_4444_4444) as u128;
        let x3 = (x & 0x8888_8888_8888_8888) as u128;
        let y0 = (y & 0x1111_1111_1111_1111) as u128;
        let y1 = (y & 0x2222_2222_2222_2222) as u128;
        let y2 = (y & 0x4444_4444_4444_4444) as u128;
        let y3 = (y & 0x8888_8888_8888_8888) as u128;

        let mut z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
        let mut z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
        let mut z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
        let mut z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);

        z0 &= 0x1111_1111_1111_1111_1111_1111_1111_1111;
        z1 &= 0x2222_2222_2222_2222_2222_2222_2222_2222;
        z2 &= 0x4444_4444_4444_4444_4444_4444_4444_4444;
        z3 &= 0x8888_8888_8888_8888_8888_8888_8888_8888;

        z0 | z1 | z2 | z3
    }

    /// Generated by ChatGPT o3-mini and reviewed by me.
    /// Reduces a 256-bit value (given as two u128 words, `high` and `low`)
    /// modulo the irreducible polynomial f(x) = x^128 + x^7 + x^2 + x + 1.
    ///
    /// That is, it computes:
    ///      low ^ reduce(high * (x^7 + x^2 + x + 1))
    /// since x^128 ≡ x^7 + x^2 + x + 1 (mod f(x)).
    #[inline]
    pub fn gf128_reduce(low: u128, high: u128) -> u128 {
        // Helper: performs a left shift on a 128-bit word and returns
        // a tuple (overflow, lower) where:
        //    x << shift = (overflow << 128) | lower.
        #[inline]
        fn shift_u128(x: u128, shift: u32) -> (u128, u128) {
            // For 0 < shift < 128.
            let overflow = x >> (128 - shift);
            let lower = x << shift;
            (overflow, lower)
        }

        // For the reduction, note that:
        //   x^128 ≡ x^7 + x^2 + x + 1 (mod f(x)).
        // So the contribution of the high word is:
        //   (high << 7) ^ (high << 2) ^ (high << 1) ^ high,
        // but each shift must be computed as a 256–bit quantity.
        let (ov7, lo7) = shift_u128(high, 7);
        let (ov2, lo2) = shift_u128(high, 2);
        let (ov1, lo1) = shift_u128(high, 1);
        let lo0 = high; // equivalent to shift 0

        // Combine the 128-bit parts of each term.
        let combined_low = lo7 ^ lo2 ^ lo1 ^ lo0;
        // Combine the overflow (upper) parts.
        let combined_overflow = ov7 ^ ov2 ^ ov1;

        // The bits in `combined_overflow` represent extra contributions from bits
        // at positions ≥ 128. Since they are at most 7 bits wide, we can reduce them
        // by multiplying with the reduction polynomial (i.e. shifting and XORing):
        let reduced_overflow = (combined_overflow << 7)
            ^ (combined_overflow << 2)
            ^ (combined_overflow << 1)
            ^ combined_overflow;

        // The full contribution from `high` is then given by the low part
        // combined with the reduced overflow.
        let poly_contrib = combined_low ^ reduced_overflow;

        // Finally, reduce the entire 256-bit value by XORing in the contribution.
        low ^ poly_contrib
    }

    #[cfg(test)]
    mod tests {
        use super::{clmul128, gf128_mul, gf128_reduce};

        #[test]
        fn test_gf128_mul_zero() {
            let a = 0x19831239123916248127031273012381;
            let b = 0;
            let exp = 0;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_mul_one() {
            let a = 0x19831239123916248127031273012381;
            let b = 1;
            let exp = 0x19831239123916248127031273012381;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_mul() {
            let a = 0x19831239123916248127031273012381;
            let b = 0xabcdef0123456789abcdef0123456789;
            let exp = 0x63a033d0ed643e85153c50f4268a7d9;
            let mul = gf128_mul(a, b);
            assert_eq!(exp, mul);
        }

        #[test]
        fn test_gf128_reduce_zero() {
            assert_eq!(gf128_reduce(0, 0), 0);
        }

        #[test]
        fn test_gf128_reduce_low_only() {
            assert_eq!(gf128_reduce(1, 0), 1);
            assert_eq!(gf128_reduce(0x87, 0), 0x87); // Reduction polynomial itself.
            assert_eq!(gf128_reduce(0xFFFFFFFFFFFFFFFF, 0), 0xFFFFFFFFFFFFFFFF);
        }

        #[test]
        fn test_gf128_reduce_high_only() {
            // high << 64
            assert_eq!(gf128_reduce(0, 1), 0x87);
            assert_eq!(gf128_reduce(0, 2), 0x87 << 1);
            assert_eq!(gf128_reduce(0, 3), (0x87 << 1) ^ 0x87);

            assert_eq!(gf128_reduce(0, 1 << 63), 0x87 << 63);
        }

        #[test]
        fn test_gf128_reduce_overflow() {
            let high = u128::MAX; // All bits set in high
            let low = u128::MAX; // All bits set in low.
            assert_eq!(gf128_reduce(low, high), 0xffffffffffffffffffffffffffffc071);
        }

        #[test]
        fn test_gf128_reduce() {
            // test vectors computed using sage
            let low = 0x0123456789abcdef0123456789abcdef;
            let high = 0xabcdef0123456789abcdef0123456789;
            let exp = 0xb4b548f1c3c23f86b4b548f1c3c21572;
            let res = gf128_reduce(low, high);

            println!("res: {res:b}");
            println!("exp: {exp:b}");
            assert_eq!(exp, res);
        }

        #[test]
        fn test_clmul128() {
            let a = 0x19831239123916248127031273012381;
            let b = 0xabcdef0123456789abcdef0123456789;
            let (low, high) = clmul128(a, b);
            let exp_low = 0xa5de9b50e6db7b5147e92b99ee261809;
            let exp_high = 0xf1d6d37d58114afed2addfedd7c77f7;
            assert_eq!(exp_low, low);
            assert_eq!(exp_high, high);
        }
    }

    #[cfg(all(test, feature = "nightly"))]
    mod benches {
        extern crate test;

        use criterion::black_box;
        use rand::{Rng, rng};
        use test::Bencher;

        #[bench]
        fn bench_gf128_mul(b: &mut Bencher) {
            let [low, high] = rng().random::<[u128; 2]>();
            b.iter(|| black_box(super::gf128_mul(black_box(low), black_box(high))));
        }

        #[bench]
        fn bench_gf128_reduce(b: &mut Bencher) {
            let [low, high] = rng().random::<[u128; 2]>();
            b.iter(|| black_box(super::gf128_reduce(black_box(low), black_box(high))));
        }
    }
}
473
/// Test that scalar implementation and clmul implementation produce the same
/// results
#[cfg(all(test, not(miri), target_feature = "pclmulqdq"))]
mod scalar_simd_tests {
    use std::mem::transmute;

    use rand::{Rng, rng};

    use super::{clmul, scalar};

    #[test]
    fn test_clmul128() {
        for _ in 0..1000 {
            let (a, b) = rng().random::<(u128, u128)>();
            unsafe {
                let clmul_res = clmul::clmul128(transmute(a), transmute(b));
                let scalar_res = scalar::clmul128(a, b);
                // Compare both halves of the product; the high half was
                // previously left unchecked.
                assert_eq!(scalar_res.0, transmute(clmul_res.0));
                assert_eq!(scalar_res.1, transmute(clmul_res.1));
            }
        }
    }

    #[test]
    fn test_gf128_reduce() {
        for _ in 0..1000 {
            let (a, b) = rng().random::<(u128, u128)>();
            unsafe {
                let clmul_res = clmul::gf128_reduce(transmute(a), transmute(b));
                let scalar_res = scalar::gf128_reduce(a, b);
                assert_eq!(scalar_res, transmute(clmul_res));
            }
        }
    }

    #[test]
    fn test_gf128_mul() {
        for _ in 0..1000 {
            let (a, b) = rng().random::<(u128, u128)>();
            unsafe {
                let clmul_res = clmul::gf128_mul(transmute(a), transmute(b));
                let scalar_res = scalar::gf128_mul(a, b);
                assert_eq!(scalar_res, transmute(clmul_res));
            }
        }
    }
}
520
#[cfg(test)]
mod tests {
    use crate::Block;

    /// `gf_pow` must agree with repeated `gf_mul` for small exponents.
    #[test]
    fn test_gf_pow() {
        let base: Block = 24646523424323_u128.into();
        let squared = base.gf_mul(&base);
        let cubed = base.gf_mul(&squared);

        assert_eq!(Block::ONE, base.gf_pow(0));
        assert_eq!(base, base.gf_pow(1));
        assert_eq!(squared, base.gf_pow(2));
        assert_eq!(cubed, base.gf_pow(3));
    }
}
533}