From 9b623852bde3aff065f27a7872b4d57327bbc00a Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Sun, 13 Nov 2022 18:42:38 +0100 Subject: [PATCH] Rewrite the AES256-GCM implementation Faster, way more readable, easier to port to other architectures. --- AUTHORS | 3 +- .../aes256gcm/aesni/aead_aes256gcm_aesni.c | 1727 +++++++++-------- test/default/aead_aes256gcm.c | 37 +- 3 files changed, 961 insertions(+), 806 deletions(-) diff --git a/AUTHORS b/AUTHORS index 094ee476..04763334 100644 --- a/AUTHORS +++ b/AUTHORS @@ -53,8 +53,7 @@ crypto_aead/aegis256 Hongjun Wu Adrien Gallouet Frank Denis -crypto_aead/aes256gcm/aesni Romain Dolbeau - Frank Denis +crypto_aead/aes256gcm/aesni Frank Denis crypto_aead/chacha20poly1305 Frank Denis diff --git a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c index c5f80a44..5ac3827b 100644 --- a/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +++ b/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c @@ -6,12 +6,14 @@ */ #include +#include #include #include #include #include "core.h" #include "crypto_aead_aes256gcm.h" +#include "crypto_verify_16.h" #include "export.h" #include "private/common.h" #include "randombytes.h" @@ -20,70 +22,175 @@ #if defined(HAVE_TMMINTRIN_H) && defined(HAVE_WMMINTRIN_H) +#ifdef __x86_64__ #ifdef __GNUC__ -# pragma GCC target("ssse3") -# pragma GCC target("aes") -# pragma GCC target("pclmul") +#pragma GCC target("ssse3") +#pragma GCC target("aes") +#pragma GCC target("pclmul") #endif - #include #include -#include "private/sse2_64_32.h" - -#if defined(__INTEL_COMPILER) || defined(_bswap64) -#elif defined(_MSC_VER) -# define _bswap64(a) _byteswap_uint64(a) -#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -# define _bswap64(a) __builtin_bswap64(a) +#elif defined(__aarch64__) +#include #else -static inline uint64_t -_bswap64(const uint64_t x) -{ - return ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) | - ((x << 24) & 0x0000FF0000000000UL) | ((x << 8) & 0x000000FF00000000UL) | - ((x >> 8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) | - ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL); -} +#error Unsupported architecture #endif -typedef struct aes256gcm_state { - __m128i rkeys[16]; - unsigned char H[16]; -} aes256gcm_state; +#define ABYTES crypto_aead_aes256gcm_ABYTES +#define NPUBBYTES crypto_aead_aes256gcm_NPUBBYTES +#define KEYBYTES crypto_aead_aes256gcm_KEYBYTES -static inline void -aesni_key256_expand(const unsigned char *key, __m128i *const rkeys) +#ifdef __x86_64__ +#define PARALLEL_BLOCKS 7 +#undef USE_KARATSUBA_MULTIPLICATION + +typedef __m128i BlockVec; + +#define LOAD128(a) _mm_loadu_si128((const BlockVec *) (a)) +#define STORE128(a, b) _mm_storeu_si128((BlockVec *) (a), (b)) +#define AES_ENCRYPT(block_vec, rkey) _mm_aesenc_si128((block_vec), (rkey)) +#define AES_ENCRYPTLAST(block_vec, rkey) _mm_aesenclast_si128((block_vec), (rkey)) +#define AES_KEYGEN(block_vec, rc) _mm_aeskeygenassist_si128((block_vec), (rc)) +#define XOR128(a, b) _mm_xor_si128((a), (b)) +#define AND128(a, b) _mm_and_si128((a), (b)) +#define OR128(a, b) _mm_or_si128((a), (b)) +#define SET64x2(a, b) _mm_set_epi64x((uint64_t) (a), (uint64_t) (b)) +#define ZERO128 _mm_setzero_si128() +#define ONE128 SET64x2(0, 1) +#define TWO128 SET64x2(0, 2) +#define ADD64x2(a, b) _mm_add_epi64((a), (b)) +#define SUB64x2(a, b) _mm_sub_epi64((a), (b)) +#define SHL64x2(a, 
b) _mm_slli_epi64((a), (b)) +#define SHR64x2(a, b) _mm_srli_epi64((a), (b)) +#define REV128(x) \ + _mm_shuffle_epi8((x), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) +#define SHUFFLE32x4(x, a, b, c, d) _mm_shuffle_epi32((x), _MM_SHUFFLE((d), (c), (b), (a))) +#define BYTESHL128(a, b) _mm_slli_si128(a, b) +#define BYTESHR128(a, b) _mm_srli_si128(a, b) +#define SHL128(a, b) OR128(SHL64x2((a), (b)), SHR64x2(BYTESHL128((a), 8), 64 - (b))) +#define CLMULLO128(a, b) _mm_clmulepi64_si128((a), (b), 0x00) +#define CLMULHI128(a, b) _mm_clmulepi64_si128((a), (b), 0x11) +#define CLMULLOHI128(a, b) _mm_clmulepi64_si128((a), (b), 0x10) +#define CLMULHILO128(a, b) _mm_clmulepi64_si128((a), (b), 0x01) +#define PREFETCH_READ(x) __builtin_prefetch((x), 0, 2) +#define PREFETCH_WRITE(x) __builtin_prefetch((x), 1, 2); + +#elif defined(HAVE_ARMCRYPTO) + +#define PARALLEL_BLOCKS 6 +#undef USE_KARATSUBA_MULTIPLICATION + +typedef uint64x2_t BlockVec; + +#define LOAD128(a) vld1q_u64((const void *) a) +#define STORE128(a, b) vst1q_u64(((void *) a), (b)) +#define AES_ENCRYPT(block_vec, rkey) \ + vreinterpretq_u64_u8( \ + veorq_u8(vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_u64(block_vec), vmovq_n_u8(0))), rkey)) +#define AES_ENCRYPTLAST(block_vec, rkey) \ + vreinterpretq_u64_u8(veorq_u8(vaeseq_u8(vreinterpretq_u8_u64(block_vec), vmovq_n_u8(0)), rkey)) +#define XOR128(a, b) veorq_u64((a), (b)) +#define AND128(a, b) vandq_u64((a), (b)) +#define OR128(a, b) vorrq_u64((a), (b)) +#define SET64x2(a, b) vsetq_lane_u64((uint64_t) (a), vmovq_n_u64((uint64_t) (b)), 1) +#define ZERO128 vmovq_n_u8(0) +#define ONE128 SET64x2(0, 1) +#define TWO128 SET64x2(0, 2) +#define ADD64x2(a, b) vaddq_u64((a), (b)) +#define SUB64x2(a, b) vsubq_u64((a), (b)) +#define SHL64x2(a, b) vshlq_n_u64((a), (b)) +#define SHR64x2(a, b) vshrq_n_u64((a), (b)) +#define REV128(x) \ + vreinterpretq_u64_u8(__builtin_shufflevector(vreinterpretq_u8_u64(x), vreinterpretq_u8_u64(x), \ + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, \ + 1, 0)) +#define SHUFFLE32x4(x, a, b, c, d) \ + vreinterpretq_u64_u32(__builtin_shufflevector(vreinterpretq_u32_u64(x), \ + vreinterpretq_u32_u64(x), (a), (b), (c), (d))) +#define BYTESHL128(a, b) vreinterpretq_u64_u8(vextq_s8(vdupq_n_s8(0), (int8x16_t) a, 16 - (b))) +#define BYTESHR128(a, b) vreinterpretq_u64_u8(vextq_s8((int8x16_t) a, vdupq_n_s8(0), (b))) + +#define SHL128(a, b) OR128(SHL64x2((a), (b)), SHR64x2(BYTESHL128((a), 8), 64 - (b))) +#define CLMULLO128(a, b) \ + vreinterpretq_u64_p128(vmull_p64((poly64_t) vget_low_u64(a), (poly64_t) vget_low_u64(b))) +#define CLMULHI128(a, b) \ + vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_s64(a), vreinterpretq_p64_s64(b))) +#define CLMULLOHI128(a, b) \ + vreinterpretq_u64_p128(vmull_p64((poly64_t) vget_low_u64(a), (poly64_t) vget_high_u64(b))) +#define CLMULHILO128(a, b) \ + vreinterpretq_u64_p128(vmull_p64((poly64_t) vget_high_u64(a), (poly64_t) vget_low_u64(b))) +#define PREFETCH_READ(x) __builtin_prefetch((x), 0, 2) +#define PREFETCH_WRITE(x) __builtin_prefetch((x), 1, 2); + +static inline BlockVec +AES_KEYGEN(BlockVec block_vec, const int rc) { - __m128i X0, X1, X2, X3; - int i = 0; + uint8x16_t a = vaeseq_u8(vreinterpretq_u8_u64(block_vec), vmovq_n_u8(0)); + const uint8x16_t b = + __builtin_shufflevector(a, a, 4, 1, 14, 11, 1, 14, 11, 4, 12, 9, 6, 3, 9, 6, 3, 12); + const uint64x2_t c = SET64x2((uint64_t) rc << 32, (uint64_t) rc << 32); + return XOR128(b, c); +} - X0 = _mm_loadu_si128((const __m128i *) &key[0]); - rkeys[i++] = X0; +#else +#error 
Unsupported architecture +#endif - X2 = _mm_loadu_si128((const __m128i *) &key[16]); - rkeys[i++] = X2; +#define ROUNDS 14 -#define EXPAND_KEY_1(S) \ - do { \ - X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X2, (S)), 0xff); \ - X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x10)); \ - X0 = _mm_xor_si128(X0, X3); \ - X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x8c)); \ - X0 = _mm_xor_si128(_mm_xor_si128(X0, X3), X1); \ - rkeys[i++] = X0; \ - } while (0) +#define PC_COUNT (2 * PARALLEL_BLOCKS) -#define EXPAND_KEY_2(S) \ - do { \ - X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X0, (S)), 0xaa); \ - X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x10)); \ - X2 = _mm_xor_si128(X2, X3); \ - X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x8c)); \ - X2 = _mm_xor_si128(_mm_xor_si128(X2, X3), X1); \ - rkeys[i++] = X2; \ - } while (0) +typedef struct I256 { + BlockVec hi; + BlockVec lo; + BlockVec mid; +} I256; - X3 = _mm_setzero_si128(); +static inline I256 +XOR256(const I256 a, const I256 b) +{ + return (I256) { + .hi = XOR128(a.hi, b.hi), + .lo = XOR128(a.lo, b.lo), + .mid = XOR128(a.mid, b.mid), + }; +} + +typedef BlockVec Precomp; + +typedef struct GHash { + BlockVec acc; +} GHash; + +typedef struct State { + BlockVec rkeys[ROUNDS + 1]; + Precomp hx[PC_COUNT]; +} State; + +static void +expand256(const unsigned char key[KEYBYTES], BlockVec rkeys[1 + ROUNDS]) +{ + BlockVec t1, t2, s; + size_t i = 0; + +#define EXPAND_KEY_1(RC) \ + rkeys[i++] = t2; \ + s = AES_KEYGEN(t2, RC); \ + t1 = XOR128(t1, BYTESHL128(t1, 4)); \ + t1 = XOR128(t1, BYTESHL128(t1, 8)); \ + t1 = XOR128(t1, SHUFFLE32x4(s, 3, 3, 3, 3)); + +#define EXPAND_KEY_2(RC) \ + rkeys[i++] = t1; \ + s = AES_KEYGEN(t1, RC); \ + t2 = XOR128(t2, BYTESHL128(t2, 4)); \ + t2 = XOR128(t2, BYTESHL128(t2, 8)); \ + t2 = XOR128(t2, SHUFFLE32x4(s, 2, 2, 2, 2)); + + t1 = LOAD128(&key[0]); + t2 = LOAD128(&key[16]); + + rkeys[i++] = t1; EXPAND_KEY_1(0x01); EXPAND_KEY_2(0x01); EXPAND_KEY_1(0x02); @@ -97,498 +204,635 @@ aesni_key256_expand(const unsigned char *key, __m128i *const rkeys) EXPAND_KEY_1(0x20); EXPAND_KEY_2(0x20); EXPAND_KEY_1(0x40); + rkeys[i++] = t1; } -#define NVDECLx(a) __m128i nv##a - -/* Step 1 : loading and incrementing the nonce */ -#define NVx(a) \ - nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \ - n[3]++ - -#define TEMPDECLx(a) __m128i temp##a - -/* Step 2 : define value in round one (xor with subkey #0, aka key) */ -#define TEMPx(a) temp##a = _mm_xor_si128(nv##a, rkeys[0]) - -/* Step 3: one round of AES */ -#define AESENCx(a, roundctr) temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr]) -#define AESENCx1(a) AESENCx(a, 1) -#define AESENCx2(a) AESENCx(a, 2) -#define AESENCx3(a) AESENCx(a, 3) -#define AESENCx4(a) AESENCx(a, 4) -#define AESENCx5(a) AESENCx(a, 5) -#define AESENCx6(a) AESENCx(a, 6) -#define AESENCx7(a) AESENCx(a, 7) -#define AESENCx8(a) AESENCx(a, 8) -#define AESENCx9(a) AESENCx(a, 9) -#define AESENCx10(a) AESENCx(a, 10) -#define AESENCx11(a) AESENCx(a, 11) -#define AESENCx12(a) AESENCx(a, 12) -#define AESENCx13(a) AESENCx(a, 13) - -/* Step 4: last round of AES */ -#define AESENCLASTx(a) temp##a = _mm_aesenclast_si128(temp##a, rkeys[14]) - -/* Step 5: store result */ -#define STOREx(a) _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a) - -/* all the MAKE* macros are for automatic explicit unrolling */ -#define MAKE4(X) \ - X(0); \ - X(1); \ - X(2); \ - 
X(3) - -#define MAKE8(X) \ - X(0); \ - X(1); \ - X(2); \ - X(3); \ - X(4); \ - X(5); \ - X(6); \ - X(7) - -#define COUNTER_INC2(N) (N)[3] += 2 +/* Encrypt a single AES block */ static inline void -aesni_encrypt1(unsigned char *out, __m128i nv0, const __m128i *rkeys) +encrypt(const State *st, unsigned char dst[16], const unsigned char src[16]) { - TEMPDECLx(0); + BlockVec t; - TEMPx(0); - AESENCx1(0); - AESENCx2(0); - AESENCx3(0); - AESENCx4(0); - AESENCx5(0); - AESENCx6(0); - AESENCx7(0); - AESENCx8(0); - AESENCx9(0); - AESENCx10(0); - AESENCx11(0); - AESENCx12(0); - AESENCx13(0); - AESENCLASTx(0); - STOREx(0); -} + size_t i; -static inline void -aesni_encrypt8(unsigned char *out, uint32_t *n, const __m128i *rkeys) -{ - const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - MAKE8(NVDECLx); - MAKE8(TEMPDECLx); - - MAKE8(NVx); - MAKE8(TEMPx); - MAKE8(AESENCx1); - MAKE8(AESENCx2); - MAKE8(AESENCx3); - MAKE8(AESENCx4); - MAKE8(AESENCx5); - MAKE8(AESENCx6); - MAKE8(AESENCx7); - MAKE8(AESENCx8); - MAKE8(AESENCx9); - MAKE8(AESENCx10); - MAKE8(AESENCx11); - MAKE8(AESENCx12); - MAKE8(AESENCx13); - MAKE8(AESENCLASTx); - MAKE8(STOREx); -} - -/* all GF(2^128) functions are by the book, meaning this one: - -*/ - -static inline void -addmulreduce(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b) -{ - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - __m128i A, B, C; - __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - __m128i tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18; - __m128i tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; - __m128i tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36; - - if (xlen >= 16) { - A = _mm_loadu_si128((const __m128i *) a); - } else { - CRYPTO_ALIGN(16) unsigned char padded[16]; - unsigned int i; - - memset(padded, 0, 16); - for (i = 0; i < xlen; i++) { - padded[i] = a[i]; - } - A = _mm_load_si128((const __m128i *) padded); + t = XOR128(LOAD128(src), st->rkeys[0]); + for (i = 1; i < ROUNDS; i++) { + t = AES_ENCRYPT(t, st->rkeys[i]); } - A = _mm_shuffle_epi8(A, rev); - B = _mm_loadu_si128((const __m128i *) b); - C = _mm_loadu_si128((const __m128i *) c); - A = _mm_xor_si128(A, C); - tmp3 = _mm_clmulepi64_si128(A, B, 0x00); - tmp4 = _mm_clmulepi64_si128(A, B, 0x10); - tmp5 = _mm_clmulepi64_si128(A, B, 0x01); - tmp6 = _mm_clmulepi64_si128(A, B, 0x11); - tmp10 = _mm_xor_si128(tmp4, tmp5); - tmp13 = _mm_slli_si128(tmp10, 8); - tmp11 = _mm_srli_si128(tmp10, 8); - tmp15 = _mm_xor_si128(tmp3, tmp13); - tmp17 = _mm_xor_si128(tmp6, tmp11); - tmp7 = _mm_srli_epi32(tmp15, 31); - tmp8 = _mm_srli_epi32(tmp17, 31); - tmp16 = _mm_slli_epi32(tmp15, 1); - tmp18 = _mm_slli_epi32(tmp17, 1); - tmp9 = _mm_srli_si128(tmp7, 12); - tmp22 = _mm_slli_si128(tmp8, 4); - tmp25 = _mm_slli_si128(tmp7, 4); - tmp29 = _mm_or_si128(tmp16, tmp25); - tmp19 = _mm_or_si128(tmp18, tmp22); - tmp20 = _mm_or_si128(tmp19, tmp9); - tmp26 = _mm_slli_epi32(tmp29, 31); - tmp23 = _mm_slli_epi32(tmp29, 30); - tmp32 = _mm_slli_epi32(tmp29, 25); - tmp27 = _mm_xor_si128(tmp26, tmp23); - tmp28 = _mm_xor_si128(tmp27, tmp32); - tmp24 = _mm_srli_si128(tmp28, 4); - tmp33 = _mm_slli_si128(tmp28, 12); - tmp30 = _mm_xor_si128(tmp29, tmp33); - tmp2 = _mm_srli_epi32(tmp30, 1); - tmp12 = _mm_srli_epi32(tmp30, 2); - tmp14 = _mm_srli_epi32(tmp30, 7); - tmp34 = _mm_xor_si128(tmp2, tmp12); - tmp35 = _mm_xor_si128(tmp34, tmp14); - tmp36 = _mm_xor_si128(tmp35, tmp24); - tmp31 = 
_mm_xor_si128(tmp30, tmp36); - tmp21 = _mm_xor_si128(tmp20, tmp31); - _mm_storeu_si128((__m128i *) c, tmp21); + t = AES_ENCRYPTLAST(t, st->rkeys[ROUNDS]); + STORE128(dst, t); } -/* pure multiplication, for pre-computing powers of H */ -static inline __m128i -mulv(__m128i A, __m128i B) +/* Encrypt and add a single AES block */ + +static inline void +encrypt_xor_block(const State *st, unsigned char dst[16], const unsigned char src[16], + const BlockVec counter) { - __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00); - __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10); - __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01); - __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11); - __m128i tmp10 = _mm_xor_si128(tmp4, tmp5); - __m128i tmp13 = _mm_slli_si128(tmp10, 8); - __m128i tmp11 = _mm_srli_si128(tmp10, 8); - __m128i tmp15 = _mm_xor_si128(tmp3, tmp13); - __m128i tmp17 = _mm_xor_si128(tmp6, tmp11); - __m128i tmp7 = _mm_srli_epi32(tmp15, 31); - __m128i tmp8 = _mm_srli_epi32(tmp17, 31); - __m128i tmp16 = _mm_slli_epi32(tmp15, 1); - __m128i tmp18 = _mm_slli_epi32(tmp17, 1); - __m128i tmp9 = _mm_srli_si128(tmp7, 12); - __m128i tmp22 = _mm_slli_si128(tmp8, 4); - __m128i tmp25 = _mm_slli_si128(tmp7, 4); - __m128i tmp29 = _mm_or_si128(tmp16, tmp25); - __m128i tmp19 = _mm_or_si128(tmp18, tmp22); - __m128i tmp20 = _mm_or_si128(tmp19, tmp9); - __m128i tmp26 = _mm_slli_epi32(tmp29, 31); - __m128i tmp23 = _mm_slli_epi32(tmp29, 30); - __m128i tmp32 = _mm_slli_epi32(tmp29, 25); - __m128i tmp27 = _mm_xor_si128(tmp26, tmp23); - __m128i tmp28 = _mm_xor_si128(tmp27, tmp32); - __m128i tmp24 = _mm_srli_si128(tmp28, 4); - __m128i tmp33 = _mm_slli_si128(tmp28, 12); - __m128i tmp30 = _mm_xor_si128(tmp29, tmp33); - __m128i tmp2 = _mm_srli_epi32(tmp30, 1); - __m128i tmp12 = _mm_srli_epi32(tmp30, 2); - __m128i tmp14 = _mm_srli_epi32(tmp30, 7); - __m128i tmp34 = _mm_xor_si128(tmp2, tmp12); - __m128i tmp35 = _mm_xor_si128(tmp34, tmp14); - __m128i tmp36 = _mm_xor_si128(tmp35, tmp24); - __m128i tmp31 = _mm_xor_si128(tmp30, tmp36); - __m128i C = _mm_xor_si128(tmp20, tmp31); + BlockVec ts; + size_t i; - return C; + ts = XOR128(counter, st->rkeys[0]); + for (i = 1; i < ROUNDS; i++) { + ts = AES_ENCRYPT(ts, st->rkeys[i]); + } + ts = AES_ENCRYPTLAST(ts, st->rkeys[i]); + ts = XOR128(ts, LOAD128(src)); + STORE128(dst, ts); } -#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B -#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev) -#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00) -#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11) -#define RED_MUL_MID(a) \ - tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \ - tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \ - tmp##a = _mm_xor_si128(tmp##a, H##a); \ - tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \ - tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00) +/* Encrypt and add PARALLEL_BLOCKS AES blocks */ -/* 4 multiply-accumulate at once; again - - for the Aggregated Reduction Method & sample code. 
- Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */ -#define ADDMULREDUCE4(H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \ - do { \ - __m128i H0 = H0_, H1 = H1_, H2 = H2_, H3 = H3_; \ - __m128i X0 = X0_, X1 = X1_, X2 = X2_, X3 = X3_; \ - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ - __m128i lo, hi; \ - __m128i tmp8, tmp9; \ - MAKE4(RED_DECL); \ - \ - /* byte-revert the inputs & xor the first one into the accumulator */ \ - MAKE4(RED_SHUFFLE); \ - X3 = _mm_xor_si128(X3, accv); \ - \ - /* 4 low H*X (x0*h0) */ \ - MAKE4(RED_MUL_LOW); \ - lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \ - lo = _mm_xor_si128(lo, H2_X2_lo); \ - lo = _mm_xor_si128(lo, H3_X3_lo); \ - \ - /* 4 high H*X (x1*h1) */ \ - MAKE4(RED_MUL_HIGH); \ - hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \ - hi = _mm_xor_si128(hi, H2_X2_hi); \ - hi = _mm_xor_si128(hi, H3_X3_hi); \ - \ - /* 4 middle H*X, using Karatsuba, i.e. \ - x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \ - we already have all x1y1 & x0y0 (accumulated in hi & lo) \ - (0 is low half and 1 is high half) \ - */ \ - /* permute the high and low 64 bits in H1 & X1, \ - so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \ - then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \ - and finally multiply \ - */ \ - MAKE4(RED_MUL_MID); \ - \ - /* subtracts x1*h1 and x0*h0 */ \ - tmp0 = _mm_xor_si128(tmp0, lo); \ - tmp0 = _mm_xor_si128(tmp0, hi); \ - tmp0 = _mm_xor_si128(tmp1, tmp0); \ - tmp0 = _mm_xor_si128(tmp2, tmp0); \ - tmp0 = _mm_xor_si128(tmp3, tmp0); \ - \ - /* reduction */ \ - tmp0B = _mm_slli_si128(tmp0, 8); \ - tmp0 = _mm_srli_si128(tmp0, 8); \ - lo = _mm_xor_si128(tmp0B, lo); \ - hi = _mm_xor_si128(tmp0, hi); \ - tmp3 = lo; \ - tmp2B = hi; \ - tmp3B = _mm_srli_epi32(tmp3, 31); \ - tmp8 = _mm_srli_epi32(tmp2B, 31); \ - tmp3 = _mm_slli_epi32(tmp3, 1); \ - tmp2B = _mm_slli_epi32(tmp2B, 1); \ - tmp9 = _mm_srli_si128(tmp3B, 12); \ - tmp8 = _mm_slli_si128(tmp8, 4); \ - tmp3B = _mm_slli_si128(tmp3B, 4); \ - tmp3 = _mm_or_si128(tmp3, tmp3B); \ - tmp2B = _mm_or_si128(tmp2B, tmp8); \ - tmp2B = _mm_or_si128(tmp2B, tmp9); \ - tmp3B = _mm_slli_epi32(tmp3, 31); \ - tmp8 = _mm_slli_epi32(tmp3, 30); \ - tmp9 = _mm_slli_epi32(tmp3, 25); \ - tmp3B = _mm_xor_si128(tmp3B, tmp8); \ - tmp3B = _mm_xor_si128(tmp3B, tmp9); \ - tmp8 = _mm_srli_si128(tmp3B, 4); \ - tmp3B = _mm_slli_si128(tmp3B, 12); \ - tmp3 = _mm_xor_si128(tmp3, tmp3B); \ - tmp2 = _mm_srli_epi32(tmp3, 1); \ - tmp0B = _mm_srli_epi32(tmp3, 2); \ - tmp1B = _mm_srli_epi32(tmp3, 7); \ - tmp2 = _mm_xor_si128(tmp2, tmp0B); \ - tmp2 = _mm_xor_si128(tmp2, tmp1B); \ - tmp2 = _mm_xor_si128(tmp2, tmp8); \ - tmp3 = _mm_xor_si128(tmp3, tmp2); \ - tmp2B = _mm_xor_si128(tmp2B, tmp3); \ - accv = tmp2B; \ - } while (0) +static inline void +encrypt_xor_wide(const State *st, unsigned char dst[16 * PARALLEL_BLOCKS], + const unsigned char src[16 * PARALLEL_BLOCKS], + const BlockVec counters[PARALLEL_BLOCKS]) +{ + BlockVec ts[PARALLEL_BLOCKS]; + size_t i, j; -#define ADDMULREDUCE8(H0_, H1_, H2_, H3_, H4_, H5_, H6_, H7_, X0_, X1_, X2_, X3_, X4_, X5_, X6_, \ - X7_, accv) \ - do { \ - __m128i H0 = H0_, H1 = H1_, H2 = H2_, H3 = H3_, H4 = H4_, H5 = H5_, H6 = H6_, H7 = H7_; \ - __m128i X0 = X0_, X1 = X1_, X2 = X2_, X3 = X3_, X4 = X4_, X5 = X5_, X6 = X6_, X7 = X7_; \ - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ - __m128i lo, hi; \ - __m128i tmp8, tmp9; \ - MAKE8(RED_DECL); \ - \ - /* byte-revert the inputs & xor the first one into the accumulator */ \ - 
MAKE8(RED_SHUFFLE); \ - X7 = _mm_xor_si128(X7, accv); \ - \ - /* 8 low H*X (x0*h0) */ \ - MAKE8(RED_MUL_LOW); \ - lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \ - lo = _mm_xor_si128(lo, H2_X2_lo); \ - lo = _mm_xor_si128(lo, H3_X3_lo); \ - lo = _mm_xor_si128(lo, H4_X4_lo); \ - lo = _mm_xor_si128(lo, H5_X5_lo); \ - lo = _mm_xor_si128(lo, H6_X6_lo); \ - lo = _mm_xor_si128(lo, H7_X7_lo); \ - \ - /* 8 high H*X (x1*h1) */ \ - MAKE8(RED_MUL_HIGH); \ - hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \ - hi = _mm_xor_si128(hi, H2_X2_hi); \ - hi = _mm_xor_si128(hi, H3_X3_hi); \ - hi = _mm_xor_si128(hi, H4_X4_hi); \ - hi = _mm_xor_si128(hi, H5_X5_hi); \ - hi = _mm_xor_si128(hi, H6_X6_hi); \ - hi = _mm_xor_si128(hi, H7_X7_hi); \ - \ - /* 8 middle H*X, using Karatsuba, i.e. \ - x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \ - we already have all x1y1 & x0y0 (accumulated in hi & lo) \ - (0 is low half and 1 is high half) \ - */ \ - /* permute the high and low 64 bits in H1 & X1, \ - so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \ - then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \ - and finally multiply \ - */ \ - MAKE8(RED_MUL_MID); \ - \ - /* subtracts x1*h1 and x0*h0 */ \ - tmp0 = _mm_xor_si128(tmp0, lo); \ - tmp0 = _mm_xor_si128(tmp0, hi); \ - tmp0 = _mm_xor_si128(tmp1, tmp0); \ - tmp0 = _mm_xor_si128(tmp2, tmp0); \ - tmp0 = _mm_xor_si128(tmp3, tmp0); \ - tmp0 = _mm_xor_si128(tmp4, tmp0); \ - tmp0 = _mm_xor_si128(tmp5, tmp0); \ - tmp0 = _mm_xor_si128(tmp6, tmp0); \ - tmp0 = _mm_xor_si128(tmp7, tmp0); \ - \ - /* reduction */ \ - tmp0B = _mm_slli_si128(tmp0, 8); \ - tmp0 = _mm_srli_si128(tmp0, 8); \ - lo = _mm_xor_si128(tmp0B, lo); \ - hi = _mm_xor_si128(tmp0, hi); \ - tmp3 = lo; \ - tmp2B = hi; \ - tmp3B = _mm_srli_epi32(tmp3, 31); \ - tmp8 = _mm_srli_epi32(tmp2B, 31); \ - tmp3 = _mm_slli_epi32(tmp3, 1); \ - tmp2B = _mm_slli_epi32(tmp2B, 1); \ - tmp9 = _mm_srli_si128(tmp3B, 12); \ - tmp8 = _mm_slli_si128(tmp8, 4); \ - tmp3B = _mm_slli_si128(tmp3B, 4); \ - tmp3 = _mm_or_si128(tmp3, tmp3B); \ - tmp2B = _mm_or_si128(tmp2B, tmp8); \ - tmp2B = _mm_or_si128(tmp2B, tmp9); \ - tmp3B = _mm_slli_epi32(tmp3, 31); \ - tmp8 = _mm_slli_epi32(tmp3, 30); \ - tmp9 = _mm_slli_epi32(tmp3, 25); \ - tmp3B = _mm_xor_si128(tmp3B, tmp8); \ - tmp3B = _mm_xor_si128(tmp3B, tmp9); \ - tmp8 = _mm_srli_si128(tmp3B, 4); \ - tmp3B = _mm_slli_si128(tmp3B, 12); \ - tmp3 = _mm_xor_si128(tmp3, tmp3B); \ - tmp2 = _mm_srli_epi32(tmp3, 1); \ - tmp0B = _mm_srli_epi32(tmp3, 2); \ - tmp1B = _mm_srli_epi32(tmp3, 7); \ - tmp2 = _mm_xor_si128(tmp2, tmp0B); \ - tmp2 = _mm_xor_si128(tmp2, tmp1B); \ - tmp2 = _mm_xor_si128(tmp2, tmp8); \ - tmp3 = _mm_xor_si128(tmp3, tmp2); \ - tmp2B = _mm_xor_si128(tmp2B, tmp3); \ - accv = tmp2B; \ - } while (0) + for (j = 0; j < PARALLEL_BLOCKS; j++) { + ts[j] = XOR128(counters[j], st->rkeys[0]); + } + for (i = 1; i < ROUNDS; i++) { + for (j = 0; j < PARALLEL_BLOCKS; j++) { + ts[j] = AES_ENCRYPT(ts[j], st->rkeys[i]); + } + } + for (j = 0; j < PARALLEL_BLOCKS; j++) { + ts[j] = AES_ENCRYPTLAST(ts[j], st->rkeys[i]); + ts[j] = XOR128(ts[j], LOAD128(&src[16 * j])); + } + for (j = 0; j < PARALLEL_BLOCKS; j++) { + STORE128(&dst[16 * j], ts[j]); + } +} -#define INDECLx(a) __m128i in##a +/* Square a field element */ -#define XORx(a) temp##a = _mm_xor_si128(temp##a, _mm_loadu_si128((const __m128i *) (in + a * 16))) +static inline I256 +clsq128(const BlockVec x) +{ + const BlockVec x_hi = BYTESHR128(x, 8); + const BlockVec mid = XOR128(x, x_hi); + const BlockVec r_lo = CLMULLO128(x, x); + const BlockVec r_hi = CLMULHI128(x, x); + 
const BlockVec r_mid = XOR128(CLMULLO128(mid, mid), XOR128(r_lo, r_hi)); -#define INXORx(a) \ - temp##a = _mm_xor_si128(temp##a, (in##a = _mm_loadu_si128((const __m128i *) (in + a * 16)))) + return (I256) { + .hi = r_hi, + .lo = r_lo, + .mid = r_mid, + }; +} -#define LOADx(a) __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16)) +/* Multiply two field elements -- Textbook multiplication is faster than Karatsuba on some recent + * CPUs */ -/* full encrypt & checksum 8 blocks at once */ -#define ENCRYPT8FULL(out_, n_, rkeys, in_, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v) \ - do { \ - const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - const unsigned char *in = in_; \ - unsigned char * out = out_; \ - unsigned int * n = n_; \ - __m128i accv = _mm_loadu_si128((const __m128i *) accum); \ - MAKE8(NVDECLx); \ - MAKE8(TEMPDECLx); \ - \ - MAKE8(NVx); \ - MAKE8(TEMPx); \ - MAKE8(AESENCx1); \ - MAKE8(AESENCx2); \ - MAKE8(AESENCx3); \ - MAKE8(AESENCx4); \ - MAKE8(AESENCx5); \ - MAKE8(AESENCx6); \ - MAKE8(AESENCx7); \ - MAKE8(AESENCx8); \ - MAKE8(AESENCx9); \ - MAKE8(AESENCx10); \ - MAKE8(AESENCx11); \ - MAKE8(AESENCx12); \ - MAKE8(AESENCx13); \ - MAKE8(AESENCLASTx); \ - MAKE8(XORx); \ - MAKE8(STOREx); \ - ADDMULREDUCE8(Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, temp7, temp6, temp5, temp4, temp3, \ - temp2, temp1, temp0, accv); \ - _mm_storeu_si128((__m128i *) accum, accv); \ - } while (0) +static inline I256 +clmul128(const BlockVec x, const BlockVec y) +{ +#ifdef USE_KARATSUBA_MULTIPLICATION + const BlockVec x_hi = BYTESHR128(x, 8); + const BlockVec y_hi = BYTESHR128(y, 8); + const BlockVec r_lo = CLMULLO128(x, y); + const BlockVec r_hi = CLMULHI128(x, y); + const BlockVec r_mid = XOR128(CLMULLO128(XOR128(x, x_hi), XOR128(y, y_hi)), XOR128(r_lo, r_hi)); -/* full decrypt & checksum 8 blocks at once */ -#define DECRYPT8FULL(out_, n_, rkeys, in_, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v) \ - do { \ - const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - const unsigned char *in = in_; \ - unsigned char * out = out_; \ - unsigned int * n = n_; \ - __m128i accv = _mm_loadu_si128((const __m128i *) accum); \ - MAKE8(INDECLx); \ - MAKE8(NVDECLx); \ - MAKE8(TEMPDECLx); \ - \ - MAKE8(NVx); \ - MAKE8(TEMPx); \ - MAKE8(AESENCx1); \ - MAKE8(AESENCx2); \ - MAKE8(AESENCx3); \ - MAKE8(AESENCx4); \ - MAKE8(AESENCx5); \ - MAKE8(AESENCx6); \ - MAKE8(AESENCx7); \ - MAKE8(AESENCx8); \ - MAKE8(AESENCx9); \ - MAKE8(AESENCx10); \ - MAKE8(AESENCx11); \ - MAKE8(AESENCx12); \ - MAKE8(AESENCx13); \ - MAKE8(AESENCLASTx); \ - MAKE8(INXORx); \ - ADDMULREDUCE8(Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, in7, in6, in5, in4, in3, in2, in1, \ - in0, accv); \ - _mm_storeu_si128((__m128i *) accum, accv); \ - MAKE8(STOREx); \ - } while (0) + return (I256) { + .hi = r_hi, + .lo = r_lo, + .mid = r_mid, + }; +#else + const BlockVec r_hi = CLMULHI128(x, y); + const BlockVec r_lo = CLMULLO128(x, y); + const BlockVec r_mid = XOR128(CLMULHILO128(x, y), CLMULLOHI128(x, y)); + + return (I256) { + .hi = r_hi, + .lo = r_lo, + .mid = r_mid, + }; +#endif +} + +/* Merge the middle word and reduce a field element */ + +static inline BlockVec +gcm_reduce(const I256 x) +{ + const BlockVec hi = XOR128(x.hi, BYTESHR128(x.mid, 8)); + const BlockVec lo = XOR128(x.lo, BYTESHL128(x.mid, 8)); + + const BlockVec p64 = SET64x2(0, 0xc200000000000000); + const BlockVec a = CLMULLO128(lo, p64); + const BlockVec b = XOR128(SHUFFLE32x4(lo, 2, 3, 0, 1), a); + const BlockVec c = 
CLMULLO128(b, p64); + const BlockVec d = XOR128(SHUFFLE32x4(b, 2, 3, 0, 1), c); + + return XOR128(d, hi); +} + +/* Precompute powers of H from `from` to `to` */ + +static inline void +precomp(Precomp hx[PC_COUNT], const size_t from, const size_t to) +{ + const Precomp h = hx[0]; + size_t i; + + for (i = from & ~1U; i < to; i += 2) { + hx[i] = gcm_reduce(clmul128(hx[i - 1], h)); + hx[i + 1] = gcm_reduce(clsq128(hx[i / 2])); + } +} + +/* Precompute powers of H given a key and a block count */ + +static void +precomp_for_block_count(Precomp hx[PC_COUNT], const unsigned char gh_key[16], + const size_t block_count) +{ + const BlockVec h0 = REV128(LOAD128(gh_key)); + BlockVec carry = SET64x2(0xc200000000000000, 1); + BlockVec mask = SUB64x2(ZERO128, SHR64x2(h0, 63)); + BlockVec h0_shifted; + BlockVec h; + + mask = SHUFFLE32x4(mask, 3, 3, 3, 3); + carry = AND128(carry, mask); + h0_shifted = SHL128(h0, 1); + h = XOR128(h0_shifted, carry); + + hx[0] = h; + hx[1] = gcm_reduce(clsq128(hx[0])); + + if (block_count >= PC_COUNT) { + precomp(hx, 2, PC_COUNT); + } else { + precomp(hx, 2, block_count); + } +} + +/* Initialize a GHash */ + +static inline void +gh_init(GHash *sth) +{ + sth->acc = ZERO128; +} + +/* Absorb ad_len bytes of associated data. There has to be no partial block. */ + +static inline void +gh_ad_blocks(const State *st, GHash *sth, const unsigned char *ad, size_t ad_len) +{ + size_t i; + + i = (size_t) 0U; + for (; i + PC_COUNT * 16 <= ad_len; i += PC_COUNT * 16) { + BlockVec m = REV128(LOAD128(ad + i)); + I256 u = clmul128(XOR128(sth->acc, m), st->hx[PC_COUNT - 1 - 0]); + size_t j; + + for (j = 1; j < PC_COUNT; j += 1) { + m = REV128(LOAD128(ad + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PC_COUNT - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + for (; i + PC_COUNT * 16 / 2 <= ad_len; i += PC_COUNT * 16 / 2) { + BlockVec m = REV128(LOAD128(ad + i)); + I256 u = clmul128(XOR128(sth->acc, m), st->hx[PC_COUNT / 2 - 1 - 0]); + size_t j; + + for (j = 1; j < PC_COUNT / 2; j += 1) { + m = REV128(LOAD128(ad + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PC_COUNT / 2 - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + for (; i + 4 * 16 <= ad_len; i += 4 * 16) { + BlockVec m = REV128(LOAD128(ad + i)); + I256 u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]); + size_t j; + + for (j = 1; j < 4; j += 1) { + m = REV128(LOAD128(ad + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[4 - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + for (; i + 2 * 16 <= ad_len; i += 2 * 16) { + BlockVec m = REV128(LOAD128(ad + i)); + I256 u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]); + size_t j; + + for (j = 1; j < 2; j += 1) { + m = REV128(LOAD128(ad + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[2 - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + if (i < ad_len) { + const size_t n = (ad_len - i) / 16; + BlockVec m = REV128(LOAD128(ad + i)); + I256 u = clmul128(XOR128(sth->acc, m), st->hx[n - 1 - 0]); + size_t j; + + for (j = 1; j < n; j += 1) { + m = REV128(LOAD128(ad + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[n - 1 - j])); + } + i += n * 16; + sth->acc = gcm_reduce(u); + } +} + +/* Increment counters */ + +static inline BlockVec +incr_counters(BlockVec rev_counters[], BlockVec counter, const size_t n) +{ + size_t i; + + const BlockVec one = ONE128; + for (i = 0; i < n; i++) { + rev_counters[i] = REV128(counter); + counter = ADD64x2(counter, one); + } + return counter; +} + +/* Compute the number of required blocks to encrypt and authenticate `ad_len` of associated data, + * and 
`m_len` of encrypted bytes. Return `0` if limits would be exceeded.*/ + +static inline size_t +required_blocks(const size_t ad_len, const size_t m_len) +{ + const size_t ad_blocks = (ad_len + 15) / 16; + const size_t m_blocks = (m_len + 15) / 16; + + if (ad_len > SIZE_MAX - 2 * PARALLEL_BLOCKS * 16 || + m_len > SIZE_MAX - 2 * PARALLEL_BLOCKS * 16 || ad_len < ad_blocks || m_len < m_blocks || + m_blocks >= (1ULL << 32) - 2) { + return 0; + } + return ad_blocks + m_blocks + 1; +} + +/* Generic AES-GCM encryption. "Generic" as it can handle arbitrary input sizes, +unlike a length-limited version that would precompute all the required powers of H */ + +static void +aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES], unsigned char *dst, + const unsigned char *src, size_t src_len, const unsigned char *ad, + size_t ad_len, unsigned char counter_[16]) +{ + CRYPTO_ALIGN(32) I256 u; + CRYPTO_ALIGN(16) unsigned char last_blocks[2 * 16]; + const BlockVec one = ONE128; + BlockVec final_block; + BlockVec rev_counters[PARALLEL_BLOCKS]; + BlockVec m; + BlockVec counter; + size_t i; + size_t j; + size_t left; + size_t pi; + + COMPILER_ASSERT(PC_COUNT % PARALLEL_BLOCKS == 0); + + /* Associated data */ + + if (ad != NULL && ad_len != 0) { + gh_ad_blocks(st, sth, ad, ad_len & ~15); + left = ad_len & 15; + if (left != 0) { + unsigned char pad[16]; + + memset(pad, 0, sizeof pad); + memcpy(pad, ad + ad_len - left, left); + gh_ad_blocks(st, sth, pad, sizeof pad); + } + } + + /* Encrypted data */ + + counter = REV128(LOAD128(counter_)); + i = 0; + + /* 2*PARALLEL_BLOCKS aggregation */ + + if (src_len - i >= 2 * PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + i += PARALLEL_BLOCKS * 16; + + for (; i + 2 * PARALLEL_BLOCKS * 16 <= src_len; i += 2 * PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + + pi = i - PARALLEL_BLOCKS * 16; + m = REV128(LOAD128(dst + pi)); + u = clmul128(XOR128(sth->acc, m), st->hx[2 * PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(dst + pi + j * 16)); + u = XOR256(u, clmul128(m, st->hx[2 * PARALLEL_BLOCKS - 1 - j])); + } + + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16, + rev_counters); + + pi = i; + for (j = 0; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(dst + pi + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + pi = i - PARALLEL_BLOCKS * 16; + m = REV128(LOAD128(dst + pi)); + u = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(dst + pi + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + /* PARALLEL_BLOCKS aggregation */ + + if (src_len - i >= PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + i += PARALLEL_BLOCKS * 16; + + for (; i + PARALLEL_BLOCKS * 16 <= src_len; i += PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + + pi = i - PARALLEL_BLOCKS * 16; + m = REV128(LOAD128(dst + pi)); + u = 
clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(dst + pi + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + pi = i - PARALLEL_BLOCKS * 16; + m = REV128(LOAD128(dst + pi)); + u = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(dst + pi + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + /* 4-blocks aggregation */ + + for (; i + 4 * 16 <= src_len; i += 4 * 16) { + counter = incr_counters(rev_counters, counter, 4); + for (j = 0; j < 4; j++) { + encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]); + } + + m = REV128(LOAD128(dst + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]); + for (j = 1; j < 4; j += 1) { + m = REV128(LOAD128(dst + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[4 - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + /* 2-blocks aggregation */ + + for (; i + 2 * 16 <= src_len; i += 2 * 16) { + counter = incr_counters(rev_counters, counter, 2); + for (j = 0; j < 2; j++) { + encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]); + } + + m = REV128(LOAD128(dst + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]); + for (j = 1; j < 2; j += 1) { + m = REV128(LOAD128(dst + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[2 - 1 - j])); + } + sth->acc = gcm_reduce(u); + } + + /* Remaining *partial* blocks; if we have 16 bytes left, we want to keep the + full block authenticated along with the final block, hence < and not <= */ + + for (; i + 16 < src_len; i += 16) { + encrypt_xor_block(st, dst + i, src + i, REV128(counter)); + m = REV128(LOAD128(dst + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[1 - 1 - 0]); + sth->acc = gcm_reduce(u); + counter = ADD64x2(counter, one); + } + + /* Authenticate both the last block of the message and the final block */ + + final_block = REV128(SET64x2(ad_len * 8, src_len * 8)); + STORE32_BE(counter_ + NPUBBYTES, 1); + encrypt(st, mac, counter_); + left = src_len - i; + if (left != 0) { + for (j = 0; j < left; j++) { + last_blocks[j] = src[i + j]; + } + STORE128(last_blocks + 16, final_block); + encrypt_xor_block(st, last_blocks, last_blocks, REV128(counter)); + for (; j < 16; j++) { + last_blocks[j] = 0; + } + for (j = 0; j < left; j++) { + dst[i + j] = last_blocks[j]; + } + gh_ad_blocks(st, sth, last_blocks, 32); + } else { + STORE128(last_blocks, final_block); + gh_ad_blocks(st, sth, last_blocks, 16); + } + STORE128(mac, XOR128(LOAD128(mac), REV128(sth->acc))); +} + +/* Generic AES-GCM decryption. 
"Generic" as it can handle arbitrary input sizes, +unlike a length-limited version that would precompute all the required powers of H */ + +static void +aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES], unsigned char *dst, + const unsigned char *src, size_t src_len, const unsigned char *ad, + size_t ad_len, unsigned char counter_[16]) +{ + CRYPTO_ALIGN(32) I256 u; + CRYPTO_ALIGN(16) unsigned char last_blocks[2 * 16]; + const BlockVec one = ONE128; + BlockVec final_block; + BlockVec rev_counters[PARALLEL_BLOCKS]; + BlockVec m; + BlockVec counter; + size_t i; + size_t j; + size_t left; + + COMPILER_ASSERT(PC_COUNT % PARALLEL_BLOCKS == 0); + + /* Associated data */ + + if (ad != NULL && ad_len != 0) { + gh_ad_blocks(st, sth, ad, ad_len & ~15); + left = ad_len & 15; + if (left != 0) { + unsigned char pad[16]; + + memset(pad, 0, sizeof pad); + memcpy(pad, ad + ad_len - left, left); + gh_ad_blocks(st, sth, pad, sizeof pad); + } + } + + /* Encrypted data */ + + counter = REV128(LOAD128(counter_)); + i = 0; + + /* 2*PARALLEL_BLOCKS aggregation */ + + for (; i + 2 * PARALLEL_BLOCKS * 16 <= src_len; i += 2 * PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + + m = REV128(LOAD128(src + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[2 * PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(src + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[2 * PARALLEL_BLOCKS - 1 - j])); + } + + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + + for (j = 0; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(src + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + + encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16, + rev_counters); + } + + /* PARALLEL_BLOCKS aggregation */ + + for (; i + PARALLEL_BLOCKS * 16 <= src_len; i += PARALLEL_BLOCKS * 16) { + counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); + + m = REV128(LOAD128(src + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]); + for (j = 1; j < PARALLEL_BLOCKS; j += 1) { + m = REV128(LOAD128(src + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j])); + } + sth->acc = gcm_reduce(u); + + encrypt_xor_wide(st, dst + i, src + i, rev_counters); + } + + /* 4-blocks aggregation */ + + for (; i + 4 * 16 <= src_len; i += 4 * 16) { + counter = incr_counters(rev_counters, counter, 4); + + m = REV128(LOAD128(src + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]); + for (j = 1; j < 4; j += 1) { + m = REV128(LOAD128(src + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[4 - 1 - j])); + } + sth->acc = gcm_reduce(u); + + for (j = 0; j < 4; j++) { + encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]); + } + } + + /* 2-blocks aggregation */ + + for (; i + 2 * 16 <= src_len; i += 2 * 16) { + counter = incr_counters(rev_counters, counter, 2); + + m = REV128(LOAD128(src + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]); + for (j = 1; j < 2; j += 1) { + m = REV128(LOAD128(src + i + j * 16)); + u = XOR256(u, clmul128(m, st->hx[2 - 1 - j])); + } + sth->acc = gcm_reduce(u); + + for (j = 0; j < 2; j++) { + encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]); + } + } + + /* Remaining *partial* blocks; if we have 16 bytes left, we want to keep the + full block authenticated along with 
the final block, hence < and not <= */ + + for (; i + 16 < src_len; i += 16) { + m = REV128(LOAD128(src + i)); + u = clmul128(XOR128(sth->acc, m), st->hx[1 - 1 - 0]); + sth->acc = gcm_reduce(u); + encrypt_xor_block(st, dst + i, src + i, REV128(counter)); + counter = ADD64x2(counter, one); + } + + /* Authenticate both the last block of the message and the final block */ + + final_block = REV128(SET64x2(ad_len * 8, src_len * 8)); + STORE32_BE(counter_ + NPUBBYTES, 1); + encrypt(st, mac, counter_); + left = src_len - i; + if (left != 0) { + for (j = 0; j < left; j++) { + last_blocks[j] = src[i + j]; + } + for (; j < 16; j++) { + last_blocks[j] = 0; + } + STORE128(last_blocks + 16, final_block); + gh_ad_blocks(st, sth, last_blocks, 32); + encrypt_xor_block(st, last_blocks, last_blocks, REV128(counter)); + for (j = 0; j < left; j++) { + dst[i + j] = last_blocks[j]; + } + } else { + STORE128(last_blocks, final_block); + gh_ad_blocks(st, sth, last_blocks, 16); + } + STORE128(mac, XOR128(LOAD128(mac), REV128(sth->acc))); +} int -crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_, const unsigned char *k) +crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *st_, const unsigned char *k) { - aes256gcm_state *ctx = (aes256gcm_state *) (void *) ctx_; - unsigned char *H = ctx->H; - __m128i *rkeys = ctx->rkeys; - const __m128i zero = _mm_setzero_si128(); + State *st = (State *) (void *) st_; + CRYPTO_ALIGN(16) unsigned char h[16]; - COMPILER_ASSERT((sizeof *ctx_) >= (sizeof *ctx)); - aesni_key256_expand(k, rkeys); - aesni_encrypt1(H, zero, rkeys); + expand256(k, st->rkeys); + memset(h, 0, sizeof h); + encrypt(st, h, h); + + precomp_for_block_count(st->hx, h, PC_COUNT); return 0; } @@ -596,265 +840,189 @@ crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_, const unsigned int crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *mac, unsigned long long *maclen_p, const unsigned char *m, - unsigned long long mlen, const unsigned char *ad, - unsigned long long adlen, const unsigned char *nsec, - const unsigned char * npub, - const crypto_aead_aes256gcm_state *ctx_) + unsigned long long m_len, const unsigned char *ad, + unsigned long long ad_len, const unsigned char *nsec, + const unsigned char *npub, + const crypto_aead_aes256gcm_state *st_) { - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_; - const __m128i * rkeys = ctx->rkeys; - __m128i Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, accv; - unsigned long long i, j; - unsigned long long adlen_rnd64 = adlen & ~63ULL; - unsigned long long mlen_rnd128 = mlen & ~127ULL; - CRYPTO_ALIGN(16) uint32_t n2[4]; - CRYPTO_ALIGN(16) unsigned char H[16]; - CRYPTO_ALIGN(16) unsigned char T[16]; - CRYPTO_ALIGN(16) unsigned char accum[16]; - CRYPTO_ALIGN(16) unsigned char fb[16]; + const State *st = (const State *) (const void *) st_; + GHash sth; + CRYPTO_ALIGN(16) unsigned char j[16]; + size_t gh_required_blocks; (void) nsec; - memcpy(H, ctx->H, sizeof H); - if (mlen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) { - sodium_misuse(); /* LCOV_EXCL_LINE */ - } - memcpy(&n2[0], npub, 3 * 4); - n2[3] = 0x01000000; - aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys); - { - uint64_t x; - x = _bswap64((uint64_t)(8 * adlen)); - memcpy(&fb[0], &x, sizeof x); - x = _bswap64((uint64_t)(8 * mlen)); - memcpy(&fb[8], &x, sizeof x); - } - /* we store H (and it's power) byte-reverted once and for all */ - Hv = 
_mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev); - _mm_store_si128((__m128i *) H, Hv); - H2v = mulv(Hv, Hv); - H3v = mulv(H2v, Hv); - H4v = mulv(H3v, Hv); - H5v = mulv(H4v, Hv); - H6v = mulv(H5v, Hv); - H7v = mulv(H6v, Hv); - H8v = mulv(H7v, Hv); - - accv = _mm_setzero_si128(); - for (i = 0; i < adlen_rnd64; i += 64) { - __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0)); - __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16)); - __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32)); - __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48)); - ADDMULREDUCE4(Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv); - } - _mm_store_si128((__m128i *) accum, accv); - - /* GCM remainder loop */ - for (i = adlen_rnd64; i < adlen; i += 16) { - unsigned int blocklen = 16; - - if (i + (unsigned long long) blocklen > adlen) { - blocklen = (unsigned int) (adlen - i); - } - addmulreduce(accum, ad + i, blocklen, H); - } - - /* this only does 8 full blocks, so no fancy bounds checking is necessary*/ -#define LOOPRND128 \ - do { \ - const int iter = 8; \ - const int lb = iter * 16; \ - \ - for (i = 0; i < mlen_rnd128; i += lb) { \ - ENCRYPT8FULL(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v); \ - } \ - } while (0) - - /* remainder loop, with the slower GCM update to accommodate partial blocks */ -#define LOOPRMD128 \ - do { \ - const int iter = 8; \ - const int lb = iter * 16; \ - \ - for (i = mlen_rnd128; i < mlen; i += lb) { \ - CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \ - unsigned long long mj = lb; \ - \ - aesni_encrypt8(outni, n2, rkeys); \ - if ((i + mj) >= mlen) { \ - mj = mlen - i; \ - } \ - for (j = 0; j < mj; j++) { \ - c[i + j] = m[i + j] ^ outni[j]; \ - } \ - for (j = 0; j < mj; j += 16) { \ - unsigned int bl = 16; \ - \ - if (j + (unsigned long long) bl >= mj) { \ - bl = (unsigned int) (mj - j); \ - } \ - addmulreduce(accum, c + i + j, bl, H); \ - } \ - } \ - } while (0) - - n2[3] &= 0x00ffffff; - COUNTER_INC2(n2); - LOOPRND128; - LOOPRMD128; - - addmulreduce(accum, fb, 16, H); - - for (i = 0; i < 16; ++i) { - mac[i] = T[i] ^ accum[15 - i]; - } if (maclen_p != NULL) { - *maclen_p = 16; + *maclen_p = 0; + } + gh_required_blocks = required_blocks(ad_len, m_len); + if (gh_required_blocks == 0) { + memset(mac, 0xd0, ABYTES); + memset(c, 0, m_len); + return -1; + } + + gh_init(&sth); + + memcpy(j, npub, NPUBBYTES); + STORE32_BE(j + NPUBBYTES, 2); + + aes_gcm_encrypt_generic(st, &sth, mac, c, m, m_len, ad, ad_len, j); + + if (maclen_p != NULL) { + *maclen_p = ABYTES; } return 0; } +int +crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p, const unsigned char *m, + unsigned long long m_len, const unsigned char *ad, + unsigned long long ad_len, const unsigned char *nsec, + const unsigned char *npub, const unsigned char *k) +{ + const int ret = crypto_aead_aes256gcm_encrypt_detached(c, c + m_len, NULL, m, m_len, ad, ad_len, + nsec, npub, k); + if (clen_p != NULL) { + if (ret == 0) { + *clen_p = m_len + crypto_aead_aes256gcm_ABYTES; + } else { + *clen_p = 0; + } + } + return ret; +} + +int +crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac, + unsigned long long *maclen_p, const unsigned char *m, + unsigned long long m_len, const unsigned char *ad, + unsigned long long ad_len, const unsigned char *nsec, + const unsigned char *npub, const unsigned char *k) +{ + CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st; + int ret; + + PREFETCH_WRITE(c); + PREFETCH_READ(m); + PREFETCH_READ(ad); + + 
crypto_aead_aes256gcm_beforenm(&st, k); + ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c, mac, maclen_p, m, m_len, ad, ad_len, + nsec, npub, &st); + sodium_memzero(&st, sizeof st); + + return ret; +} + int crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p, const unsigned char *m, unsigned long long mlen, const unsigned char *ad, unsigned long long adlen, const unsigned char *nsec, const unsigned char *npub, - const crypto_aead_aes256gcm_state *ctx_) + const crypto_aead_aes256gcm_state *st_) { int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c, c + mlen, NULL, m, mlen, ad, adlen, - nsec, npub, ctx_); + nsec, npub, st_); if (clen_p != NULL) { *clen_p = mlen + crypto_aead_aes256gcm_ABYTES; } return ret; } -int -crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec, - const unsigned char *c, unsigned long long clen, - const unsigned char *mac, const unsigned char *ad, - unsigned long long adlen, const unsigned char *npub, - const crypto_aead_aes256gcm_state *ctx_) +static int +crypto_aead_aes256gcm_verify_mac(unsigned char *nsec, const unsigned char *c, + unsigned long long c_len, const unsigned char *mac, + const unsigned char *ad, unsigned long long ad_len, + const unsigned char *npub, const crypto_aead_aes256gcm_state *st_) { - const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_; - const __m128i * rkeys = ctx->rkeys; - __m128i Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, accv; - unsigned long long i, j; - unsigned long long adlen_rnd64 = adlen & ~63ULL; - unsigned long long mlen; - unsigned long long mlen_rnd128; - CRYPTO_ALIGN(16) uint32_t n2[4]; - CRYPTO_ALIGN(16) unsigned char H[16]; - CRYPTO_ALIGN(16) unsigned char T[16]; - CRYPTO_ALIGN(16) unsigned char accum[16]; - CRYPTO_ALIGN(16) unsigned char fb[16]; + const State *st = (const State *) (const void *) st_; + GHash sth; + BlockVec final_block; + CRYPTO_ALIGN(16) unsigned char j[16]; + CRYPTO_ALIGN(16) unsigned char computed_mac[16]; + CRYPTO_ALIGN(16) unsigned char last_block[16]; + size_t gh_required_blocks; + size_t left; + int ret; (void) nsec; - if (clen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) { - sodium_misuse(); /* LCOV_EXCL_LINE */ - } - mlen = clen; - - memcpy(&n2[0], npub, 3 * 4); - n2[3] = 0x01000000; - aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys); - - { - uint64_t x; - x = _bswap64((uint64_t)(8 * adlen)); - memcpy(&fb[0], &x, sizeof x); - x = _bswap64((uint64_t)(8 * mlen)); - memcpy(&fb[8], &x, sizeof x); + gh_required_blocks = required_blocks(ad_len, c_len); + if (gh_required_blocks == 0) { + return -1; } - memcpy(H, ctx->H, sizeof H); - Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev); - _mm_store_si128((__m128i *) H, Hv); - H2v = mulv(Hv, Hv); - H3v = mulv(H2v, Hv); - H4v = mulv(H3v, Hv); - H5v = mulv(H4v, Hv); - H6v = mulv(H5v, Hv); - H7v = mulv(H6v, Hv); - H8v = mulv(H7v, Hv); - accv = _mm_setzero_si128(); + gh_init(&sth); - for (i = 0; i < adlen_rnd64; i += 64) { - __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0)); - __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16)); - __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32)); - __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48)); - ADDMULREDUCE4(Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv); - } - _mm_store_si128((__m128i *) accum, accv); + memcpy(j, npub, NPUBBYTES); + STORE32_BE(j + NPUBBYTES, 2); - /* GCM 
remainder loop */ - for (i = adlen_rnd64; i < adlen; i += 16) { - unsigned int blocklen = 16; - if (i + (unsigned long long) blocklen > adlen) { - blocklen = (unsigned int) (adlen - i); - } - addmulreduce(accum, ad + i, blocklen, H); + gh_ad_blocks(st, &sth, ad, ad_len & ~15); + left = ad_len & 15; + if (left != 0) { + unsigned char pad[16]; + + memset(pad, 0, sizeof pad); + memcpy(pad, ad + ad_len - left, left); + gh_ad_blocks(st, &sth, pad, sizeof pad); } - mlen_rnd128 = mlen & ~127ULL; + gh_ad_blocks(st, &sth, c, c_len & ~15); + left = c_len & 15; + if (left != 0) { + unsigned char pad[16]; -#define LOOPDRND128 \ - do { \ - const int iter = 8; \ - const int lb = iter * 16; \ - for (i = 0; i < mlen_rnd128; i += lb) { \ - DECRYPT8FULL(m + i, n2, rkeys, c + i, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v); \ - } \ - } while (0) + memset(pad, 0, sizeof pad); + memcpy(pad, c + c_len - left, left); + gh_ad_blocks(st, &sth, pad, sizeof pad); + } + final_block = REV128(SET64x2(ad_len * 8, c_len * 8)); + STORE32_BE(j + NPUBBYTES, 1); + encrypt(st, computed_mac, j); + STORE128(last_block, final_block); + gh_ad_blocks(st, &sth, last_block, 16); + STORE128(computed_mac, XOR128(LOAD128(computed_mac), REV128(sth.acc))); -#define LOOPDRMD128 \ - do { \ - const int iter = 8; \ - const int lb = iter * 16; \ - for (i = mlen_rnd128; i < mlen; i += lb) { \ - CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \ - unsigned long long mj = lb; \ - \ - if ((i + mj) >= mlen) { \ - mj = mlen - i; \ - } \ - for (j = 0; j < mj; j += 16) { \ - unsigned long long bl = 16; \ - if (j + bl >= mj) { \ - bl = mj - j; \ - } \ - addmulreduce(accum, c + i + j, bl, H); \ - } \ - aesni_encrypt8(outni, n2, rkeys); \ - for (j = 0; j < mj; j++) { \ - m[i + j] = c[i + j] ^ outni[j]; \ - } \ - } \ - } while (0) + ret = crypto_verify_16(mac, computed_mac); + sodium_memzero(computed_mac, sizeof computed_mac); - n2[3] = 0U; - COUNTER_INC2(n2); - LOOPDRND128; - LOOPDRMD128; - addmulreduce(accum, fb, 16, H); - { - unsigned char d = 0; + return ret; +} - for (i = 0; i < 16; i++) { - d |= (mac[i] ^ (T[i] ^ accum[15 - i])); - } - if (d != 0) { - if (m != NULL) { - memset(m, 0, mlen); - } - return -1; - } - if (m == NULL) { - return 0; - } +int +crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec, + const unsigned char *c, unsigned long long c_len, + const unsigned char *mac, const unsigned char *ad, + unsigned long long ad_len, const unsigned char *npub, + const crypto_aead_aes256gcm_state *st_) +{ + const State *st = (const State *) (const void *) st_; + GHash sth; + CRYPTO_ALIGN(16) unsigned char j[16]; + unsigned char computed_mac[16]; + size_t gh_required_blocks; + const size_t m_len = c_len; + + if (m == NULL) { + return crypto_aead_aes256gcm_verify_mac(nsec, c, c_len, mac, ad, ad_len, npub, st_); + } + + (void) nsec; + gh_required_blocks = required_blocks(ad_len, m_len); + if (gh_required_blocks == 0) { + return -1; + } + + gh_init(&sth); + + memcpy(j, npub, NPUBBYTES); + STORE32_BE(j + NPUBBYTES, 2); + + aes_gcm_decrypt_generic(st, &sth, computed_mac, m, c, m_len, ad, ad_len, j); + + if (crypto_verify_16(mac, computed_mac) != 0) { + sodium_memzero(computed_mac, sizeof computed_mac); + memset(m, 0xd0, m_len); + return -1; } return 0; } @@ -864,59 +1032,24 @@ crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen unsigned char *nsec, const unsigned char *c, unsigned long long clen, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, - const 
crypto_aead_aes256gcm_state *ctx_) + const crypto_aead_aes256gcm_state *st_) { unsigned long long mlen = 0ULL; int ret = -1; - if (clen >= crypto_aead_aes256gcm_ABYTES) { + if (clen >= ABYTES) { ret = crypto_aead_aes256gcm_decrypt_detached_afternm( - m, nsec, c, clen - crypto_aead_aes256gcm_ABYTES, - c + clen - crypto_aead_aes256gcm_ABYTES, ad, adlen, npub, ctx_); + m, nsec, c, clen - ABYTES, c + clen - ABYTES, ad, adlen, npub, st_); } if (mlen_p != NULL) { if (ret == 0) { - mlen = clen - crypto_aead_aes256gcm_ABYTES; + mlen = clen - ABYTES; } *mlen_p = mlen; } return ret; } -int -crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac, - unsigned long long *maclen_p, const unsigned char *m, - unsigned long long mlen, const unsigned char *ad, - unsigned long long adlen, const unsigned char *nsec, - const unsigned char *npub, const unsigned char *k) -{ - CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx; - - crypto_aead_aes256gcm_beforenm(&ctx, k); - - return crypto_aead_aes256gcm_encrypt_detached_afternm( - c, mac, maclen_p, m, mlen, ad, adlen, nsec, npub, - (const crypto_aead_aes256gcm_state *) &ctx); -} - -int -crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p, const unsigned char *m, - unsigned long long mlen, const unsigned char *ad, - unsigned long long adlen, const unsigned char *nsec, - const unsigned char *npub, const unsigned char *k) -{ - CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx; - int ret; - - crypto_aead_aes256gcm_beforenm(&ctx, k); - - ret = crypto_aead_aes256gcm_encrypt_afternm(c, clen_p, m, mlen, ad, adlen, nsec, npub, - (const crypto_aead_aes256gcm_state *) &ctx); - sodium_memzero(&ctx, sizeof ctx); - - return ret; -} - int crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec, const unsigned char *c, unsigned long long clen, @@ -924,12 +1057,16 @@ crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec, unsigned long long adlen, const unsigned char *npub, const unsigned char *k) { - CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx; + CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st; - crypto_aead_aes256gcm_beforenm(&ctx, k); + PREFETCH_WRITE(m); + PREFETCH_READ(c); + PREFETCH_READ(ad); + + crypto_aead_aes256gcm_beforenm(&st, k); return crypto_aead_aes256gcm_decrypt_detached_afternm( - m, nsec, c, clen, mac, ad, adlen, npub, (const crypto_aead_aes256gcm_state *) &ctx); + m, nsec, c, clen, mac, ad, adlen, npub, (const crypto_aead_aes256gcm_state *) &st); } int @@ -938,14 +1075,18 @@ crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsi const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, const unsigned char *k) { - CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx; + CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st; int ret; - crypto_aead_aes256gcm_beforenm(&ctx, k); + PREFETCH_WRITE(m); + PREFETCH_READ(c); + PREFETCH_READ(ad); + + crypto_aead_aes256gcm_beforenm(&st, k); ret = crypto_aead_aes256gcm_decrypt_afternm(m, mlen_p, nsec, c, clen, ad, adlen, npub, - (const crypto_aead_aes256gcm_state *) &ctx); - sodium_memzero(&ctx, sizeof ctx); + (const crypto_aead_aes256gcm_state *) &st); + sodium_memzero(&st, sizeof st); return ret; } @@ -1005,7 +1146,7 @@ crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsi } int -crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_, const unsigned char *k) +crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *st_, const unsigned char *k) { errno = ENOSYS; 
return -1; @@ -1016,8 +1157,8 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char * unsigned long long *maclen_p, const unsigned char *m, unsigned long long mlen, const unsigned char *ad, unsigned long long adlen, const unsigned char *nsec, - const unsigned char * npub, - const crypto_aead_aes256gcm_state *ctx_) + const unsigned char *npub, + const crypto_aead_aes256gcm_state *st_) { errno = ENOSYS; return -1; @@ -1028,7 +1169,7 @@ crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen const unsigned char *m, unsigned long long mlen, const unsigned char *ad, unsigned long long adlen, const unsigned char *nsec, const unsigned char *npub, - const crypto_aead_aes256gcm_state *ctx_) + const crypto_aead_aes256gcm_state *st_) { errno = ENOSYS; return -1; @@ -1039,7 +1180,7 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char * const unsigned char *c, unsigned long long clen, const unsigned char *mac, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, - const crypto_aead_aes256gcm_state *ctx_) + const crypto_aead_aes256gcm_state *st_) { errno = ENOSYS; return -1; @@ -1050,7 +1191,7 @@ crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen unsigned char *nsec, const unsigned char *c, unsigned long long clen, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, - const crypto_aead_aes256gcm_state *ctx_) + const crypto_aead_aes256gcm_state *st_) { errno = ENOSYS; return -1; diff --git a/test/default/aead_aes256gcm.c b/test/default/aead_aes256gcm.c index 1f83fdc0..f8da3d57 100644 --- a/test/default/aead_aes256gcm.c +++ b/test/default/aead_aes256gcm.c @@ -3097,6 +3097,7 @@ tv(void) size_t detached_ciphertext_len; size_t i = 0U; size_t message_len; + int res; key = (unsigned char *) sodium_malloc(crypto_aead_aes256gcm_KEYBYTES); nonce = (unsigned char *) sodium_malloc(crypto_aead_aes256gcm_NPUBBYTES); @@ -3135,34 +3136,48 @@ tv(void) ciphertext = (unsigned char *) sodium_malloc(ciphertext_len); detached_ciphertext = (unsigned char *) sodium_malloc(detached_ciphertext_len); - crypto_aead_aes256gcm_encrypt_detached(detached_ciphertext, mac, - &found_mac_len, - message, message_len, - ad, ad_len, NULL, nonce, key); + res = crypto_aead_aes256gcm_encrypt_detached(detached_ciphertext, mac, + &found_mac_len, + message, message_len, + ad, ad_len, NULL, nonce, key); assert(found_mac_len == crypto_aead_aes256gcm_ABYTES); if (memcmp(detached_ciphertext, expected_ciphertext, detached_ciphertext_len) != 0 || memcmp(mac, expected_ciphertext + message_len, crypto_aead_aes256gcm_ABYTES) != 0) { - printf("Detached encryption of test vector #%u failed\n", (unsigned int) i); + printf("Detached encryption of test vector #%u failed (res=%d)\n", + (unsigned int) i, res); hex = (char *) sodium_malloc((size_t) ciphertext_len * 2 + 1); - sodium_bin2hex(hex, (size_t) ciphertext_len * 2 + 1, - ciphertext, ciphertext_len); + sodium_bin2hex(hex, (size_t) detached_ciphertext_len * 2 + 1, + detached_ciphertext, detached_ciphertext_len); printf("Computed: [%s]\n", hex); + sodium_bin2hex(hex, (size_t) detached_ciphertext_len * 2 + 1, + expected_ciphertext, detached_ciphertext_len); + printf("Expected: [%s]\n", hex); + sodium_bin2hex(hex, (size_t) found_mac_len * 2 + 1, + mac, found_mac_len); + printf("Computed mac: [%s]\n", hex); + sodium_bin2hex(hex, (size_t) found_mac_len * 2 + 1, + expected_ciphertext + message_len, found_mac_len); + printf("Expected mac: [%s]\n", hex); 
sodium_free(hex); } - crypto_aead_aes256gcm_encrypt(ciphertext, &found_ciphertext_len, - message, message_len, - ad, ad_len, NULL, nonce, key); + res = crypto_aead_aes256gcm_encrypt(ciphertext, &found_ciphertext_len, + message, message_len, + ad, ad_len, NULL, nonce, key); assert((size_t) found_ciphertext_len == ciphertext_len); if (memcmp(ciphertext, expected_ciphertext, ciphertext_len) != 0) { - printf("Encryption of test vector #%u failed\n", (unsigned int) i); + printf("Encryption of test vector #%u failed (res=%d)\n", + (unsigned int) i, res); hex = (char *) sodium_malloc((size_t) found_ciphertext_len * 2 + 1); sodium_bin2hex(hex, (size_t) found_ciphertext_len * 2 + 1, ciphertext, ciphertext_len); printf("Computed: [%s]\n", hex); + sodium_bin2hex(hex, (size_t) ciphertext_len * 2 + 1, + expected_ciphertext, ciphertext_len); + printf("Expected: [%s]\n", hex); sodium_free(hex); }
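
For context, a minimal round-trip sketch against the one-shot public API whose signatures appear in this patch (crypto_aead_aes256gcm_encrypt / crypto_aead_aes256gcm_decrypt). The sample message, buffer names and error handling below are illustrative only and are not part of the patch:

#include <stdio.h>
#include <string.h>
#include <sodium.h>

int
main(void)
{
    unsigned char key[crypto_aead_aes256gcm_KEYBYTES];
    unsigned char nonce[crypto_aead_aes256gcm_NPUBBYTES];
    const unsigned char msg[] = "test message";
    const unsigned char ad[]  = "associated data";
    unsigned char ciphertext[sizeof msg + crypto_aead_aes256gcm_ABYTES];
    unsigned char decrypted[sizeof msg];
    unsigned long long ciphertext_len, decrypted_len;

    if (sodium_init() < 0 || !crypto_aead_aes256gcm_is_available()) {
        return 1; /* hardware AES/CLMUL support is required for this backend */
    }
    crypto_aead_aes256gcm_keygen(key);
    randombytes_buf(nonce, sizeof nonce);

    /* Combined mode: the 16-byte tag is appended to the ciphertext */
    crypto_aead_aes256gcm_encrypt(ciphertext, &ciphertext_len,
                                  msg, sizeof msg, ad, sizeof ad,
                                  NULL, nonce, key);

    /* Decryption returns -1 if the tag does not verify */
    if (crypto_aead_aes256gcm_decrypt(decrypted, &decrypted_len, NULL,
                                      ciphertext, ciphertext_len,
                                      ad, sizeof ad, nonce, key) != 0) {
        return 1;
    }
    return memcmp(decrypted, msg, sizeof msg) != 0;
}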