aes256gcm: handle 8 blocks at a time instead of 4
Keep using 4 blocks at a time for AD, as AD is usually short. Decrypt-and-verify instead of verify-then-decrypt.
This commit is contained in:
parent c4b08fb208
commit cc2bcbc217
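The core of the change is easier to see without the MAKE8 macro machinery used in the diff below. Here is a minimal, self-contained C sketch of the idea (ctr_encrypt8 is a hypothetical name, not libsodium's actual code): the old source comment notes that on Haswell aesenc has a latency of about 7 cycles and a throughput of 1 per cycle, so eight independent in-flight blocks keep the AES unit busy every cycle, where four leave pipeline bubbles.

    #include <stdint.h>
    #include <tmmintrin.h> /* _mm_shuffle_epi8 (SSSE3) */
    #include <wmmintrin.h> /* _mm_aesenc_si128 etc. (AES-NI) */

    /* Hypothetical sketch: encrypt 8 consecutive counter blocks with an
     * expanded AES-256 key (15 round keys). As in the code below, n[0..2]
     * hold the nonce and n[3] the little-endian block counter. */
    static void
    ctr_encrypt8(unsigned char out[128], uint32_t n[4], const __m128i rkeys[15])
    {
        /* byte-swaps only bytes 12..15, making the counter word big-endian */
        const __m128i pt =
            _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        __m128i t[8];
        int     i, round;

        for (i = 0; i < 8; i++) {
            t[i] = _mm_xor_si128(
                _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) n), pt),
                rkeys[0]);
            n[3]++;
        }
        for (round = 1; round < 14; round++) {
            /* 8 independent dependency chains: one aesenc issues per cycle
             * while each result takes ~7 cycles, so the pipeline stays
             * full -- the rationale for going from 4 to 8 blocks */
            for (i = 0; i < 8; i++) {
                t[i] = _mm_aesenc_si128(t[i], rkeys[round]);
            }
        }
        for (i = 0; i < 8; i++) {
            t[i] = _mm_aesenclast_si128(t[i], rkeys[14]);
            _mm_storeu_si128((__m128i *) (out + i * 16), t[i]);
        }
    }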
@@ -1,8 +1,8 @@
 /*
- * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its Usage for Computing
- * the GCM Mode" paper and reference code, using the aggregated reduction method.
- * Originally adapted by Romain Dolbeau.
+ * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its
+ * Usage for Computing the GCM Mode" paper and reference code, using the
+ * aggregated reduction method. Originally adapted by Romain Dolbeau.
 */

 #include <errno.h>
@@ -99,50 +99,33 @@ aesni_key256_expand(const unsigned char *key, __m128i *const rkeys)
     EXPAND_KEY_1(0x40);
 }

-/** single, by-the-book AES encryption with AES-NI */
-static inline void
-aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
-{
-    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
-
-    temp = _mm_aesenc_si128(temp, rkeys[1]);
-    temp = _mm_aesenc_si128(temp, rkeys[2]);
-    temp = _mm_aesenc_si128(temp, rkeys[3]);
-    temp = _mm_aesenc_si128(temp, rkeys[4]);
-    temp = _mm_aesenc_si128(temp, rkeys[5]);
-    temp = _mm_aesenc_si128(temp, rkeys[6]);
-    temp = _mm_aesenc_si128(temp, rkeys[7]);
-    temp = _mm_aesenc_si128(temp, rkeys[8]);
-    temp = _mm_aesenc_si128(temp, rkeys[9]);
-    temp = _mm_aesenc_si128(temp, rkeys[10]);
-    temp = _mm_aesenc_si128(temp, rkeys[11]);
-    temp = _mm_aesenc_si128(temp, rkeys[12]);
-    temp = _mm_aesenc_si128(temp, rkeys[13]);
-
-    temp = _mm_aesenclast_si128(temp, rkeys[14]);
-    _mm_storeu_si128((__m128i *) out, temp);
-}
-
-/** multiple-blocks-at-once AES encryption with AES-NI ;
-    on Haswell, aesenc has a latency of 7 and a throughput of 1
-    so the sequence of aesenc should be bubble-free if you
-    have at least 8 blocks. Let's build an arbitratry-sized
-    function */
-/* Step 1 : loading the nonce */
-/* load & increment the n vector (non-vectorized, unused for now) */
 #define NVDECLx(a) __m128i nv##a

+/* Step 1 : loading and incrementing the nonce */
 #define NVx(a) \
     nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
     n[3]++

-/* Step 2 : define value in round one (xor with subkey #0, aka key) */
 #define TEMPDECLx(a) __m128i temp##a

+/* Step 2 : define value in round one (xor with subkey #0, aka key) */
 #define TEMPx(a) temp##a = _mm_xor_si128(nv##a, rkeys[0])

 /* Step 3: one round of AES */
-#define AESENCx(a) temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])
+#define AESENCx(a, roundctr) temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])
+#define AESENCx1(a) AESENCx(a, 1)
+#define AESENCx2(a) AESENCx(a, 2)
+#define AESENCx3(a) AESENCx(a, 3)
+#define AESENCx4(a) AESENCx(a, 4)
+#define AESENCx5(a) AESENCx(a, 5)
+#define AESENCx6(a) AESENCx(a, 6)
+#define AESENCx7(a) AESENCx(a, 7)
+#define AESENCx8(a) AESENCx(a, 8)
+#define AESENCx9(a) AESENCx(a, 9)
+#define AESENCx10(a) AESENCx(a, 10)
+#define AESENCx11(a) AESENCx(a, 11)
+#define AESENCx12(a) AESENCx(a, 12)
+#define AESENCx13(a) AESENCx(a, 13)

 /* Step 4: last round of AES */
 #define AESENCLASTx(a) temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])
@@ -169,33 +152,61 @@ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
 #define COUNTER_INC2(N) (N)[3] += 2

-/* create a function of unrolling N ; the MAKEN is the unrolling
-   macro, defined above. The N in MAKEN must match N, obviously. */
-#define FUNC(N, MAKEN) \
-    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
-    { \
-        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-        int roundctr; \
-        MAKEN(NVDECLx); \
-        MAKEN(TEMPDECLx); \
-\
-        MAKEN(NVx); \
-        MAKEN(TEMPx); \
-        for (roundctr = 1; roundctr < 14; roundctr++) { \
-            MAKEN(AESENCx); \
-        } \
-        MAKEN(AESENCLASTx); \
-        MAKEN(STOREx); \
+static inline void
+aesni_encrypt1(unsigned char *out, __m128i nv0, const __m128i *rkeys)
+{
+    TEMPDECLx(0);
+
+    TEMPx(0);
+    AESENCx1(0);
+    AESENCx2(0);
+    AESENCx3(0);
+    AESENCx4(0);
+    AESENCx5(0);
+    AESENCx6(0);
+    AESENCx7(0);
+    AESENCx8(0);
+    AESENCx9(0);
+    AESENCx10(0);
+    AESENCx11(0);
+    AESENCx12(0);
+    AESENCx13(0);
+    AESENCLASTx(0);
+    STOREx(0);
 }

-FUNC(8, MAKE8)
+static inline void
+aesni_encrypt8(unsigned char *out, uint32_t *n, const __m128i *rkeys)
+{
+    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    MAKE8(NVDECLx);
+    MAKE8(TEMPDECLx);

-/* all GF(2^128) fnctions are by the book, meaning this one:
+    MAKE8(NVx);
+    MAKE8(TEMPx);
+    MAKE8(AESENCx1);
+    MAKE8(AESENCx2);
+    MAKE8(AESENCx3);
+    MAKE8(AESENCx4);
+    MAKE8(AESENCx5);
+    MAKE8(AESENCx6);
+    MAKE8(AESENCx7);
+    MAKE8(AESENCx8);
+    MAKE8(AESENCx9);
+    MAKE8(AESENCx10);
+    MAKE8(AESENCx11);
+    MAKE8(AESENCx12);
+    MAKE8(AESENCx13);
+    MAKE8(AESENCLASTx);
+    MAKE8(STOREx);
+}
+
+/* all GF(2^128) functions are by the book, meaning this one:
    <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
 */

 static inline void
-addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
+addmulreduce(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
 {
     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     __m128i A, B, C;
@@ -301,11 +312,6 @@ mulv(__m128i A, __m128i B)
     return C;
 }

-/* 4 multiply-accumulate at once; again
-   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
-   for the Aggregated Reduction Method & sample code.
-   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
-
 #define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
 #define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
 #define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
@@ -317,34 +323,30 @@ mulv(__m128i A, __m128i B)
     tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
     tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)

-#define MULREDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
+/* 4 multiply-accumulate at once; again
+   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
+   for the Aggregated Reduction Method & sample code.
+   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+#define ADDMULREDUCE4(H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
 do { \
-    MAKE4(RED_DECL); \
-    __m128i lo, hi; \
-    __m128i H0 = H0_, H1 = H1_, H2 = H2_, H3 = H3_; \
-    __m128i X0 = X0_, X1 = X1_, X2 = X2_, X3 = X3_; \
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+    __m128i lo, tmplo, hi, tmphi; \
+    __m128i tmp8, tmp9; \
+    __m128i H0 = H0_; \
+    __m128i H1 = H1_; \
+    __m128i H2 = H2_; \
+    __m128i H3 = H3_; \
+    __m128i X0 = X0_; \
+    __m128i X1 = X1_; \
+    __m128i X2 = X2_; \
+    __m128i X3 = X3_; \
+    MAKE4(RED_DECL); \
 \
     /* byte-revert the inputs & xor the first one into the accumulator */ \
 \
     MAKE4(RED_SHUFFLE); \
     X3 = _mm_xor_si128(X3, accv); \
 \
     /* 4 low H*X (x0*h0) */ \
 \
     MAKE4(RED_MUL_LOW); \
     lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
     lo = _mm_xor_si128(lo, H2_X2_lo); \
     lo = _mm_xor_si128(lo, H3_X3_lo); \
 \
     /* 4 high H*X (x1*h1) */ \
 \
     MAKE4(RED_MUL_HIGH); \
     hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
     hi = _mm_xor_si128(hi, H2_X2_hi); \
@@ -402,79 +404,177 @@ mulv(__m128i A, __m128i B)
     tmp2 = _mm_xor_si128(tmp2, tmp8); \
     tmp3 = _mm_xor_si128(tmp3, tmp2); \
     tmp2B = _mm_xor_si128(tmp2B, tmp3); \
 \
     accv = tmp2B; \
 } while (0)

+#define ADDMULREDUCE8(H0_, H1_, H2_, H3_, H4_, H5_, H6_, H7_, X0_, X1_, X2_, X3_, X4_, X5_, X6_, \
+                      X7_, accv) \
+do { \
+    __m128i H0 = H0_, H1 = H1_, H2 = H2_, H3 = H3_, H4 = H4_, H5 = H5_, H6 = H6_, H7 = H7_; \
+    __m128i X0 = X0_, X1 = X1_, X2 = X2_, X3 = X3_, X4 = X4_, X5 = X5_, X6 = X6_, X7 = X7_; \
+    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+    __m128i lo, tmplo, hi, tmphi; \
+    __m128i tmp8, tmp9; \
+    MAKE8(RED_DECL); \
+\
+    /* byte-revert the inputs & xor the first one into the accumulator */ \
+    MAKE8(RED_SHUFFLE); \
+    X7 = _mm_xor_si128(X7, accv); \
+\
+    /* 8 low H*X (x0*h0) */ \
+    MAKE8(RED_MUL_LOW); \
+    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
+    lo = _mm_xor_si128(lo, H2_X2_lo); \
+    lo = _mm_xor_si128(lo, H3_X3_lo); \
+    lo = _mm_xor_si128(lo, H4_X4_lo); \
+    lo = _mm_xor_si128(lo, H5_X5_lo); \
+    lo = _mm_xor_si128(lo, H6_X6_lo); \
+    lo = _mm_xor_si128(lo, H7_X7_lo); \
+\
+    /* 8 high H*X (x1*h1) */ \
+    MAKE8(RED_MUL_HIGH); \
+    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
+    hi = _mm_xor_si128(hi, H2_X2_hi); \
+    hi = _mm_xor_si128(hi, H3_X3_hi); \
+    hi = _mm_xor_si128(hi, H4_X4_hi); \
+    hi = _mm_xor_si128(hi, H5_X5_hi); \
+    hi = _mm_xor_si128(hi, H6_X6_hi); \
+    hi = _mm_xor_si128(hi, H7_X7_hi); \
+\
+    /* 8 middle H*X, using Karatsuba, i.e. \
+       x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \
+       we already have all x1y1 & x0y0 (accumulated in hi & lo) \
+       (0 is low half and 1 is high half) \
+    */ \
+    /* permute the high and low 64 bits in H1 & X1, \
+       so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
+       then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
+       and finally multiply \
+    */ \
+    MAKE8(RED_MUL_MID); \
+\
+    /* substracts x1*h1 and x0*h0 */ \
+    tmp0 = _mm_xor_si128(tmp0, lo); \
+    tmp0 = _mm_xor_si128(tmp0, hi); \
+    tmp0 = _mm_xor_si128(tmp1, tmp0); \
+    tmp0 = _mm_xor_si128(tmp2, tmp0); \
+    tmp0 = _mm_xor_si128(tmp3, tmp0); \
+    tmp0 = _mm_xor_si128(tmp4, tmp0); \
+    tmp0 = _mm_xor_si128(tmp5, tmp0); \
+    tmp0 = _mm_xor_si128(tmp6, tmp0); \
+    tmp0 = _mm_xor_si128(tmp7, tmp0); \
+\
+    /* reduction */ \
+    tmp0B = _mm_slli_si128(tmp0, 8); \
+    tmp0 = _mm_srli_si128(tmp0, 8); \
+    lo = _mm_xor_si128(tmp0B, lo); \
+    hi = _mm_xor_si128(tmp0, hi); \
+    tmp3 = lo; \
+    tmp2B = hi; \
+    tmp3B = _mm_srli_epi32(tmp3, 31); \
+    tmp8 = _mm_srli_epi32(tmp2B, 31); \
+    tmp3 = _mm_slli_epi32(tmp3, 1); \
+    tmp2B = _mm_slli_epi32(tmp2B, 1); \
+    tmp9 = _mm_srli_si128(tmp3B, 12); \
+    tmp8 = _mm_slli_si128(tmp8, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 4); \
+    tmp3 = _mm_or_si128(tmp3, tmp3B); \
+    tmp2B = _mm_or_si128(tmp2B, tmp8); \
+    tmp2B = _mm_or_si128(tmp2B, tmp9); \
+    tmp3B = _mm_slli_epi32(tmp3, 31); \
+    tmp8 = _mm_slli_epi32(tmp3, 30); \
+    tmp9 = _mm_slli_epi32(tmp3, 25); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
+    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
+    tmp8 = _mm_srli_si128(tmp3B, 4); \
+    tmp3B = _mm_slli_si128(tmp3B, 12); \
+    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
+    tmp2 = _mm_srli_epi32(tmp3, 1); \
+    tmp0B = _mm_srli_epi32(tmp3, 2); \
+    tmp1B = _mm_srli_epi32(tmp3, 7); \
+    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
+    tmp2 = _mm_xor_si128(tmp2, tmp8); \
+    tmp3 = _mm_xor_si128(tmp3, tmp2); \
+    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
+    accv = tmp2B; \
+} while (0)
+
+#define INDECLx(a) __m128i in##a

 #define XORx(a) temp##a = _mm_xor_si128(temp##a, _mm_loadu_si128((const __m128i *) (in + a * 16)))

+#define INXORx(a) \
+    temp##a = _mm_xor_si128(temp##a, (in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))))
+
 #define LOADx(a) __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))

 /* full encrypt & checksum 8 blocks at once */
-#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
+#define ENCRYPT8FULL(out_, n_, rkeys, in_, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v) \
 do { \
-    unsigned char * out = out_; \
-    uint32_t * n = n_; \
-    const unsigned char *in = in_; \
-    const __m128i hv = hv_; \
-    const __m128i h2v = h2v_; \
-    const __m128i h3v = h3v_; \
-    const __m128i h4v = h4v_; \
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-    __m128i accv_; \
-    int roundctr; \
-\
+    const unsigned char *in = in_; \
+    unsigned char * out = out_; \
+    unsigned int * n = n_; \
+    __m128i accv = _mm_loadu_si128((const __m128i *) accum); \
     MAKE8(NVDECLx); \
     MAKE8(TEMPDECLx); \
 \
     MAKE8(NVx); \
     MAKE8(TEMPx); \
-    for (roundctr = 1; roundctr < 14; roundctr++) { \
-        MAKE8(AESENCx); \
-    } \
+    MAKE8(AESENCx1); \
+    MAKE8(AESENCx2); \
+    MAKE8(AESENCx3); \
+    MAKE8(AESENCx4); \
+    MAKE8(AESENCx5); \
+    MAKE8(AESENCx6); \
+    MAKE8(AESENCx7); \
+    MAKE8(AESENCx8); \
+    MAKE8(AESENCx9); \
+    MAKE8(AESENCx10); \
+    MAKE8(AESENCx11); \
+    MAKE8(AESENCx12); \
+    MAKE8(AESENCx13); \
     MAKE8(AESENCLASTx); \
     MAKE8(XORx); \
     MAKE8(STOREx); \
-    accv_ = _mm_load_si128((const __m128i *) accum); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
-    _mm_store_si128((__m128i *) accum, accv_); \
+    ADDMULREDUCE8(Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, temp7, temp6, temp5, temp4, temp3, \
+                  temp2, temp1, temp0, accv); \
+    _mm_storeu_si128((__m128i *) accum, accv); \
 } while (0)

-/* checksum 8 blocks at once */
-#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
+/* full decrypt & checksum 8 blocks at once */
+#define DECRYPT8FULL(out_, n_, rkeys, in_, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v) \
 do { \
-    const unsigned char *in = in_; \
-    const __m128i hv = hv_; \
-    const __m128i h2v = h2v_; \
-    const __m128i h3v = h3v_; \
-    const __m128i h4v = h4v_; \
-    __m128i accv_; \
-\
-    MAKE8(LOADx); \
-    accv_ = _mm_load_si128((const __m128i *) accum); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
-    MULREDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
-    _mm_store_si128((__m128i *) accum, accv_); \
-} while (0)
-
-/* decrypt 8 blocks at once */
-#define aesni_decrypt8full(out_, n_, rkeys, in_) \
-do { \
-    unsigned char * out = out_; \
-    uint32_t * n = n_; \
-    const unsigned char *in = in_; \
     const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-    int roundctr; \
-\
+    const unsigned char *in = in_; \
+    unsigned char * out = out_; \
+    unsigned int * n = n_; \
+    __m128i accv = _mm_loadu_si128((const __m128i *) accum); \
+    MAKE8(INDECLx); \
     MAKE8(NVDECLx); \
     MAKE8(TEMPDECLx); \
 \
     MAKE8(NVx); \
     MAKE8(TEMPx); \
-    for (roundctr = 1; roundctr < 14; roundctr++) { \
-        MAKE8(AESENCx); \
-    } \
+    MAKE8(AESENCx1); \
+    MAKE8(AESENCx2); \
+    MAKE8(AESENCx3); \
+    MAKE8(AESENCx4); \
+    MAKE8(AESENCx5); \
+    MAKE8(AESENCx6); \
+    MAKE8(AESENCx7); \
+    MAKE8(AESENCx8); \
+    MAKE8(AESENCx9); \
+    MAKE8(AESENCx10); \
+    MAKE8(AESENCx11); \
+    MAKE8(AESENCx12); \
+    MAKE8(AESENCx13); \
     MAKE8(AESENCLASTx); \
-    MAKE8(XORx); \
+    MAKE8(INXORx); \
+    ADDMULREDUCE8(Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, in7, in6, in5, in4, in3, in2, in1, \
+                  in0, accv); \
+    _mm_storeu_si128((__m128i *) accum, accv); \
     MAKE8(STOREx); \
 } while (0)

@@ -504,7 +604,7 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_;
     const __m128i * rkeys = ctx->rkeys;
-    __m128i Hv, H2v, H3v, H4v, accv;
+    __m128i Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, accv;
     unsigned long long i, j;
     unsigned long long adlen_rnd64 = adlen & ~63ULL;
     unsigned long long mlen_rnd128 = mlen & ~127ULL;
@@ -535,15 +635,18 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
     H2v = mulv(Hv, Hv);
     H3v = mulv(H2v, Hv);
     H4v = mulv(H3v, Hv);
+    H5v = mulv(H4v, Hv);
+    H6v = mulv(H5v, Hv);
+    H7v = mulv(H6v, Hv);
+    H8v = mulv(H7v, Hv);

     accv = _mm_setzero_si128();
-    /* unrolled by 4 GCM (by 8 doesn't improve using MULREDUCE4) */
     for (i = 0; i < adlen_rnd64; i += 64) {
         __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
         __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
+        ADDMULREDUCE4(Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
     }
     _mm_store_si128((__m128i *) accum, accv);

@@ -554,7 +657,7 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
         if (i + (unsigned long long) blocklen > adlen) {
             blocklen = (unsigned int) (adlen - i);
         }
-        addmul(accum, ad + i, blocklen, H);
+        addmulreduce(accum, ad + i, blocklen, H);
     }

 /* this only does 8 full blocks, so no fancy bounds checking is necessary*/
@@ -564,7 +667,7 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
         const int lb = iter * 16; \
 \
         for (i = 0; i < mlen_rnd128; i += lb) { \
-            aesni_encrypt8full(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev); \
+            ENCRYPT8FULL(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v); \
         } \
     } while (0)

@@ -591,7 +694,7 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
             if (j + (unsigned long long) bl >= mj) { \
                 bl = (unsigned int) (mj - j); \
             } \
-            addmul(accum, c + i + j, bl, H); \
+            addmulreduce(accum, c + i + j, bl, H); \
         } \
     } \
 } while (0)
@@ -601,7 +704,7 @@ crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *
     LOOPRND128;
     LOOPRMD128;

-    addmul(accum, fb, 16, H);
+    addmulreduce(accum, fb, 16, H);

     for (i = 0; i < 16; ++i) {
         mac[i] = T[i] ^ accum[15 - i];
@@ -637,7 +740,7 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
     const aes256gcm_state *ctx = (const aes256gcm_state *) (const void *) ctx_;
     const __m128i * rkeys = ctx->rkeys;
-    __m128i Hv, H2v, H3v, H4v, accv;
+    __m128i Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v, accv;
     unsigned long long i, j;
     unsigned long long adlen_rnd64 = adlen & ~63ULL;
     unsigned long long mlen;
@@ -672,65 +775,38 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
     H2v = mulv(Hv, Hv);
     H3v = mulv(H2v, Hv);
     H4v = mulv(H3v, Hv);

+    H5v = mulv(H4v, Hv);
+    H6v = mulv(H5v, Hv);
+    H7v = mulv(H6v, Hv);
+    H8v = mulv(H7v, Hv);
     accv = _mm_setzero_si128();

     for (i = 0; i < adlen_rnd64; i += 64) {
         __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
         __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
         __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
         __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
-        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
+        ADDMULREDUCE4(Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
     }
     _mm_store_si128((__m128i *) accum, accv);

     /* GCM remainder loop */
     for (i = adlen_rnd64; i < adlen; i += 16) {
         unsigned int blocklen = 16;
         if (i + (unsigned long long) blocklen > adlen) {
             blocklen = (unsigned int) (adlen - i);
         }
-        addmul(accum, ad + i, blocklen, H);
+        addmulreduce(accum, ad + i, blocklen, H);
     }

     mlen_rnd128 = mlen & ~127ULL;

-#define LOOPACCUMDRND128 \
-    do { \
-        const int iter = 8; \
-        const int lb = iter * 16; \
-        for (i = 0; i < mlen_rnd128; i += lb) { \
-            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev); \
-        } \
-    } while (0)

 #define LOOPDRND128 \
     do { \
         const int iter = 8; \
         const int lb = iter * 16; \
 \
         for (i = 0; i < mlen_rnd128; i += lb) { \
-            aesni_decrypt8full(m + i, n2, rkeys, c + i); \
-        } \
-    } while (0)
-
-#define LOOPACCUMDRMD128 \
-    do { \
-        const int iter = 8; \
-        const int lb = iter * 16; \
-\
-        for (i = mlen_rnd128; i < mlen; i += lb) { \
-            unsigned long long mj = lb; \
-\
-            if ((i + mj) >= mlen) { \
-                mj = mlen - i; \
-            } \
-            for (j = 0; j < mj; j += 16) { \
-                unsigned int bl = 16; \
-\
-                if (j + (unsigned long long) bl >= mj) { \
-                    bl = (unsigned int) (mj - j); \
-                } \
-                addmul(accum, c + i + j, bl, H); \
-            } \
+            DECRYPT8FULL(m + i, n2, rkeys, c + i, accum, Hv, H2v, H3v, H4v, H5v, H6v, H7v, H8v); \
         } \
     } while (0)

@@ -738,13 +814,17 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
     do { \
         const int iter = 8; \
         const int lb = iter * 16; \
 \
         for (i = mlen_rnd128; i < mlen; i += lb) { \
-            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+            CRYPTO_ALIGN(16) unsigned char outni[lb]; \
             unsigned long long mj = lb; \
 \
-            if ((i + mj) >= mlen) { \
+            if ((i + mj) >= mlen) \
                 mj = mlen - i; \
+            for (j = 0; j < mj; j += 16) { \
+                unsigned long long bl = 16; \
+                if (j + bl >= mj) { \
+                    bl = mj - j; \
+                } \
+                addmulreduce(accum, c + i + j, bl, H); \
             } \
             aesni_encrypt8(outni, n2, rkeys); \
             for (j = 0; j < mj; j++) { \
@@ -753,12 +833,11 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
         } \
     } while (0)

     n2[3] &= 0x00ffffff;

     n2[3] = 0U;
     COUNTER_INC2(n2);
-    LOOPACCUMDRND128;
-    LOOPACCUMDRMD128;
-    addmul(accum, fb, 16, H);
+    LOOPDRND128;
+    LOOPDRMD128;
+    addmulreduce(accum, fb, 16, H);
     {
         unsigned char d = 0;

@@ -775,11 +854,6 @@ crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *
             return 0;
         }
     }
-    n2[3] = 0U;
-    COUNTER_INC2(n2);
-    LOOPDRND128;
-    LOOPDRMD128;
-
     return 0;
 }
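The last two hunks carry the ordering change named in the commit message: the old code ran a checksum-only pass over the ciphertext (LOOPACCUMDRND128 / LOOPACCUMDRMD128), compared tags, and only then decrypted; the new code decrypts and accumulates the GHASH in a single pass (DECRYPT8FULL / LOOPDRMD128) and compares tags at the end. What keeps this safe is that the tag comparison remains constant-time, as with the `unsigned char d` accumulator above, and the plaintext is discarded on mismatch (the failure branch is outside the rendered context). A hedged sketch of such a comparison, with verify16 as an illustrative name rather than libsodium's API:

    /* Returns 0 if two 16-byte tags match, -1 otherwise, without
     * branching on secret data (no early exit at the first mismatch). */
    static int
    verify16(const unsigned char x[16], const unsigned char y[16])
    {
        unsigned char d = 0;
        int           i;

        for (i = 0; i < 16; i++) {
            d |= (unsigned char) (x[i] ^ y[i]); /* accumulate differences */
        }
        /* d == 0: (0u - 1) >> 8 has its low bit set, so the result is 0;
           d != 0: (d - 1) < 256, so the shift yields 0 and we return -1 */
        return (int) ((((unsigned int) d - 1U) >> 8) & 1U) - 1;
    }

Because the single pass writes plaintext before the tag is known to be valid, a failed check must be followed by wiping the output buffer before returning -1, so that unverified plaintext is never exposed to the caller.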