crypto: x86/aes-xts - handle AES-128 and AES-192 more efficiently
Decrease the amount of code specific to the different AES variants by "right-aligning" the sequence of round keys, and for AES-128 and AES-192 just skipping irrelevant rounds at the beginning.

This shrinks the size of aes-xts-avx-x86_64.o by 13.3%, and it improves the efficiency of AES-128 and AES-192. The tradeoff is that for AES-256 some additional not-taken conditional jumps are now executed. But these are predicted well and are cheap on x86.

Note that the ARMv8 CE based AES-XTS implementation uses a similar strategy to handle the different AES variants.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 2717e01fc3
parent ea9459ef36
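As a quick illustration of the "right-aligned" round-key strategy described above, here is a minimal C model (an editor's sketch, not part of the patch; round_stub() is a made-up stand-in for one AES round). The schedule is laid out so that the last round is always round 14, and the shorter variants simply jump in later, which is what the cmp/jl/je fall-through sequences in the diff below implement:

#include <stdio.h>

/* Stand-in for one AES round; the real code uses vaesenc/vaesdec, with a
 * plain XOR for "round 0" and vaesenclast/vaesdeclast for round 14. */
static void round_stub(int i)
{
	printf("round %d\n", i);
}

/* With the round keys right-aligned, AES-128 and AES-192 just skip the
 * irrelevant early rounds; AES-256 falls through all of them. */
static void do_rounds(int keylen)
{
	switch (keylen) {
	case 32:			/* AES-256: rounds 1-14 */
		round_stub(1);
		round_stub(2);
		/* fall through */
	case 24:			/* AES-192: rounds 3-14 */
		round_stub(3);
		round_stub(4);
		/* fall through */
	case 16:			/* AES-128: rounds 5-14 */
		for (int i = 5; i <= 14; i++)
			round_stub(i);
	}
}

int main(void)
{
	do_rounds(16);	/* AES-128 */
	do_rounds(24);	/* AES-192 */
	do_rounds(32);	/* AES-256 */
	return 0;
}

For AES-256 the only cost is a couple of well-predicted, not-taken branches, while AES-128 and AES-192 no longer run the rounds that don't apply to them.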
arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -82,14 +82,15 @@

// Function parameters
.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
// advanced to point directly to 7th round key
// advanced to point to 7th-from-last round key
.set SRC, %rsi // Pointer to next source data
.set DST, %rdx // Pointer to next destination data
.set LEN, %rcx // Remaining length in bytes
.set TWEAK, %r8 // Pointer to next tweak

// %r9d holds the AES key length in bytes.
// %r9 holds the AES key length in bytes.
.set KEYLEN, %r9d
.set KEYLEN64, %r9

// %rax and %r10-r11 are available as temporaries.
@@ -165,12 +166,18 @@
.set GF_POLY_XMM, %xmm14
.set GF_POLY, V14

// V15 holds the first AES round key, copied to all 128-bit lanes.
// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
.set KEY0_XMM, %xmm15
.set KEY0, V15

// If 32 SIMD registers are available, then V16-V29 hold the remaining
// AES round keys, copied to all 128-bit lanes.
//
// AES-128, AES-192, and AES-256 use different numbers of round keys.
// To allow handling all three variants efficiently, we align the round
// keys to the *end* of this register range. I.e., AES-128 uses
// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
.set KEY1_XMM, %xmm16
.set KEY1, V16
@@ -340,15 +347,15 @@
.set PREV_TWEAK, NEXT_TWEAK2
.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i < 20 && \i % 5 == 0
.if \i >= 0 && \i < 20 && \i % 5 == 0
vpshufd $0x13, PREV_TWEAK, V5
.elseif \i < 20 && \i % 5 == 1
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i < 20 && \i % 5 == 2
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
vpsrad $31, V5, V5
.elseif \i < 20 && \i % 5 == 3
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
vpand GF_POLY, V5, V5
.elseif \i < 20 && \i % 5 == 4
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
vpxor V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
vmovdqa NEXT_TWEAK0, TWEAK0
@@ -364,21 +371,21 @@
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
.macro _tweak_step_pclmul i
.if \i == 2
.if \i == 0
vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 4
.elseif \i == 2
vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 6
.elseif \i == 4
vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 8
.elseif \i == 6
vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 10
.elseif \i == 8
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 12
.elseif \i == 10
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 14
.elseif \i == 12
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 16
.elseif \i == 14
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
@@ -393,8 +400,8 @@
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0
// through at least 19, then 1000 which signals the last step.
// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.
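Both the mulx and pclmul variants implement the standard XTS tweak update, whose scalar form is multiplication of the tweak by x in GF(2^128) with the reducing polynomial x^128 + x^7 + x^2 + x + 1. A minimal scalar sketch for reference (an editor's illustration, not code from the patch):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), little-endian byte order
 * per IEEE P1619: shift left by one bit and, if a bit fell off the top,
 * XOR in 0x87 (x^7 + x^2 + x + 1). The SIMD macros above apply this kind
 * of update to several tweaks at a time, either with shift/mask/XOR
 * sequences or with vpclmulqdq. */
static void xts_mul_x(uint8_t tweak[16])
{
	uint8_t carry = 0;

	for (int i = 0; i < 16; i++) {
		uint8_t next_carry = tweak[i] >> 7;

		tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
		carry = next_carry;
	}
	if (carry)
		tweak[0] ^= 0x87;
}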
@@ -406,22 +413,56 @@
.endif
.endm

// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
.macro _load_round_keys
_vbroadcast128 -7*16(KEY), KEY0
.macro _setup_round_keys enc

// Select either the encryption round keys or the decryption round keys.
.if \enc
.set OFFS, 0
.else
.set OFFS, 240
.endif

// Load the round key for "round 0".
_vbroadcast128 OFFS(KEY), KEY0

// Increment KEY to make it so that 7*16(KEY) is the last round key.
// For AES-128, increment by 3*16, resulting in the 10 round keys (not
// counting the zero-th round key which was just loaded into KEY0) being
// -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
// 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
//
// This rebasing provides two benefits. First, it makes the offset to
// any round key be in the range [-96, 112], fitting in a signed byte.
// This shortens VEX-encoded instructions that access the later round
// keys which otherwise would need 4-byte offsets. Second, it makes it
// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
// beginning. Skipping rounds at the end doesn't work as well because
// the last round needs different instructions.
//
// An alternative approach would be to roll up all the round loops. We
// don't do that because it isn't compatible with caching the round keys
// in registers which we do when possible (see below), and also because
// it seems unwise to rely *too* heavily on the CPU's branch predictor.
lea OFFS-16(KEY, KEYLEN64, 4), KEY

// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vbroadcast128 -6*16(KEY), KEY1
_vbroadcast128 -5*16(KEY), KEY2
.Laes192\@:
_vbroadcast128 -4*16(KEY), KEY3
_vbroadcast128 -3*16(KEY), KEY4
.Laes128\@:
_vbroadcast128 -2*16(KEY), KEY5
_vbroadcast128 -1*16(KEY), KEY6
_vbroadcast128 0*16(KEY), KEY7
_vbroadcast128 1*16(KEY), KEY8
_vbroadcast128 2*16(KEY), KEY9
_vbroadcast128 3*16(KEY), KEY10
// Note: if it's AES-128 or AES-192, the last several round keys won't
// be used. We do the loads anyway to save a conditional jump.
_vbroadcast128 4*16(KEY), KEY11
_vbroadcast128 5*16(KEY), KEY12
_vbroadcast128 6*16(KEY), KEY13
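As a sanity check of the offsets described in the comment above, here is a small self-contained C model (an editor's sketch, not part of the patch; rebased_offset() is a hypothetical helper) of the effect of "lea OFFS-16(KEY, KEYLEN64, 4), KEY": after the rebase, round key i sits at i*16 - (4*keylen - 16) bytes from KEY, every such offset fits in a signed byte, and the last round key is always at 7*16(KEY):

#include <assert.h>
#include <stdio.h>

/* Offset of round key i from the rebased KEY pointer for an AES key of
 * keylen bytes; the lea above adds 4*keylen - 16 to the original base. */
static int rebased_offset(int keylen, int i)
{
	return i * 16 - (4 * keylen - 16);
}

int main(void)
{
	static const int keylens[] = { 16, 24, 32 };

	for (int k = 0; k < 3; k++) {
		int keylen = keylens[k];
		int nrounds = keylen / 4 + 6;	/* 10, 12, or 14 */

		for (int i = 1; i <= nrounds; i++) {
			int off = rebased_offset(keylen, i);

			/* Matches the claimed range [-96, 112]. */
			assert(off >= -96 && off <= 112);
		}
		/* The last round key always lands at 7*16(KEY). */
		assert(rebased_offset(keylen, nrounds) == 7 * 16);
		printf("AES-%d: round keys at offsets %d..112\n",
		       keylen * 8, rebased_offset(keylen, 1));
	}
	return 0;
}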
@@ -466,22 +507,22 @@

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks. The round key is loaded from the
// appropriate register or memory location for round \i. In addition, does step
// \i of the computation of the next set of tweaks. May clobber V4.
// appropriate register or memory location for round \i. In addition, does two
// steps of the computation of the next set of tweaks. May clobber V4.
.macro _vaes_4x enc, last, i
.if USE_AVX10
_tweak_step (2*(\i-1))
_tweak_step (2*(\i-5))
_vaes \enc, \last, KEY\i, V0
_vaes \enc, \last, KEY\i, V1
_tweak_step (2*(\i-1) + 1)
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, KEY\i, V2
_vaes \enc, \last, KEY\i, V3
.else
_vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-1))
_tweak_step (2*(\i-5))
_vaes \enc, \last, V4, V0
_vaes \enc, \last, V4, V1
_tweak_step (2*(\i-1) + 1)
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, V4, V2
_vaes \enc, \last, V4, V3
.endif
@@ -493,32 +534,25 @@
// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
.macro _aes_crypt enc, xmm_suffix, tweak, data
_xor3 KEY0\xmm_suffix, \tweak, \data
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vaes_1x \enc, 0, 1, \xmm_suffix, \data
_vaes_1x \enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
_vaes_1x \enc, 0, 3, \xmm_suffix, \data
_vaes_1x \enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
_vaes_1x \enc, 0, 5, \xmm_suffix, \data
_vaes_1x \enc, 0, 6, \xmm_suffix, \data
_vaes_1x \enc, 0, 7, \xmm_suffix, \data
_vaes_1x \enc, 0, 8, \xmm_suffix, \data
_vaes_1x \enc, 0, 9, \xmm_suffix, \data
cmp $24, KEYLEN
jle .Laes_128_or_192\@
_vaes_1x \enc, 0, 10, \xmm_suffix, \data
_vaes_1x \enc, 0, 11, \xmm_suffix, \data
_vaes_1x \enc, 0, 12, \xmm_suffix, \data
_vaes_1x \enc, 0, 13, \xmm_suffix, \data
_vaes_1x \enc, 1, 14, \xmm_suffix, \data
jmp .Laes_done\@
.Laes_128_or_192\@:
je .Laes_192\@
_vaes_1x \enc, 1, 10, \xmm_suffix, \data
jmp .Laes_done\@
.Laes_192\@:
_vaes_1x \enc, 0, 10, \xmm_suffix, \data
_vaes_1x \enc, 0, 11, \xmm_suffix, \data
_vaes_1x \enc, 1, 12, \xmm_suffix, \data
.Laes_done\@:
_vpxor \tweak, \data, \data
.endm
@@ -528,16 +562,7 @@
// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN

// Advance KEY to point to the 7th encryption round key (if encrypting)
// or the 7th decryption round key (if decrypting). This makes the
// offset to any round key be in the range [-112, 112], fitting in a
// signed byte. This shortens VEX-encoded instructions that access the
// 8th and later round keys which otherwise would need 4-byte offsets.
.if \enc
add $7*16, KEY
.else
add $(15+7)*16, KEY

.if !\enc
// When decrypting a message whose length isn't a multiple of the AES
// block length, exclude the last full block from the main loop by
// subtracting 16 from LEN. This is needed because ciphertext stealing
@@ -548,8 +573,8 @@
.Lxts_init\@:
.endif

// Cache as many round keys as possible.
_load_round_keys
// Setup the pointer to the round keys and cache as many as possible.
_setup_round_keys \enc

// Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks
@@ -560,7 +585,7 @@
.Lmain_loop\@:
// This is the main loop, en/decrypting 4*VL bytes per iteration.

// XOR each source block with its tweak and the first round key.
// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
vmovdqu8 0*VL(SRC), V0
vmovdqu8 1*VL(SRC), V1
@@ -580,27 +605,27 @@
vpxor TWEAK2, V2, V2
vpxor TWEAK3, V3, V3
.endif
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
// Do all the AES rounds on the data blocks, interleaved with
// the computation of the next set of tweaks.
_vaes_4x \enc, 0, 1
_vaes_4x \enc, 0, 2
.Laes192\@:
_vaes_4x \enc, 0, 3
_vaes_4x \enc, 0, 4
.Laes128\@:
_vaes_4x \enc, 0, 5
_vaes_4x \enc, 0, 6
_vaes_4x \enc, 0, 7
_vaes_4x \enc, 0, 8
_vaes_4x \enc, 0, 9
// Try to optimize for AES-256 by keeping the code for AES-128 and
// AES-192 out-of-line.
cmp $24, KEYLEN
jle .Lencrypt_4x_aes_128_or_192\@
_vaes_4x \enc, 0, 10
_vaes_4x \enc, 0, 11
_vaes_4x \enc, 0, 12
_vaes_4x \enc, 0, 13
_vaes_4x \enc, 1, 14
.Lencrypt_4x_done\@:

// XOR in the tweaks again.
_vpxor TWEAK0, V0, V0
@@ -678,17 +703,6 @@
jnz .Lcts\@
jmp .Ldone\@

// Out-of-line handling of AES-128 and AES-192
.Lencrypt_4x_aes_128_or_192\@:
jz .Lencrypt_4x_aes_192\@
_vaes_4x \enc, 1, 10
jmp .Lencrypt_4x_done\@
.Lencrypt_4x_aes_192\@:
_vaes_4x \enc, 0, 10
_vaes_4x \enc, 0, 11
_vaes_4x \enc, 1, 12
jmp .Lencrypt_4x_done\@

.if !\enc
.Lneed_cts_dec\@:
sub $16, LEN
@@ -764,38 +778,30 @@
// u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
vmovdqu (%rsi), %xmm0
add $7*16, %rdi
vpxor -7*16(%rdi), %xmm0, %xmm0
vpxor (%rdi), %xmm0, %xmm0
movl 480(%rdi), %eax // AES key length
lea -16(%rdi, %rax, 4), %rdi
cmp $24, %eax
jl .Lencrypt_iv_aes128
je .Lencrypt_iv_aes192
vaesenc -6*16(%rdi), %xmm0, %xmm0
vaesenc -5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
vaesenc -4*16(%rdi), %xmm0, %xmm0
vaesenc -3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
vaesenc -2*16(%rdi), %xmm0, %xmm0
vaesenc -1*16(%rdi), %xmm0, %xmm0
vaesenc 0*16(%rdi), %xmm0, %xmm0
vaesenc 1*16(%rdi), %xmm0, %xmm0
vaesenc 2*16(%rdi), %xmm0, %xmm0
cmpl $24, 480-(7*16)(%rdi)
jle .Lencrypt_iv_aes_128_or_192
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenc 5*16(%rdi), %xmm0, %xmm0
vaesenc 6*16(%rdi), %xmm0, %xmm0
vaesenclast 7*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_done:
vmovdqu %xmm0, (%rsi)
RET

// Out-of-line handling of AES-128 and AES-192
.Lencrypt_iv_aes_128_or_192:
jz .Lencrypt_iv_aes_192
vaesenclast 3*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
.Lencrypt_iv_aes_192:
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenclast 5*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,