crypto: arm64/aes-ccm - Cache round keys and unroll AES loops

The CCM code as originally written attempted to use as few NEON
registers as possible, to avoid having to eagerly preserve/restore the
entire NEON register file at every call to kernel_neon_begin/end. At
that time, this API took a number of NEON registers as a parameter, and
only preserved that many registers.

Today, the NEON register file is restored lazily, and the old API is
long gone. This means we can use as many NEON registers as we can make
meaningful use of, which means in the AES case that we can keep all
round keys in registers rather than reloading each of them for each AES
block processed.
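
As a rough illustration of the call pattern this relies on (a sketch
only, not the actual glue code: the ccm_mac_final() wrapper name is
made up here, while the ce_aes_ccm_final() prototype is the one shown
in the diff below):

#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/neon.h>

/* asm routine from aes-ce-ccm-core.S; with lazy NEON state handling it
 * is free to clobber the whole register file, including the cached
 * round keys kept in v10-v21 */
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[],
				 u8 const rk[], u32 rounds);

static void ccm_mac_final(u8 mac[], u8 const ctr[],
			  u8 const rk[], u32 rounds)
{
	kernel_neon_begin();	/* no per-register accounting any more */
	ce_aes_ccm_final(mac, ctr, rk, rounds);
	kernel_neon_end();
}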

On Cortex-A53, this results in a speedup of more than 50% (from 4
cycles per byte to 2.6 cycles per byte).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Ard Biesheuvel 2024-01-18 18:06:35 +01:00 committed by Herbert Xu
parent 948ffc66e5
commit 565def1542

@@ -14,40 +14,46 @@
.text
.arch armv8-a+crypto
.macro load_round_keys, rk, nr, tmp
sub w\tmp, \nr, #10
add \tmp, \rk, w\tmp, sxtw #4
ld1 {v10.4s-v13.4s}, [\rk]
ld1 {v14.4s-v17.4s}, [\tmp], #64
ld1 {v18.4s-v21.4s}, [\tmp], #64
ld1 {v3.4s-v5.4s}, [\tmp]
.endm
.macro dround, va, vb, vk
aese \va\().16b, \vk\().16b
aesmc \va\().16b, \va\().16b
aese \vb\().16b, \vk\().16b
aesmc \vb\().16b, \vb\().16b
.endm
.macro aes_encrypt, va, vb, nr
tbz \nr, #2, .L\@
dround \va, \vb, v10
dround \va, \vb, v11
tbz \nr, #1, .L\@
dround \va, \vb, v12
dround \va, \vb, v13
.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3
dround \va, \vb, \v
.endr
aese \va\().16b, v4.16b
aese \vb\().16b, v4.16b
.endm
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
sub w3, w3, #2 /* modified # of rounds */
ld1 {v1.16b}, [x1] /* load 1st ctriv */
bmi 0f
bne 3f
mov v5.16b, v3.16b
b 2f
0: mov v4.16b, v3.16b
1: ld1 {v5.4s}, [x2], #16 /* load next round key */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
2: ld1 {v3.4s}, [x2], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
3: ld1 {v4.4s}, [x2], #16 /* load next round key */
subs w3, w3, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b
bpl 1b
aese v0.16b, v4.16b
aese v1.16b, v4.16b
aes_encrypt v0, v1, w3
/* final round key cancels out */
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
@@ -55,6 +61,8 @@ SYM_FUNC_START(ce_aes_ccm_final)
SYM_FUNC_END(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
load_round_keys x3, w4, x10
cbz x2, 5f
ldr x8, [x6, #8] /* load lower ctr */
ld1 {v0.16b}, [x5] /* load mac */
@@ -64,37 +72,10 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
prfm pldl1strm, [x1]
add x8, x8, #1
rev x9, x8
cmp w4, #12 /* which key size? */
sub w7, w4, #2 /* get modified # of rounds */
ins v1.d[1], x9 /* no carry in lower ctr */
ld1 {v3.4s}, [x3] /* load first round key */
add x10, x3, #16
bmi 1f
bne 4f
mov v5.16b, v3.16b
b 3f
1: mov v4.16b, v3.16b
ld1 {v5.4s}, [x10], #16 /* load 2nd round key */
2: /* inner loop: 3 rounds, 2x interleaved */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
3: ld1 {v3.4s}, [x10], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
4: ld1 {v4.4s}, [x10], #16 /* load next round key */
subs w7, w7, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b
ld1 {v5.4s}, [x10], #16 /* load next round key */
bpl 2b
aese v0.16b, v4.16b
aese v1.16b, v4.16b
aes_encrypt v0, v1, w4
subs w2, w2, #16
bmi 6f /* partial block? */
ld1 {v2.16b}, [x1], #16 /* load next input block */