f691d444f9
The C glue code already infers whether or not the current iteration is the final one, by comparing walk.nbytes with walk.total. This means we can easily inform the asm helpers of this as well, by conditionally passing a pointer to the original IV, which is used in the finalization of the MAC. This removes the need for a separate call into the asm code to perform the finalization. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
143 lines
3.8 KiB
ArmAsm
143 lines
3.8 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd.
 * Copyright (C) 2024 Google LLC
 *
 * Author: Ard Biesheuvel <ardb@kernel.org>
 */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/assembler.h>
|
|
|
|
/* Everything below is code; allow the Crypto Extension mnemonics (aese/aesmc). */
	.text
	.arch	armv8-a+crypto
|
|
|
|
/*
 * load_round_keys - preload the AES round keys into NEON registers
 *
 *   \rk  : 64-bit register holding the address of the round key array
 *   \nr  : 32-bit register holding the number of rounds
 *   \tmp : 64-bit scratch register, clobbered.  Its 32-bit view is formed by
 *          pasting 'w' in front of the name (e.g. wx10).
 *          NOTE(review): this relies on wxN register aliases defined outside
 *          this file - presumably by <asm/assembler.h>; confirm.
 *
 * Register contents on exit:
 *   v10-v13 : the four 16-byte entries starting at \rk
 *   v14-v21 : the eight 16-byte entries starting at \rk + 16 * (\nr - 10)
 *   v3-v5   : the three 16-byte entries that follow those eight
 */
.macro	load_round_keys, rk, nr, tmp
	sub	w\tmp, \nr, #10			/* rounds in excess of ten */
	add	\tmp, \rk, w\tmp, sxtw #4	/* 16 bytes of key per extra round */
	ld1	{v10.4s-v13.4s}, [\rk]
	ld1	{v14.4s-v17.4s}, [\tmp], #64
	ld1	{v18.4s-v21.4s}, [\tmp], #64
	ld1	{v3.4s-v5.4s}, [\tmp]
.endm
|
|
|
|
/*
 * dround - apply one AESE + AESMC step to two AES states with the same key
 *
 *   \va, \vb : names of the vector registers holding the two states
 *   \vk      : name of the vector register holding the round key
 */
.macro	dround, va, vb, vk
	/* first state */
	aese	\va\().16b, \vk\().16b
	aesmc	\va\().16b, \va\().16b
	/* second state */
	aese	\vb\().16b, \vk\().16b
	aesmc	\vb\().16b, \vb\().16b
.endm
|
|
|
|
/*
 * aes_encrypt - run two AES states through the rounds in lockstep
 *
 *   \va, \vb : names of the vector registers holding the two states
 *   \nr      : 32-bit register holding the number of rounds
 *
 * Uses the key layout set up by load_round_keys.  Bits 2 and 1 of \nr pick
 * how many of the leading keys in v10-v13 are applied: none when bit 2 is
 * clear, v10/v11 when only bit 2 is set, all four when both are set
 * (presumably \nr is 10, 12 or 14 - confirm against callers).  The nine keys
 * in v14-v21 and v3 are always applied, followed by a bare AESE with v4.
 *
 * The XOR with the last round key (v5) is NOT done here; the code that
 * invokes this macro folds v5 in itself.
 */
.macro	aes_encrypt, va, vb, nr
	tbz	\nr, #2, .Lrounds\@		/* bit 2 clear: skip v10-v13 */
	dround	\va, \vb, v10
	dround	\va, \vb, v11
	tbz	\nr, #1, .Lrounds\@		/* bit 1 clear: skip v12/v13 */
	dround	\va, \vb, v12
	dround	\va, \vb, v13
.Lrounds\@:
	dround	\va, \vb, v14
	dround	\va, \vb, v15
	dround	\va, \vb, v16
	dround	\va, \vb, v17
	dround	\va, \vb, v18
	dround	\va, \vb, v19
	dround	\va, \vb, v20
	dround	\va, \vb, v21
	dround	\va, \vb, v3
	aese	\va\().16b, v4.16b		/* last AESE has no AESMC */
	aese	\vb\().16b, v4.16b
.endm
|
|
|
|
/*
 * aes_ccm_do_crypt - code shared by ce_aes_ccm_encrypt and ce_aes_ccm_decrypt
 *
 *   \enc : 1 selects the encrypt flavour, any other value the decrypt one
 *
 * Expects the arguments of the C prototypes in their argument registers:
 *   x0 = out, x1 = in, w2 = cbytes, x3 = rk, w4 = rounds,
 *   x5 = mac, x6 = ctr, x7 = final_iv (NULL to skip MAC finalization)
 *
 * Each pass of the loop increments the low 64 bits of the counter, runs the
 * MAC (v0) and the counter block (v1) through AES side by side, and handles
 * 16 bytes of input.  When fewer than 16 bytes remain, control passes to
 * ce_aes_ccm_crypt_tail.  When cbytes is zero, control passes straight to
 * ce_aes_ccm_final, which loads through x7 - so final_iv must not be NULL in
 * that case.
 *
 * cbytes is a u32, so only w2 is tested: the procedure call standard leaves
 * bits [63:32] of x2 unspecified for a 32-bit argument.
 *
 * Clobbers x8-x10, v0-v6, v10-v21 and the condition flags, in addition to
 * whatever the tail/final code it branches to touches.
 * NOTE(review): v10-v15 are callee-saved (low halves) under AAPCS64 and are
 * not preserved - presumably acceptable because callers run this inside a
 * kernel-mode NEON section; confirm.
 */
.macro	aes_ccm_do_crypt,enc
	load_round_keys	x3, w4, x10

	ld1	{v0.16b}, [x5]			/* load mac */
	cbz	w2, ce_aes_ccm_final		/* no data: only finalize the mac */
	ldr	x8, [x6, #8]			/* load lower ctr */
CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
0:	/* outer loop */
	ld1	{v1.8b}, [x6]			/* load upper ctr */
	prfm	pldl1strm, [x1]
	add	x8, x8, #1
	rev	x9, x8
	ins	v1.d[1], x9			/* no carry in lower ctr */

	aes_encrypt	v0, v1, w4

	subs	w2, w2, #16			/* flags stay live until bne below */
	bmi	ce_aes_ccm_crypt_tail		/* fewer than 16 bytes left */
	ld1	{v2.16b}, [x1], #16		/* load next input block */
	.if	\enc == 1
	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
	eor	v6.16b, v1.16b, v2.16b		/* xor with crypted ctr */
	.else
	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
	eor	v6.16b, v2.16b, v5.16b		/* final round enc */
	.endif
	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
	st1	{v6.16b}, [x0], #16		/* write output block */
	bne	0b
CPU_LE(	rev	x8, x8			)
	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
	cbnz	x7, ce_aes_ccm_final		/* final_iv given: finish the mac */
	st1	{v0.16b}, [x5]			/* store mac */
	ret
.endm
|
|
|
|
/*
 * ce_aes_ccm_crypt_tail - process a trailing partial block
 * ce_aes_ccm_final      - encrypt/decrypt the MAC with the first counter block
 *
 * Reached by a branch from aes_ccm_do_crypt, not by a call.  On entry:
 *   w2     = (bytes remaining) - 16, i.e. negative
 *   x0, x1 = output / input pointers at the start of the partial block
 *   v0, v1 = MAC and counter block after aes_encrypt (last round key pending)
 *   v5     = last round key
 *   v6     = output block written by the preceding loop pass
 *   v22    = all ones for encryption, zero for decryption
 *   x4-x7  = rounds, mac, ctr and final_iv as passed to the entry point
 *
 * The partial block is handled as one full 16-byte load and store ending on
 * the last data byte, so both pointers are moved back by 16 - n bytes.  The
 * vectors taken from around .Lpermute shift the keystream to the end of the
 * register, re-insert the tail of the previous output block in front of the
 * new bytes, and move the plaintext to the start of a zero-padded block for
 * the MAC.
 *
 * NOTE(review): if no full block preceded the tail, v6 was not written by
 * this invocation, and the bytes stored in front of the tail are whatever v6
 * held - presumably callers supply scratch space in that case; confirm.
 *
 * This path does not write the counter back to ctr[].
 */
SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail)
	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
	eor	v1.16b, v1.16b, v5.16b		/* final round enc */

	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
	add	x0, x0, w2, sxtw		/* rewind the output pointer */

	adr_l	x8, .Lpermute			/* load permute vectors */
	add	x9, x8, w2, sxtw		/* x9 = .Lpermute - (16 - n) */
	sub	x8, x8, w2, sxtw		/* x8 = .Lpermute + (16 - n) */
	ld1	{v7.16b-v8.16b}, [x9]
	ld1	{v9.16b}, [x8]

	ld1	{v2.16b}, [x1]			/* load a full block of input */
	tbl	v1.16b, {v1.16b}, v7.16b	/* move keystream to end of register */
	eor	v7.16b, v2.16b, v1.16b		/* encrypt partial input block */
	bif	v2.16b, v7.16b, v22.16b		/* select plaintext */
	tbx	v7.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
	tbl	v2.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v2 */
	eor	v0.16b, v0.16b, v2.16b		/* fold plaintext into mac */

	st1	{v7.16b}, [x0]			/* store output block */
	cbz	x7, .Lccm_store_mac		/* no final_iv: mac is not finalized */

SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL)
	ld1	{v1.16b}, [x7]			/* load 1st ctriv */

	aes_encrypt	v0, v1, w4

	/* final round key cancels out */
	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
.Lccm_store_mac:
	st1	{v0.16b}, [x5]			/* store result */
	ret
SYM_FUNC_END(ce_aes_ccm_crypt_tail)
|
|
|
|
/*
 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
 *			   u8 const rk[], u32 rounds, u8 mac[],
 *			   u8 ctr[], u8 const final_iv[]);
 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
 *			   u8 const rk[], u32 rounds, u8 mac[],
 *			   u8 ctr[], u8 const final_iv[]);
 *
 * Both entry points set up v22 and then expand aes_ccm_do_crypt, which
 * contains the whole function body including its returns.
 */
SYM_FUNC_START(ce_aes_ccm_encrypt)
	/*
	 * v22 is the BIF mask used by the tail code: all ones keeps the
	 * loaded input bytes as the plaintext that is folded into the MAC.
	 */
	movi	v22.16b, #0xff
	aes_ccm_do_crypt	1
SYM_FUNC_END(ce_aes_ccm_encrypt)
|
|
|
|
/*
 * Decrypt entry point; takes the same arguments as ce_aes_ccm_encrypt.
 * The function body, including its returns, comes from aes_ccm_do_crypt.
 */
SYM_FUNC_START(ce_aes_ccm_decrypt)
	/*
	 * v22 is the BIF mask used by the tail code: zero makes it take the
	 * XOR of input and keystream as the plaintext folded into the MAC.
	 */
	movi	v22.16b, #0
	aes_ccm_do_crypt	0
SYM_FUNC_END(ce_aes_ccm_decrypt)
|
|
|
|
/*
 * Permute table used by ce_aes_ccm_crypt_tail: the identity byte indexes
 * 0..15 with 15 bytes of 0xff on either side.  16-byte loads starting up to
 * 15 bytes before or after .Lpermute therefore stay inside the table and
 * yield a shifted index vector padded with 0xff, an out-of-range index for
 * TBL (which writes zero) and TBX (which leaves the destination byte as is).
 * The 64-byte alignment covers the whole 46-byte table.
 */
	.section ".rodata", "a"
	.p2align	6
	.fill	15, 1, 0xff
.Lpermute:
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill	15, 1, 0xff
|