crypto: arm64/aes-ccm - Cache round keys and unroll AES loops

The CCM code as originally written attempted to use as few NEON
registers as possible, to avoid having to eagerly preserve/restore the
entire NEON register file at every call to kernel_neon_begin/end. At
that time, this API took a number of NEON registers as a parameter, and
only preserved that many registers.

Today, the NEON register file is restored lazily, and the old API is
long gone. This means we can use as many NEON registers as we can make
meaningful use of, which means in the AES case that we can keep all
round keys in registers rather than reloading each of them for each AES
block processed.
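
As a rough illustration of the call pattern this relies on (a sketch
only, not the actual glue code: the ccm_mac_final() wrapper name is
made up here, while the ce_aes_ccm_final() prototype is the one shown
in the diff below):

#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/neon.h>

/* asm routine from aes-ce-ccm-core.S; with lazy NEON state handling it
 * is free to clobber the whole register file, including the cached
 * round keys kept in v10-v21 */
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[],
				 u8 const rk[], u32 rounds);

static void ccm_mac_final(u8 mac[], u8 const ctr[],
			  u8 const rk[], u32 rounds)
{
	kernel_neon_begin();	/* no per-register accounting any more */
	ce_aes_ccm_final(mac, ctr, rk, rounds);
	kernel_neon_end();
}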

On Cortex-A53, this results in a speedup of more than 50% (from 4
cycles per byte to 2.6 cycles per byte).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Ard Biesheuvel 2024-01-18 18:06:35 +01:00 committed by Herbert Xu
parent 948ffc66e5
commit 565def1542

@@ -14,40 +14,46 @@
.text
.arch armv8-a+crypto
.macro load_round_keys, rk, nr, tmp
sub w\tmp, \nr, #10
add \tmp, \rk, w\tmp, sxtw #4
ld1 {v10.4s-v13.4s}, [\rk]
ld1 {v14.4s-v17.4s}, [\tmp], #64
ld1 {v18.4s-v21.4s}, [\tmp], #64
ld1 {v3.4s-v5.4s}, [\tmp]
.endm
.macro dround, va, vb, vk
aese \va\().16b, \vk\().16b
aesmc \va\().16b, \va\().16b
aese \vb\().16b, \vk\().16b
aesmc \vb\().16b, \vb\().16b
.endm
.macro aes_encrypt, va, vb, nr
tbz \nr, #2, .L\@
dround \va, \vb, v10
dround \va, \vb, v11
tbz \nr, #1, .L\@
dround \va, \vb, v12
dround \va, \vb, v13
.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3
dround \va, \vb, \v
.endr
aese \va\().16b, v4.16b
aese \vb\().16b, v4.16b
.endm
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
sub w3, w3, #2 /* modified # of rounds */
ld1 {v1.16b}, [x1] /* load 1st ctriv */
bmi 0f
bne 3f
mov v5.16b, v3.16b
b 2f
0: mov v4.16b, v3.16b
1: ld1 {v5.4s}, [x2], #16 /* load next round key */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
2: ld1 {v3.4s}, [x2], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
3: ld1 {v4.4s}, [x2], #16 /* load next round key */
subs w3, w3, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b
bpl 1b
aese v0.16b, v4.16b
aese v1.16b, v4.16b
aes_encrypt v0, v1, w3
/* final round key cancels out */
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
@@ -55,6 +61,8 @@ SYM_FUNC_START(ce_aes_ccm_final)
SYM_FUNC_END(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
load_round_keys x3, w4, x10
cbz x2, 5f
ldr x8, [x6, #8] /* load lower ctr */
ld1 {v0.16b}, [x5] /* load mac */
@@ -64,37 +72,10 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
prfm pldl1strm, [x1]
add x8, x8, #1
rev x9, x8
cmp w4, #12 /* which key size? */
sub w7, w4, #2 /* get modified # of rounds */
ins v1.d[1], x9 /* no carry in lower ctr */
ld1 {v3.4s}, [x3] /* load first round key */
add x10, x3, #16
bmi 1f
bne 4f
mov v5.16b, v3.16b
b 3f
1: mov v4.16b, v3.16b
ld1 {v5.4s}, [x10], #16 /* load 2nd round key */
2: /* inner loop: 3 rounds, 2x interleaved */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
3: ld1 {v3.4s}, [x10], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
4: ld1 {v4.4s}, [x10], #16 /* load next round key */
subs w7, w7, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b
ld1 {v5.4s}, [x10], #16 /* load next round key */
bpl 2b
aese v0.16b, v4.16b
aese v1.16b, v4.16b
aes_encrypt v0, v1, w4
subs w2, w2, #16
bmi 6f /* partial block? */
ld1 {v2.16b}, [x1], #16 /* load next input block */