linux/arch/riscv/crypto/aes-riscv64-zvkned.S

/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')

#include <linux/linkage.h>

.text
.option arch, +zvkned

#include "aes-macros.S"

// Register names for the function arguments.  They follow the parameter
// order of the C prototypes documented above each function in this file:
// key, in, out, len, iv.  (The sixth argument of aes_cbc_cts_crypt_zvkned,
// |enc|, is read directly from a5.)
#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define IVP		a4

// Emit the body of a single-block AES operation for one key length.
//
//	enc:	forwarded to aes_crypt; the callers in this file pass 1 for
//		encryption and 0 for decryption
//	keylen:	AES key length in bits (128, 192, or 256)
//
// Loads the block at INP into v16, runs it through aes_crypt, stores the
// result to OUTP, and returns to the caller.  No vsetvli is issued here, so
// the vector configuration must already be in place when this code runs
// (presumably set up by aes_begin in aes-macros.S -- not visible in this file).
.macro	__aes_crypt_zvkned	enc, keylen
	vle32.v		v16, (INP)
	aes_crypt	v16, \enc, \keylen
	vse32.v		v16, (OUTP)
	ret
.endm

// Emit a complete single-block AES function body with one specialization per
// key length.
//
//	enc:	1 for encryption, 0 for decryption (forwarded unchanged)
//
// aes_begin is defined in aes-macros.S (not shown here).  Judging from how it
// is used, it is expected to branch to the first label for 128-bit keys, to
// the second label for 192-bit keys, and to fall through for 256-bit keys.
// Each specialization ends with its own "ret", so execution never runs from
// one key length's code into the next.
.macro	aes_crypt_zvkned	enc
	aes_begin	KEYP, 128f, 192f
	__aes_crypt_zvkned	\enc, 256
128:
	__aes_crypt_zvkned	\enc, 128
192:
	__aes_crypt_zvkned	\enc, 192
.endm

// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			   const u8 in[16], u8 out[16]);
//
// Single-block AES: reads the 16-byte block at |in| and writes the processed
// block to |out|.  Passes enc=1 to select the encryption direction.
SYM_FUNC_START(aes_encrypt_zvkned)
	aes_crypt_zvkned	1
SYM_FUNC_END(aes_encrypt_zvkned)

// Same prototype and calling convention as the encryption function.
// Passes enc=0 to select the decryption direction.
SYM_FUNC_START(aes_decrypt_zvkned)
	aes_crypt_zvkned	0
SYM_FUNC_END(aes_decrypt_zvkned)

// Emit the AES-ECB loop for one key length.
//
//	enc:	forwarded to aes_crypt (1 = encrypt, 0 = decrypt in this file)
//	keylen:	AES key length in bits (128, 192, or 256)
//
// Processes LEN bytes from INP to OUTP.  Each iteration handles however many
// 32-bit words vsetvli grants for an 8-register group (LMUL=8) starting at
// v16, so several blocks can be processed per aes_crypt invocation.  INP and
// OUTP are advanced past the processed data; t0 and t1 are used as scratch
// (in addition to whatever aes_crypt itself uses -- not visible here).
.macro	__aes_ecb_crypt	enc, keylen
	srli		t0, LEN, 2
	// t0 is the remaining length in 32-bit words.  It's a multiple of 4.
1:
	// t1 = number of words handled in this iteration (the granted vl).
	vsetvli		t1, t0, e32, m8, ta, ma
	sub		t0, t0, t1	// Subtract number of words processed
	slli		t1, t1, 2	// Words to bytes
	vle32.v		v16, (INP)
	aes_crypt	v16, \enc, \keylen
	vse32.v		v16, (OUTP)
	add		INP, INP, t1
	add		OUTP, OUTP, t1
	bnez		t0, 1b

	ret
.endm

// Emit a complete AES-ECB function body with one specialization per key
// length.
//
//	enc:	1 for encryption, 0 for decryption (forwarded unchanged)
//
// The 256-bit code is on the fall-through path after aes_begin; the 128-bit
// and 192-bit variants sit at the local labels handed to aes_begin (defined
// in aes-macros.S, not shown here).  Every variant ends in "ret".
.macro	aes_ecb_crypt	enc
	aes_begin	KEYP, 128f, 192f
	__aes_ecb_crypt	\enc, 256
128:
	__aes_ecb_crypt	\enc, 128
192:
	__aes_ecb_crypt	\enc, 192
.endm

// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
//
// Processes |len| bytes from |in| to |out| in ECB mode, i.e. every block is
// transformed independently of the others.
SYM_FUNC_START(aes_ecb_encrypt_zvkned)
	aes_ecb_crypt	1
SYM_FUNC_END(aes_ecb_encrypt_zvkned)

// Same prototype and calling convention as the encryption function.
// Passes enc=0 to select the decryption direction.
SYM_FUNC_START(aes_ecb_decrypt_zvkned)
	aes_ecb_crypt	0
SYM_FUNC_END(aes_ecb_decrypt_zvkned)

// Emit the AES-CBC encryption loop for one key length.
//
//	keylen:	AES key length in bits (128, 192, or 256)
//
// Blocks are processed strictly one at a time, because each block's input is
// XORed with the previous ciphertext block before it is encrypted.  v16
// carries the chaining value (the IV at first, then the latest ciphertext
// block) and v17 holds the current plaintext block.  LEN is counted down by
// 16 per block, and INP/OUTP are advanced.  When the loop ends, the final
// ciphertext block is written back to IVP so that a later call can continue
// the chain.
//
// No vsetvli is issued here, so this code relies on the vector configuration
// already in effect (presumably 4 x e32 set up by aes_begin -- not visible in
// this file).
.macro	aes_cbc_encrypt	keylen
	vle32.v		v16, (IVP)	// Load IV
1:
	vle32.v		v17, (INP)	// Load plaintext block
	vxor.vv		v16, v16, v17	// XOR with IV or prev ciphertext block
	aes_encrypt	v16, \keylen	// Encrypt
	vse32.v		v16, (OUTP)	// Store ciphertext block
	addi		INP, INP, 16
	addi		OUTP, OUTP, 16
	addi		LEN, LEN, -16
	bnez		LEN, 1b

	vse32.v		v16, (IVP)	// Store next IV
	ret
.endm

// Emit the AES-CBC decryption loop for one key length.
//
//	keylen:	AES key length in bits (128, 192, or 256)
//
// Unlike encryption, several blocks are decrypted per iteration: each
// iteration handles as many 32-bit words as vsetvli grants for a 4-register
// group (LMUL=4).  Register groups used:
//
//	v16:	previous ciphertext blocks, i.e. the values to XOR with
//	v20:	ciphertext blocks of this iteration, decrypted in place
//	v24:	copy of the last ciphertext block of this iteration
//
// LEN is converted to a count of 32-bit words and counted down; INP and OUTP
// are advanced; t0 and t1 are scratch.  The last ciphertext block is written
// back to IVP on exit.
//
// The initial IV load happens before the first vsetvli, so it relies on the
// vector configuration already in effect (presumably 4 x e32 set up by
// aes_begin -- not visible in this file).
.macro	aes_cbc_decrypt	keylen
	srli		LEN, LEN, 2	// Convert LEN from bytes to words
	vle32.v		v16, (IVP)	// Load IV
1:
	vsetvli		t0, LEN, e32, m4, ta, ma
	vle32.v		v20, (INP)	// Load ciphertext blocks
	// Elements 0-3 of v16 (IV or last block of the previous iteration) are
	// left in place; the ciphertext is inserted starting at element 4, so
	// element group i of v16 is the block preceding element group i of v20.
	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
	addi		t1, t0, -4	// Word index of the last block in v20
	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
	aes_decrypt	v20, \keylen	// Decrypt the blocks
	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
	vse32.v		v20, (OUTP)	// Store plaintext blocks
	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
	slli		t1, t0, 2	// Words to bytes
	add		INP, INP, t1
	add		OUTP, OUTP, t1
	sub		LEN, LEN, t0
	bnez		LEN, 1b

	// Switch back to a single 4-word register so that exactly one block
	// (16 bytes) is written to the IV buffer.
	vsetivli	zero, 4, e32, m1, ta, ma
	vse32.v		v16, (IVP)	// Store next IV
	ret
.endm

// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len, u8 iv[16]);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
//
// |iv| is both an input and an output: it supplies the initial chaining value
// and is overwritten with the last ciphertext block before returning.
//
// The 256-bit code is on the fall-through path after aes_begin; the 128-bit
// and 192-bit variants sit at the local labels handed to aes_begin (defined
// in aes-macros.S, not shown here).  Every variant ends in "ret".
SYM_FUNC_START(aes_cbc_encrypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_encrypt	256
128:
	aes_cbc_encrypt	128
192:
	aes_cbc_encrypt	192
SYM_FUNC_END(aes_cbc_encrypt_zvkned)

// Same prototype and calling convention as the encryption function.
// |iv| is likewise overwritten with the last ciphertext block on return.
// One specialization per key length, dispatched by aes_begin.
SYM_FUNC_START(aes_cbc_decrypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_decrypt	256
128:
	aes_cbc_decrypt	128
192:
	aes_cbc_decrypt	192
SYM_FUNC_END(aes_cbc_decrypt_zvkned)

// Emit the AES-CBC-CTS encryption path for one key length.
//
//	keylen:	AES key length in bits (128, 192, or 256)
//
// On entry v16 must already hold the IV (aes_cbc_cts_crypt loads it before
// expanding this macro).  LEN is the message length in bytes; the code always
// processes one full 16-byte block first, so LEN is expected to be >= 16.
// INP, OUTP and LEN are modified; t0 is scratch; v17 holds plaintext blocks.
// Ends in "ret".  Nothing is written back to the IV buffer.
.macro	aes_cbc_cts_encrypt	keylen

	// CBC-encrypt all blocks except the last.  But don't store the
	// second-to-last block to the output buffer yet, since it will be
	// handled specially in the ciphertext stealing step.  Exception: if the
	// message is single-block, still encrypt the last (and only) block.
	li		t0, 16
	j		2f		// Enter the loop past the deferred store
1:
	vse32.v		v16, (OUTP)	// Store ciphertext block
	addi		OUTP, OUTP, 16
2:
	vle32.v		v17, (INP)	// Load plaintext block
	vxor.vv		v16, v16, v17	// XOR with IV or prev ciphertext block
	aes_encrypt	v16, \keylen	// Encrypt
	addi		INP, INP, 16
	addi		LEN, LEN, -16
	bgt		LEN, t0, 1b	// Repeat if more than one block remains

	// Special case: if the message is a single block, just do CBC.
	beqz		LEN, .Lcts_encrypt_done\@

	// Encrypt the last two blocks using ciphertext stealing as follows:
	//	C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
	//	C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
	//
	// C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
	// plaintext block.  Block n, the last block, may be partial; its length
	// is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
	//
	// v16 already contains Encrypt(P[n-1] ^ C[n-2]).
	// INP points to P[n].  OUTP points to where C[n-1] should go.
	// To support in-place encryption, load P[n] before storing C[n].
	addi		t0, OUTP, 16	// Get pointer to where C[n] should go
	// Operate on LEN bytes only.  With the tail-undisturbed policy, the XOR
	// below leaves bytes LEN..15 of v16 unchanged, which is equivalent to
	// XORing with P[n] zero-padded to a full block.
	vsetvli		zero, LEN, e8, m1, tu, ma
	vle8.v		v17, (INP)	// Load P[n]
	vse8.v		v16, (t0)	// Store C[n]
	vxor.vv		v16, v16, v17	// v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
	vsetivli	zero, 4, e32, m1, ta, ma	// Back to one full block
	aes_encrypt	v16, \keylen
.Lcts_encrypt_done\@:
	vse32.v		v16, (OUTP)	// Store C[n-1] (or C[n] in single-block case)
	ret
.endm

#define LEN32		t4 // Length of remaining full blocks in 32-bit words
#define LEN_MOD16	t5 // Length of message in bytes mod 16

// Emit the AES-CBC-CTS decryption path for one key length.
//
//	keylen:	AES key length in bits (128, 192, or 256)
//
// On entry v16 must already hold the IV (aes_cbc_cts_crypt loads it before
// expanding this macro).  LEN is the message length in bytes and is expected
// to be >= 16, since the main loop needs at least one full block.
//
// Register usage:
//	t0, t1:		scratch
//	LEN32 (t4):	remaining full blocks, counted in 32-bit words
//	LEN_MOD16 (t5):	length of the trailing partial block (0 if none)
//	v16:		IV, then last ciphertext block of the latest set
//	v20:		ciphertext blocks of the current set, decrypted in place
//	v24:		previous ciphertext blocks / finished plaintext
//	v28:		C[n-2], kept for the ciphertext stealing fix-up
//
// INP and OUTP are advanced past the full blocks.  Ends in "ret".  Nothing is
// written back to the IV buffer.
.macro	aes_cbc_cts_decrypt	keylen
	andi		LEN32, LEN, ~15
	srli		LEN32, LEN32, 2
	andi		LEN_MOD16, LEN, 15

	// Save C[n-2] in v28 so that it's available later during the ciphertext
	// stealing step.  If there are fewer than three blocks, C[n-2] means
	// the IV, otherwise it means the third-to-last ciphertext block.
	//
	// LEN <= 32 means at most two blocks, so the IV is kept.  Otherwise,
	// (LEN - 33) rounded down to a multiple of 16 is the byte offset of
	// C[n-2] within the input.  It is loaded now, before the loop below gets
	// a chance to overwrite it when operating in place.
	vmv.v.v		v28, v16	// IV
	add		t0, LEN, -33
	bltz		t0, .Lcts_decrypt_loop\@
	andi		t0, t0, ~15
	add		t0, t0, INP
	vle32.v		v28, (t0)

	// CBC-decrypt all full blocks.  For the last full block, or the last 2
	// full blocks if the message is block-aligned, this doesn't write the
	// correct output blocks (unless the message is only a single block),
	// because it XORs the wrong values with the raw AES plaintexts.  But we
	// fix this after this loop without redoing the AES decryptions.  This
	// approach allows more of the AES decryptions to be parallelized.
.Lcts_decrypt_loop\@:
	vsetvli		t0, LEN32, e32, m4, ta, ma
	addi		t1, t0, -4	// Word index of the last block in this set
	vle32.v		v20, (INP)	// Load next set of ciphertext blocks
	vmv.v.v		v24, v16	// Get IV or last ciphertext block of prev set
	vslideup.vi	v24, v20, 4	// Setup prev ciphertext blocks
	vslidedown.vx	v16, v20, t1	// Save last ciphertext block of this set
	aes_decrypt	v20, \keylen	// Decrypt this set of blocks
	vxor.vv		v24, v24, v20	// XOR prev ciphertext blocks with decrypted blocks
	vse32.v		v24, (OUTP)	// Store this set of plaintext blocks
	sub		LEN32, LEN32, t0
	slli		t0, t0, 2	// Words to bytes
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	bnez		LEN32, .Lcts_decrypt_loop\@

	// Keep LMUL=4 so that the slide can reach into the whole v20 group, but
	// limit vl to a single block.  t1 still holds the word index of the
	// last block from the final loop iteration.
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	v20, v20, t1	// Extract raw plaintext of last full block
	addi		t0, OUTP, -16	// Get pointer to last full plaintext block
	bnez		LEN_MOD16, .Lcts_decrypt_non_block_aligned\@

	// Special case: if the message is a single block, just do CBC.
	li		t1, 16
	beq		LEN, t1, .Lcts_decrypt_done\@

	// Block-aligned message.  Just fix up the last 2 blocks.  We need:
	//
	//	P[n-1] = Decrypt(C[n]) ^ C[n-2]
	//	P[n] = Decrypt(C[n-1]) ^ C[n]
	//
	// We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
	// Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
	// is everything needed to fix the output without re-decrypting blocks.
	addi		t1, OUTP, -32	// Get pointer to where P[n-1] should go
	vxor.vv		v20, v20, v28	// Decrypt(C[n]) ^ C[n-2] == P[n-1]
	vle32.v		v24, (t1)	// Decrypt(C[n-1]) ^ C[n-2]
	vse32.v		v20, (t1)	// Store P[n-1]
	vxor.vv		v20, v24, v16	// Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
	j		.Lcts_decrypt_finish\@

.Lcts_decrypt_non_block_aligned\@:
	// Decrypt the last two blocks using ciphertext stealing as follows:
	//
	//	P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
	//	P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
	//
	// We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
	vmv.v.v		v16, v20	// v16 = Decrypt(C[n-1])
	// Operate on LEN_MOD16 bytes only.  The tail-undisturbed policy keeps
	// bytes LEN_MOD16..15 of v20 as they were, which yields the
	// concatenation described above.
	vsetvli		zero, LEN_MOD16, e8, m1, tu, ma
	vle8.v		v20, (INP)	// v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
	vxor.vv		v16, v16, v20	// v16 = Decrypt(C[n-1]) ^ C[n]
	vse8.v		v16, (OUTP)	// Store P[n]
	vsetivli	zero, 4, e32, m1, ta, ma	// Back to one full block
	aes_decrypt	v20, \keylen	// v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
.Lcts_decrypt_finish\@:
	vxor.vv		v20, v20, v28	// XOR with C[n-2]
	vse32.v		v20, (t0)	// Store last full plaintext block
.Lcts_decrypt_done\@:
	ret
.endm

// Emit both directions of AES-CBC-CTS for one key length.
//
//	keylen:	AES key length in bits (128, 192, or 256)
//
// Loads the IV into v16, then selects the direction at run time from a5,
// which carries the |enc| argument of aes_cbc_cts_crypt_zvkned: nonzero
// means encrypt, zero means decrypt.  The encryption code ends in "ret", so
// it never falls through into the decryption code that follows it.
//
// The IV load relies on the vector configuration already in effect
// (presumably 4 x e32 set up by aes_begin -- not visible in this file).
.macro	aes_cbc_cts_crypt	keylen
	vle32.v		v16, (IVP)	// Load IV
	beqz		a5, .Lcts_decrypt\@
	aes_cbc_cts_encrypt \keylen
.Lcts_decrypt\@:
	aes_cbc_cts_decrypt \keylen
.endm

// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
//			         const u8 *in, u8 *out, size_t len,
//				 const u8 iv[16], bool enc);
//
// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
// This is the variant that unconditionally swaps the last two blocks.
//
// |len| need not be a multiple of 16, but both directions process at least
// one full block, so |len| is expected to be >= 16.  Unlike the plain CBC
// functions, |iv| is only read and never updated.
//
// One specialization per key length, dispatched by aes_begin (defined in
// aes-macros.S, not shown here).
SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_cts_crypt 256
128:
	aes_cbc_cts_crypt 128
192:
	aes_cbc_cts_crypt 192
SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)
crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS} Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well as bare (single-block) AES, using the RISC-V vector crypto extensions. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using regular .S files instead of the so-called perlasm, using the assembler instead of bare '.inst', greatly reducing code duplication, supporting AES-192, and making the code use the same AES key structure as the C code. Co-developed-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Jerry Shih <jerry.shih@sifive.com> Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-01-21 17:19:15 -07:00			`/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */`
			`//`
			`// This file is dual-licensed, meaning that you can use it under your`
			`// choice of either of the following two licenses:`
			`//`
			`// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License 2.0 (the "License"). You can obtain`
			`// a copy in the file LICENSE in the source distribution or at`
			`// https://www.openssl.org/source/license.html`
			`//`
			`// or`
			`//`
			`// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>`
			`// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>`
			`// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>`
			`// Copyright 2024 Google LLC`
			`// All rights reserved.`
			`//`
			`// Redistribution and use in source and binary forms, with or without`
			`// modification, are permitted provided that the following conditions`
			`// are met:`
			`// 1. Redistributions of source code must retain the above copyright`
			`// notice, this list of conditions and the following disclaimer.`
			`// 2. Redistributions in binary form must reproduce the above copyright`
			`// notice, this list of conditions and the following disclaimer in the`
			`// documentation and/or other materials provided with the distribution.`
			`//`
			`// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			`// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT`
			`// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR`
			`// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT`
			`// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,`
			`// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT`
			`// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,`
			`// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY`
			`// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`

			`// The generated code of this file depends on the following RISC-V extensions:`
			`// - RV64I`
			`// - RISC-V Vector ('V') with VLEN >= 128`
			`// - RISC-V Vector AES block cipher extension ('Zvkned')`

			`#include <linux/linkage.h>`

			`.text`
			`.option arch, +zvkned`

			`#include "aes-macros.S"`

			`#define KEYP a0`
			`#define INP a1`
			`#define OUTP a2`
			`#define LEN a3`
			`#define IVP a4`

			`.macro __aes_crypt_zvkned enc, keylen`
			`vle32.v v16, (INP)`
			`aes_crypt v16, \enc, \keylen`
			`vse32.v v16, (OUTP)`
			`ret`
			`.endm`

			`.macro aes_crypt_zvkned enc`
			`aes_begin KEYP, 128f, 192f`
			`__aes_crypt_zvkned \enc, 256`
			`128:`
			`__aes_crypt_zvkned \enc, 128`
			`192:`
			`__aes_crypt_zvkned \enc, 192`
			`.endm`

			`// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,`
			`// const u8 in[16], u8 out[16]);`
			`SYM_FUNC_START(aes_encrypt_zvkned)`
			`aes_crypt_zvkned 1`
			`SYM_FUNC_END(aes_encrypt_zvkned)`

			`// Same prototype and calling convention as the encryption function`
			`SYM_FUNC_START(aes_decrypt_zvkned)`
			`aes_crypt_zvkned 0`
			`SYM_FUNC_END(aes_decrypt_zvkned)`

			`.macro __aes_ecb_crypt enc, keylen`
			`srli t0, LEN, 2`
			`// t0 is the remaining length in 32-bit words. It's a multiple of 4.`
			`1:`
			`vsetvli t1, t0, e32, m8, ta, ma`
			`sub t0, t0, t1 // Subtract number of words processed`
			`slli t1, t1, 2 // Words to bytes`
			`vle32.v v16, (INP)`
			`aes_crypt v16, \enc, \keylen`
			`vse32.v v16, (OUTP)`
			`add INP, INP, t1`
			`add OUTP, OUTP, t1`
			`bnez t0, 1b`

			`ret`
			`.endm`

			`.macro aes_ecb_crypt enc`
			`aes_begin KEYP, 128f, 192f`
			`__aes_ecb_crypt \enc, 256`
			`128:`
			`__aes_ecb_crypt \enc, 128`
			`192:`
			`__aes_ecb_crypt \enc, 192`
			`.endm`

			`// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,`
			`// const u8 in, u8 out, size_t len);`
			`//`
			`// \|len\| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).`
			`SYM_FUNC_START(aes_ecb_encrypt_zvkned)`
			`aes_ecb_crypt 1`
			`SYM_FUNC_END(aes_ecb_encrypt_zvkned)`

			`// Same prototype and calling convention as the encryption function`
			`SYM_FUNC_START(aes_ecb_decrypt_zvkned)`
			`aes_ecb_crypt 0`
			`SYM_FUNC_END(aes_ecb_decrypt_zvkned)`

			`.macro aes_cbc_encrypt keylen`
			`vle32.v v16, (IVP) // Load IV`
			`1:`
			`vle32.v v17, (INP) // Load plaintext block`
			`vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block`
			`aes_encrypt v16, \keylen // Encrypt`
			`vse32.v v16, (OUTP) // Store ciphertext block`
			`addi INP, INP, 16`
			`addi OUTP, OUTP, 16`
			`addi LEN, LEN, -16`
			`bnez LEN, 1b`

			`vse32.v v16, (IVP) // Store next IV`
			`ret`
			`.endm`

			`.macro aes_cbc_decrypt keylen`
crypto: riscv - parallelize AES-CBC decryption Since CBC decryption is parallelizable, make the RISC-V implementation of AES-CBC decryption process multiple blocks at a time, instead of processing the blocks one by one. This should improve performance. Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240208060851.154129-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-02-07 23:08:51 -07:00			`srli LEN, LEN, 2 // Convert LEN from bytes to words`
crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS} Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well as bare (single-block) AES, using the RISC-V vector crypto extensions. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using regular .S files instead of the so-called perlasm, using the assembler instead of bare '.inst', greatly reducing code duplication, supporting AES-192, and making the code use the same AES key structure as the C code. Co-developed-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Jerry Shih <jerry.shih@sifive.com> Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-01-21 17:19:15 -07:00			`vle32.v v16, (IVP) // Load IV`
			`1:`
crypto: riscv - parallelize AES-CBC decryption Since CBC decryption is parallelizable, make the RISC-V implementation of AES-CBC decryption process multiple blocks at a time, instead of processing the blocks one by one. This should improve performance. Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240208060851.154129-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-02-07 23:08:51 -07:00			`vsetvli t0, LEN, e32, m4, ta, ma`
			`vle32.v v20, (INP) // Load ciphertext blocks`
			`vslideup.vi v16, v20, 4 // Setup prev ciphertext blocks`
			`addi t1, t0, -4`
			`vslidedown.vx v24, v20, t1 // Save last ciphertext block`
			`aes_decrypt v20, \keylen // Decrypt the blocks`
			`vxor.vv v20, v20, v16 // XOR with prev ciphertext blocks`
			`vse32.v v20, (OUTP) // Store plaintext blocks`
			`vmv.v.v v16, v24 // Next "IV" is last ciphertext block`
			`slli t1, t0, 2 // Words to bytes`
			`add INP, INP, t1`
			`add OUTP, OUTP, t1`
			`sub LEN, LEN, t0`
crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS} Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well as bare (single-block) AES, using the RISC-V vector crypto extensions. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using regular .S files instead of the so-called perlasm, using the assembler instead of bare '.inst', greatly reducing code duplication, supporting AES-192, and making the code use the same AES key structure as the C code. Co-developed-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Jerry Shih <jerry.shih@sifive.com> Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-01-21 17:19:15 -07:00			`bnez LEN, 1b`

crypto: riscv - parallelize AES-CBC decryption Since CBC decryption is parallelizable, make the RISC-V implementation of AES-CBC decryption process multiple blocks at a time, instead of processing the blocks one by one. This should improve performance. Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240208060851.154129-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-02-07 23:08:51 -07:00			`vsetivli zero, 4, e32, m1, ta, ma`
crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS} Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well as bare (single-block) AES, using the RISC-V vector crypto extensions. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using regular .S files instead of the so-called perlasm, using the assembler instead of bare '.inst', greatly reducing code duplication, supporting AES-192, and making the code use the same AES key structure as the C code. Co-developed-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com> Signed-off-by: Jerry Shih <jerry.shih@sifive.com> Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-01-21 17:19:15 -07:00			`vse32.v v16, (IVP) // Store next IV`
			`ret`
			`.endm`

			`// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,`
			`// const u8 in, u8 out, size_t len, u8 iv[16]);`
			`//`
			`// \|len\| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).`
			`SYM_FUNC_START(aes_cbc_encrypt_zvkned)`
			`aes_begin KEYP, 128f, 192f`
			`aes_cbc_encrypt 256`
			`128:`
			`aes_cbc_encrypt 128`
			`192:`
			`aes_cbc_encrypt 192`
			`SYM_FUNC_END(aes_cbc_encrypt_zvkned)`

			`// Same prototype and calling convention as the encryption function`
			`SYM_FUNC_START(aes_cbc_decrypt_zvkned)`
			`aes_begin KEYP, 128f, 192f`
			`aes_cbc_decrypt 256`
			`128:`
			`aes_cbc_decrypt 128`
			`192:`
			`aes_cbc_decrypt 192`
			`SYM_FUNC_END(aes_cbc_decrypt_zvkned)`
crypto: riscv - add vector crypto accelerated AES-CBC-CTS Add an implementation of cts(cbc(aes)) accelerated using the Zvkned RISC-V vector crypto extension. This is mainly useful for fscrypt, where cts(cbc(aes)) is the "default" filenames encryption algorithm. In that use case, typically most messages are short and are block-aligned. The CBC-CTS variant implemented is CS3; this is the variant Linux uses. To perform well on short messages, the new implementation processes the full message in one call to the assembly function if the data is contiguous. Otherwise it falls back to CBC operations followed by CTS at the end. For decryption, to further improve performance on short messages, especially block-aligned messages, the CBC-CTS assembly function parallelizes the AES decryption of all full blocks. This improves on the arm64 implementation of cts(cbc(aes)), which always splits the CBC part(s) from the CTS part, doing the AES decryptions for the last two blocks serially and usually loading the round keys twice. Tested in QEMU with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y. Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20240213055442.35954-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> 2024-02-12 22:54:42 -07:00
			`.macro aes_cbc_cts_encrypt keylen`

			`// CBC-encrypt all blocks except the last. But don't store the`
			`// second-to-last block to the output buffer yet, since it will be`
			`// handled specially in the ciphertext stealing step. Exception: if the`
			`// message is single-block, still encrypt the last (and only) block.`
			`li t0, 16`
			`j 2f`
			`1:`
			`vse32.v v16, (OUTP) // Store ciphertext block`
			`addi OUTP, OUTP, 16`
			`2:`
			`vle32.v v17, (INP) // Load plaintext block`
			`vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block`
			`aes_encrypt v16, \keylen // Encrypt`
			`addi INP, INP, 16`
			`addi LEN, LEN, -16`
			`bgt LEN, t0, 1b // Repeat if more than one block remains`

			`// Special case: if the message is a single block, just do CBC.`
			`beqz LEN, .Lcts_encrypt_done\@`

			`// Encrypt the last two blocks using ciphertext stealing as follows:`
			`// C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])`
			`// C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]`
			`//`
			`// C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th`
			`// plaintext block. Block n, the last block, may be partial; its length`
			`// is 1 <= LEN <= 16. If there are only 2 blocks, C[n-2] means the IV.`
			`//`
			`// v16 already contains Encrypt(P[n-1] ^ C[n-2]).`
			`// INP points to P[n]. OUTP points to where C[n-1] should go.`
			`// To support in-place encryption, load P[n] before storing C[n].`
			`addi t0, OUTP, 16 // Get pointer to where C[n] should go`
			`vsetvli zero, LEN, e8, m1, tu, ma`
			`vle8.v v17, (INP) // Load P[n]`
			`vse8.v v16, (t0) // Store C[n]`
			`vxor.vv v16, v16, v17 // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]`
			`vsetivli zero, 4, e32, m1, ta, ma`
			`aes_encrypt v16, \keylen`
			`.Lcts_encrypt_done\@:`
			`vse32.v v16, (OUTP) // Store C[n-1] (or C[n] in single-block case)`
			`ret`
			`.endm`

			`#define LEN32 t4 // Length of remaining full blocks in 32-bit words`
			`#define LEN_MOD16 t5 // Length of message in bytes mod 16`

			`.macro aes_cbc_cts_decrypt keylen`
			`andi LEN32, LEN, ~15`
			`srli LEN32, LEN32, 2`
			`andi LEN_MOD16, LEN, 15`

			`// Save C[n-2] in v28 so that it's available later during the ciphertext`
			`// stealing step. If there are fewer than three blocks, C[n-2] means`
			`// the IV, otherwise it means the third-to-last ciphertext block.`
			`vmv.v.v v28, v16 // IV`
			`add t0, LEN, -33`
			`bltz t0, .Lcts_decrypt_loop\@`
			`andi t0, t0, ~15`
			`add t0, t0, INP`
			`vle32.v v28, (t0)`

			`// CBC-decrypt all full blocks. For the last full block, or the last 2`
			`// full blocks if the message is block-aligned, this doesn't write the`
			`// correct output blocks (unless the message is only a single block),`
			`// because it XORs the wrong values with the raw AES plaintexts. But we`
			`// fix this after this loop without redoing the AES decryptions. This`
			`// approach allows more of the AES decryptions to be parallelized.`
			`.Lcts_decrypt_loop\@:`
			`vsetvli t0, LEN32, e32, m4, ta, ma`
			`addi t1, t0, -4`
			`vle32.v v20, (INP) // Load next set of ciphertext blocks`
			`vmv.v.v v24, v16 // Get IV or last ciphertext block of prev set`
			`vslideup.vi v24, v20, 4 // Setup prev ciphertext blocks`
			`vslidedown.vx v16, v20, t1 // Save last ciphertext block of this set`
			`aes_decrypt v20, \keylen // Decrypt this set of blocks`
			`vxor.vv v24, v24, v20 // XOR prev ciphertext blocks with decrypted blocks`
			`vse32.v v24, (OUTP) // Store this set of plaintext blocks`
			`sub LEN32, LEN32, t0`
			`slli t0, t0, 2 // Words to bytes`
			`add INP, INP, t0`
			`add OUTP, OUTP, t0`
			`bnez LEN32, .Lcts_decrypt_loop\@`

			`vsetivli zero, 4, e32, m4, ta, ma`
			`vslidedown.vx v20, v20, t1 // Extract raw plaintext of last full block`
			`addi t0, OUTP, -16 // Get pointer to last full plaintext block`
			`bnez LEN_MOD16, .Lcts_decrypt_non_block_aligned\@`

			`// Special case: if the message is a single block, just do CBC.`
			`li t1, 16`
			`beq LEN, t1, .Lcts_decrypt_done\@`

			`// Block-aligned message. Just fix up the last 2 blocks. We need:`
			`//`
			`// P[n-1] = Decrypt(C[n]) ^ C[n-2]`
			`// P[n] = Decrypt(C[n-1]) ^ C[n]`
			`//`
			`// We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.`
			`// Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this`
			`// is everything needed to fix the output without re-decrypting blocks.`
			`addi t1, OUTP, -32 // Get pointer to where P[n-1] should go`
			`vxor.vv v20, v20, v28 // Decrypt(C[n]) ^ C[n-2] == P[n-1]`
			`vle32.v v24, (t1) // Decrypt(C[n-1]) ^ C[n-2]`
			`vse32.v v20, (t1) // Store P[n-1]`
			`vxor.vv v20, v24, v16 // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]`
			`j .Lcts_decrypt_finish\@`

			`.Lcts_decrypt_non_block_aligned\@:`
			`// Decrypt the last two blocks using ciphertext stealing as follows:`
			`//`
			`// P[n-1] = Decrypt(C[n] \|\| Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]`
			`// P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]`
			`//`
			`// We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.`
			`vmv.v.v v16, v20 // v16 = Decrypt(C[n-1])`
			`vsetvli zero, LEN_MOD16, e8, m1, tu, ma`
			`vle8.v v20, (INP) // v20 = C[n] \|\| Decrypt(C[n-1])[LEN_MOD16..16]`
			`vxor.vv v16, v16, v20 // v16 = Decrypt(C[n-1]) ^ C[n]`
			`vse8.v v16, (OUTP) // Store P[n]`
			`vsetivli zero, 4, e32, m1, ta, ma`
			`aes_decrypt v20, \keylen // v20 = Decrypt(C[n] \|\| Decrypt(C[n-1])[LEN_MOD16..16])`
			`.Lcts_decrypt_finish\@:`
			`vxor.vv v20, v20, v28 // XOR with C[n-2]`
			`vse32.v v20, (t0) // Store last full plaintext block`
			`.Lcts_decrypt_done\@:`
			`ret`
			`.endm`

			`.macro aes_cbc_cts_crypt keylen`
			`vle32.v v16, (IVP) // Load IV`
			`beqz a5, .Lcts_decrypt\@`
			`aes_cbc_cts_encrypt \keylen`
			`.Lcts_decrypt\@:`
			`aes_cbc_cts_decrypt \keylen`
			`.endm`

			`// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,`
			`// const u8 in, u8 out, size_t len,`
			`// const u8 iv[16], bool enc);`
			`//`
			`// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.`
			`// This is the variant that unconditionally swaps the last two blocks.`
			`SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)`
			`aes_begin KEYP, 128f, 192f`
			`aes_cbc_cts_crypt 256`
			`128:`
			`aes_cbc_cts_crypt 128`
			`192:`
			`aes_cbc_cts_crypt 192`
			`SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)`