ce41fefd24
This patch does not add new features, but only refactors and simplifies the implementation of the Crypto Extension acceleration of the SM4 algorithm: Extract the macros optimized by the SM4 Crypto Extension for reuse in the subsequent optimization of the CCM/GCM modes. Encryption in CBC and CFB modes processes four blocks at a time instead of one, allowing the ld1 instruction to load 64 bytes of data at a time, which reduces unnecessary memory accesses. CBC/CFB/CTR make full use of free registers to reduce redundant memory accesses, and some instructions are rearranged to improve out-of-order execution capability. Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
210 lines
5.7 KiB
C
210 lines
5.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 helper macros for Crypto Extensions
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#define SM4_PREPARE(ptr) \
|
|
ld1 {v24.16b-v27.16b}, [ptr], #64; \
|
|
ld1 {v28.16b-v31.16b}, [ptr];
|
|
|
|
#define SM4_CRYPT_BLK_BE(b0) \
|
|
sm4e b0.4s, v24.4s; \
|
|
sm4e b0.4s, v25.4s; \
|
|
sm4e b0.4s, v26.4s; \
|
|
sm4e b0.4s, v27.4s; \
|
|
sm4e b0.4s, v28.4s; \
|
|
sm4e b0.4s, v29.4s; \
|
|
sm4e b0.4s, v30.4s; \
|
|
sm4e b0.4s, v31.4s; \
|
|
rev64 b0.4s, b0.4s; \
|
|
ext b0.16b, b0.16b, b0.16b, #8; \
|
|
rev32 b0.16b, b0.16b;
|
|
|
|
#define SM4_CRYPT_BLK(b0) \
|
|
rev32 b0.16b, b0.16b; \
|
|
SM4_CRYPT_BLK_BE(b0);
|
|
|
|
#define SM4_CRYPT_BLK2_BE(b0, b1) \
|
|
sm4e b0.4s, v24.4s; \
|
|
sm4e b1.4s, v24.4s; \
|
|
sm4e b0.4s, v25.4s; \
|
|
sm4e b1.4s, v25.4s; \
|
|
sm4e b0.4s, v26.4s; \
|
|
sm4e b1.4s, v26.4s; \
|
|
sm4e b0.4s, v27.4s; \
|
|
sm4e b1.4s, v27.4s; \
|
|
sm4e b0.4s, v28.4s; \
|
|
sm4e b1.4s, v28.4s; \
|
|
sm4e b0.4s, v29.4s; \
|
|
sm4e b1.4s, v29.4s; \
|
|
sm4e b0.4s, v30.4s; \
|
|
sm4e b1.4s, v30.4s; \
|
|
sm4e b0.4s, v31.4s; \
|
|
sm4e b1.4s, v31.4s; \
|
|
rev64 b0.4s, b0.4s; \
|
|
rev64 b1.4s, b1.4s; \
|
|
ext b0.16b, b0.16b, b0.16b, #8; \
|
|
ext b1.16b, b1.16b, b1.16b, #8; \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
|
|
#define SM4_CRYPT_BLK2(b0, b1) \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
SM4_CRYPT_BLK2_BE(b0, b1);
|
|
|
|
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
|
|
sm4e b0.4s, v24.4s; \
|
|
sm4e b1.4s, v24.4s; \
|
|
sm4e b2.4s, v24.4s; \
|
|
sm4e b3.4s, v24.4s; \
|
|
sm4e b0.4s, v25.4s; \
|
|
sm4e b1.4s, v25.4s; \
|
|
sm4e b2.4s, v25.4s; \
|
|
sm4e b3.4s, v25.4s; \
|
|
sm4e b0.4s, v26.4s; \
|
|
sm4e b1.4s, v26.4s; \
|
|
sm4e b2.4s, v26.4s; \
|
|
sm4e b3.4s, v26.4s; \
|
|
sm4e b0.4s, v27.4s; \
|
|
sm4e b1.4s, v27.4s; \
|
|
sm4e b2.4s, v27.4s; \
|
|
sm4e b3.4s, v27.4s; \
|
|
sm4e b0.4s, v28.4s; \
|
|
sm4e b1.4s, v28.4s; \
|
|
sm4e b2.4s, v28.4s; \
|
|
sm4e b3.4s, v28.4s; \
|
|
sm4e b0.4s, v29.4s; \
|
|
sm4e b1.4s, v29.4s; \
|
|
sm4e b2.4s, v29.4s; \
|
|
sm4e b3.4s, v29.4s; \
|
|
sm4e b0.4s, v30.4s; \
|
|
sm4e b1.4s, v30.4s; \
|
|
sm4e b2.4s, v30.4s; \
|
|
sm4e b3.4s, v30.4s; \
|
|
sm4e b0.4s, v31.4s; \
|
|
sm4e b1.4s, v31.4s; \
|
|
sm4e b2.4s, v31.4s; \
|
|
sm4e b3.4s, v31.4s; \
|
|
rev64 b0.4s, b0.4s; \
|
|
rev64 b1.4s, b1.4s; \
|
|
rev64 b2.4s, b2.4s; \
|
|
rev64 b3.4s, b3.4s; \
|
|
ext b0.16b, b0.16b, b0.16b, #8; \
|
|
ext b1.16b, b1.16b, b1.16b, #8; \
|
|
ext b2.16b, b2.16b, b2.16b, #8; \
|
|
ext b3.16b, b3.16b, b3.16b, #8; \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
rev32 b2.16b, b2.16b; \
|
|
rev32 b3.16b, b3.16b;
|
|
|
|
#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
rev32 b2.16b, b2.16b; \
|
|
rev32 b3.16b, b3.16b; \
|
|
SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
|
|
|
|
#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \
|
|
sm4e b0.4s, v24.4s; \
|
|
sm4e b1.4s, v24.4s; \
|
|
sm4e b2.4s, v24.4s; \
|
|
sm4e b3.4s, v24.4s; \
|
|
sm4e b4.4s, v24.4s; \
|
|
sm4e b5.4s, v24.4s; \
|
|
sm4e b6.4s, v24.4s; \
|
|
sm4e b7.4s, v24.4s; \
|
|
sm4e b0.4s, v25.4s; \
|
|
sm4e b1.4s, v25.4s; \
|
|
sm4e b2.4s, v25.4s; \
|
|
sm4e b3.4s, v25.4s; \
|
|
sm4e b4.4s, v25.4s; \
|
|
sm4e b5.4s, v25.4s; \
|
|
sm4e b6.4s, v25.4s; \
|
|
sm4e b7.4s, v25.4s; \
|
|
sm4e b0.4s, v26.4s; \
|
|
sm4e b1.4s, v26.4s; \
|
|
sm4e b2.4s, v26.4s; \
|
|
sm4e b3.4s, v26.4s; \
|
|
sm4e b4.4s, v26.4s; \
|
|
sm4e b5.4s, v26.4s; \
|
|
sm4e b6.4s, v26.4s; \
|
|
sm4e b7.4s, v26.4s; \
|
|
sm4e b0.4s, v27.4s; \
|
|
sm4e b1.4s, v27.4s; \
|
|
sm4e b2.4s, v27.4s; \
|
|
sm4e b3.4s, v27.4s; \
|
|
sm4e b4.4s, v27.4s; \
|
|
sm4e b5.4s, v27.4s; \
|
|
sm4e b6.4s, v27.4s; \
|
|
sm4e b7.4s, v27.4s; \
|
|
sm4e b0.4s, v28.4s; \
|
|
sm4e b1.4s, v28.4s; \
|
|
sm4e b2.4s, v28.4s; \
|
|
sm4e b3.4s, v28.4s; \
|
|
sm4e b4.4s, v28.4s; \
|
|
sm4e b5.4s, v28.4s; \
|
|
sm4e b6.4s, v28.4s; \
|
|
sm4e b7.4s, v28.4s; \
|
|
sm4e b0.4s, v29.4s; \
|
|
sm4e b1.4s, v29.4s; \
|
|
sm4e b2.4s, v29.4s; \
|
|
sm4e b3.4s, v29.4s; \
|
|
sm4e b4.4s, v29.4s; \
|
|
sm4e b5.4s, v29.4s; \
|
|
sm4e b6.4s, v29.4s; \
|
|
sm4e b7.4s, v29.4s; \
|
|
sm4e b0.4s, v30.4s; \
|
|
sm4e b1.4s, v30.4s; \
|
|
sm4e b2.4s, v30.4s; \
|
|
sm4e b3.4s, v30.4s; \
|
|
sm4e b4.4s, v30.4s; \
|
|
sm4e b5.4s, v30.4s; \
|
|
sm4e b6.4s, v30.4s; \
|
|
sm4e b7.4s, v30.4s; \
|
|
sm4e b0.4s, v31.4s; \
|
|
sm4e b1.4s, v31.4s; \
|
|
sm4e b2.4s, v31.4s; \
|
|
sm4e b3.4s, v31.4s; \
|
|
sm4e b4.4s, v31.4s; \
|
|
sm4e b5.4s, v31.4s; \
|
|
sm4e b6.4s, v31.4s; \
|
|
sm4e b7.4s, v31.4s; \
|
|
rev64 b0.4s, b0.4s; \
|
|
rev64 b1.4s, b1.4s; \
|
|
rev64 b2.4s, b2.4s; \
|
|
rev64 b3.4s, b3.4s; \
|
|
rev64 b4.4s, b4.4s; \
|
|
rev64 b5.4s, b5.4s; \
|
|
rev64 b6.4s, b6.4s; \
|
|
rev64 b7.4s, b7.4s; \
|
|
ext b0.16b, b0.16b, b0.16b, #8; \
|
|
ext b1.16b, b1.16b, b1.16b, #8; \
|
|
ext b2.16b, b2.16b, b2.16b, #8; \
|
|
ext b3.16b, b3.16b, b3.16b, #8; \
|
|
ext b4.16b, b4.16b, b4.16b, #8; \
|
|
ext b5.16b, b5.16b, b5.16b, #8; \
|
|
ext b6.16b, b6.16b, b6.16b, #8; \
|
|
ext b7.16b, b7.16b, b7.16b, #8; \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
rev32 b2.16b, b2.16b; \
|
|
rev32 b3.16b, b3.16b; \
|
|
rev32 b4.16b, b4.16b; \
|
|
rev32 b5.16b, b5.16b; \
|
|
rev32 b6.16b, b6.16b; \
|
|
rev32 b7.16b, b7.16b;
|
|
|
|
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
|
|
rev32 b0.16b, b0.16b; \
|
|
rev32 b1.16b, b1.16b; \
|
|
rev32 b2.16b, b2.16b; \
|
|
rev32 b3.16b, b3.16b; \
|
|
rev32 b4.16b, b4.16b; \
|
|
rev32 b5.16b, b5.16b; \
|
|
rev32 b6.16b, b6.16b; \
|
|
rev32 b7.16b, b7.16b; \
|
|
SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);
|