2019-11-08 05:22:31 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <crypto/internal/blake2s.h>
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/jump_label.h>
|
|
|
|
#include <linux/kernel.h>
|
2020-08-19 04:58:20 -07:00
|
|
|
#include <linux/sizes.h>
|
2019-11-08 05:22:31 -07:00
|
|
|
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
#include <asm/fpu/api.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/simd.h>
|
|
|
|
|
|
|
|
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
|
|
|
|
const u8 *block, const size_t nblocks,
|
|
|
|
const u32 inc);
|
|
|
|
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
|
|
|
|
const u8 *block, const size_t nblocks,
|
|
|
|
const u32 inc);
|
|
|
|
|
|
|
|
/*
 * Implementation-selection switches. Both start out false and are only ever
 * enabled from blake2s_mod_init(), after the CPU feature checks pass.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
|
|
|
|
2021-12-22 06:56:58 -07:00
|
|
|
void blake2s_compress(struct blake2s_state *state, const u8 *block,
|
|
|
|
size_t nblocks, const u32 inc)
|
2019-11-08 05:22:31 -07:00
|
|
|
{
|
|
|
|
/* SIMD disables preemption, so relax after processing each page. */
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 16:18:53 -07:00
|
|
|
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
|
2019-11-08 05:22:31 -07:00
|
|
|
|
2022-05-28 12:44:07 -07:00
|
|
|
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
|
2019-11-08 05:22:31 -07:00
|
|
|
blake2s_compress_generic(state, block, nblocks, inc);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 16:18:53 -07:00
|
|
|
do {
|
2019-11-08 05:22:31 -07:00
|
|
|
const size_t blocks = min_t(size_t, nblocks,
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 16:18:53 -07:00
|
|
|
SZ_4K / BLAKE2S_BLOCK_SIZE);
|
2019-11-08 05:22:31 -07:00
|
|
|
|
|
|
|
kernel_fpu_begin();
|
|
|
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
|
|
|
static_branch_likely(&blake2s_use_avx512))
|
|
|
|
blake2s_compress_avx512(state, block, blocks, inc);
|
|
|
|
else
|
|
|
|
blake2s_compress_ssse3(state, block, blocks, inc);
|
|
|
|
kernel_fpu_end();
|
|
|
|
|
|
|
|
nblocks -= blocks;
|
|
|
|
block += blocks * BLAKE2S_BLOCK_SIZE;
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 16:18:53 -07:00
|
|
|
} while (nblocks);
|
2019-11-08 05:22:31 -07:00
|
|
|
}
|
2021-12-22 06:56:58 -07:00
|
|
|
EXPORT_SYMBOL(blake2s_compress);
|
2019-11-08 05:22:31 -07:00
|
|
|
|
|
|
|
/*
 * blake2s_mod_init - choose the BLAKE2s implementations at boot.
 *
 * Enables the SSSE3 static key when the boot CPU reports SSSE3. Enables the
 * AVX-512 static key only when CONFIG_AS_AVX512 is set, the boot CPU reports
 * AVX, AVX2, AVX512F and AVX512VL, and cpu_has_xfeatures() accepts the SSE,
 * YMM and AVX-512 xfeature masks.
 *
 * Always returns 0.
 */
static int __init blake2s_mod_init(void)
{
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&blake2s_use_ssse3);

	/* Without CONFIG_AS_AVX512 the AVX-512 key is never touched. */
	if (!IS_ENABLED(CONFIG_AS_AVX512))
		return 0;

	/* Every one of these CPU feature bits is required. */
	if (!boot_cpu_has(X86_FEATURE_AVX) ||
	    !boot_cpu_has(X86_FEATURE_AVX2) ||
	    !boot_cpu_has(X86_FEATURE_AVX512F) ||
	    !boot_cpu_has(X86_FEATURE_AVX512VL))
		return 0;

	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
			      XFEATURE_MASK_AVX512, NULL))
		static_branch_enable(&blake2s_use_avx512);

	return 0;
}

subsys_initcall(blake2s_mod_init);
|