1
mirror of https://github.com/jedisct1/libsodium.git synced 2024-12-23 20:15:19 -07:00

AES-256-GCM (AES-NI): prefetch the next blocks

...while computing the GHASH of the previous blocks.

For AMD CPUs with disabled hardware prefetchers, the gain may
be significant.
This commit is contained in:
Frank Denis 2023-01-14 00:01:42 +01:00
parent 67f1b568c5
commit 48af322b7a

View File

@@ -444,6 +444,11 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS); counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
encrypt_xor_wide(st, dst + i, src + i, rev_counters); encrypt_xor_wide(st, dst + i, src + i, rev_counters);
PREFETCH_READ(src + i + PARALLEL_BLOCKS * 16);
#if PARALLEL_BLOCKS >= 64 / 16
PREFETCH_READ(src + i + PARALLEL_BLOCKS * 16 + 64);
#endif
pi = i - PARALLEL_BLOCKS * 16; pi = i - PARALLEL_BLOCKS * 16;
u = gh_update0(sth, dst + pi, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]); u = gh_update0(sth, dst + pi, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
for (j = 1; j < PARALLEL_BLOCKS; j += 1) { for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
@@ -454,6 +459,10 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16, encrypt_xor_wide(st, dst + i + PARALLEL_BLOCKS * 16, src + i + PARALLEL_BLOCKS * 16,
rev_counters); rev_counters);
PREFETCH_READ(src + i + 2 * PARALLEL_BLOCKS * 16);
#if PARALLEL_BLOCKS >= 64 / 16
PREFETCH_READ(src + i + 2 * PARALLEL_BLOCKS * 16 + 64);
#endif
pi = i; pi = i;
for (j = 0; j < PARALLEL_BLOCKS; j += 1) { for (j = 0; j < PARALLEL_BLOCKS; j += 1) {
gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]); gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);