s390/checksum: provide csum_partial_copy_nocheck()
With csum_partial(), which reads all bytes into registers, it is easy to also implement csum_partial_copy_nocheck(), which copies the buffer while calculating its checksum. For a 512 byte buffer this reduces the runtime by 19%. Compared to the old generic variant (memcpy() + cksm instruction) the runtime is reduced by 42%.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
commit dcd3e1de9d (parent cb2a1dd589)
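For context: the "old generic variant" mentioned above is the common-code fallback that is used whenever an architecture does not define _HAVE_ARCH_CSUM_AND_COPY. Roughly (quoted from memory of include/net/checksum.h, shown only for comparison, not part of this patch):

#ifndef _HAVE_ARCH_CSUM_AND_COPY
static __always_inline
__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
	memcpy(dst, src, len);
	return csum_partial(dst, len, 0);
}
#endif

Defining _HAVE_ARCH_CSUM_AND_COPY, as this patch does, makes the s390 implementation below override that memcpy()-plus-checksum fallback.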
@@ -32,6 +32,9 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)
 
 __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+#define _HAVE_ARCH_CSUM_AND_COPY
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
+
 /*
  * Fold a partial checksum without adding pseudo headers.
  */
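With the declaration above in place, a caller can copy and checksum a buffer in one pass. A minimal, hypothetical usage sketch (the helper name and buffers are made up for illustration; csum_fold() folds the 32-bit partial sum into the final 16-bit checksum):

#include <net/checksum.h>

/* Copy len bytes from src to dst and return the folded 16-bit checksum
 * of the copied data. Illustrative helper only, not part of this patch.
 */
static __sum16 copy_and_checksum(void *dst, const void *src, int len)
{
	__wsum sum = csum_partial_copy_nocheck(src, dst, len);

	return csum_fold(sum);
}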
@@ -531,6 +531,16 @@
 	MRXBOPC	0, 0x37, v1
 .endm
 
+/* VECTOR STORE WITH LENGTH */
+.macro	VSTL	v, gr, disp, base
+	VX_NUM	v1, \v
+	GR_NUM	b2, \base
+	GR_NUM	r3, \gr
+	.word	0xE700 | ((v1&15) << 4) | r3
+	.word	(b2 << 12) | (\disp)
+	MRXBOPC	0, 0x3f, v1
+.endm
+
 /* Vector integer instructions */
 
 /* VECTOR AND */
@@ -241,6 +241,64 @@ static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)
 
 #ifdef CONFIG_CC_IS_CLANG
 
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+	instrument_write(vxr, sizeof(__vector128));
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VST	%[v1],0,,1\n"
+		: [vxr] "=R" (*(__vector128 *)vxr)
+		: [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+	instrument_write(vxr, sizeof(__vector128));
+	asm volatile("VST	%[v1],%O[vxr],,%R[vxr]\n"
+		: [vxr] "=Q" (*(__vector128 *)vxr)
+		: [v1] "I" (v1)
+		: "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_write(vxr, size);
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VSTL	%[v1],%[index],0,1\n"
+		: [vxr] "=R" (*(u8 *)vxr)
+		: [index] "d" (index), [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_write(vxr, size);
+	asm volatile("VSTL	%[v1],%[index],%O[vxr],%R[vxr]\n"
+		: [vxr] "=Q" (*(u8 *)vxr)
+		: [index] "d" (index), [v1] "I" (v1)
+		: "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
 #define fpu_vstm(_v1, _v3, _vxrs)					\
 ({									\
 	unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);	\
@@ -5,8 +5,8 @@
 #include <asm/fpu.h>
 
 /*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit).
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit). If copy is true copies to dst.
  *
  * Returns a 32-bit number suitable for feeding into itself
  * or csum_tcpudp_magic.
@@ -14,43 +14,60 @@
  * This function must be called with even lengths, except
  * for the last fragment, which may be odd.
  *
- * It's best to have buff aligned on a 64-bit boundary.
+ * It's best to have src and dst aligned on a 64-bit boundary.
  */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
+static __always_inline __wsum csum_copy(void *dst, const void *src, int len, __wsum sum, bool copy)
 {
 	DECLARE_KERNEL_FPU_ONSTACK8(vxstate);
 
-	if (!cpu_has_vx())
-		return cksm(buff, len, sum);
+	if (!cpu_has_vx()) {
+		if (copy)
+			memcpy(dst, src, len);
+		return cksm(dst, len, sum);
+	}
 	kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
 	fpu_vlvgf(16, (__force u32)sum, 1);
 	fpu_vzero(17);
 	fpu_vzero(18);
 	fpu_vzero(19);
 	while (len >= 64) {
-		fpu_vlm(20, 23, buff);
+		fpu_vlm(20, 23, src);
+		if (copy) {
+			fpu_vstm(20, 23, dst);
+			dst += 64;
+		}
 		fpu_vcksm(16, 20, 16);
 		fpu_vcksm(17, 21, 17);
 		fpu_vcksm(18, 22, 18);
 		fpu_vcksm(19, 23, 19);
-		buff += 64;
+		src += 64;
 		len -= 64;
 	}
 	while (len >= 32) {
-		fpu_vlm(20, 21, buff);
+		fpu_vlm(20, 21, src);
+		if (copy) {
+			fpu_vstm(20, 21, dst);
+			dst += 32;
+		}
 		fpu_vcksm(16, 20, 16);
 		fpu_vcksm(17, 21, 17);
-		buff += 32;
+		src += 32;
 		len -= 32;
 	}
 	while (len >= 16) {
-		fpu_vl(20, buff);
+		fpu_vl(20, src);
+		if (copy) {
+			fpu_vst(20, dst);
+			dst += 16;
+		}
 		fpu_vcksm(16, 20, 16);
-		buff += 16;
+		src += 16;
 		len -= 16;
 	}
 	if (len) {
-		fpu_vll(20, len - 1, buff);
+		fpu_vll(20, len - 1, src);
+		if (copy)
+			fpu_vstl(20, len - 1, dst);
 		fpu_vcksm(16, 20, 16);
 	}
 	fpu_vcksm(18, 19, 18);
@@ -60,4 +77,15 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
 	return sum;
 }
+
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	return csum_copy(NULL, buff, len, sum, false);
+}
 EXPORT_SYMBOL(csum_partial);
+
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
+{
+	return csum_copy(dst, src, len, 0, true);
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
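Design note: both exported entry points share the single __always_inline csum_copy() body; because copy is a compile-time constant at each call site, the compiler can drop the store paths from csum_partial() entirely, so there is no runtime cost for the shared implementation. A stand-alone sketch of that specialization pattern in plain C (hypothetical names, independent of the kernel code above):

#include <stddef.h>
#include <stdint.h>

/* One inline worker, specialized by a constant flag at each call site.
 * All names here are illustrative only.
 */
static inline uint32_t sum_copy(void *dst, const void *src, size_t len,
				uint32_t sum, int copy)
{
	const uint8_t *s = src;
	uint8_t *d = dst;

	for (size_t i = 0; i < len; i++) {
		if (copy)	/* folded away when copy is a constant 0 */
			d[i] = s[i];
		sum += s[i];
	}
	return sum;
}

uint32_t sum_only(const void *src, size_t len, uint32_t sum)
{
	return sum_copy(NULL, src, len, sum, 0);	/* no stores generated */
}

uint32_t sum_and_copy(void *dst, const void *src, size_t len)
{
	return sum_copy(dst, src, len, 0, 1);
}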