From 3dae5c43badf285e22f6d88388e8a232a83bdfec Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 28 Aug 2023 11:53:57 -0700 Subject: [PATCH 1/9] x86/asm/bitops: Use __builtin_clz{l|ll} to evaluate constant expressions Micro-optimize the bitops code some more, similar to commits: fdb6649ab7c1 ("x86/asm/bitops: Use __builtin_ctzl() to evaluate constant expressions") 2fcff790dcb4 ("powerpc: Use builtin functions for fls()/__fls()/fls64()") From a recent discussion, I noticed that x86 is lacking an optimization that appears in arch/powerpc/include/asm/bitops.h related to constant folding. If you add a BUILD_BUG_ON(__builtin_constant_p(param)) to these functions, you'll find that there were cases where the use of inline asm pessimized the compiler's ability to perform constant folding resulting in runtime calculation of a value that could have been computed at compile time. Signed-off-by: Nick Desaulniers Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230828-x86_fls-v1-1-e6a31b9f79c3@google.com --- arch/x86/include/asm/bitops.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2edf68475fec..50e5ebf9d0a0 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -293,6 +293,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word) */ static __always_inline unsigned long __fls(unsigned long word) { + if (__builtin_constant_p(word)) + return BITS_PER_LONG - 1 - __builtin_clzl(word); + asm("bsr %1,%0" : "=r" (word) : "rm" (word)); @@ -360,6 +363,9 @@ static __always_inline int fls(unsigned int x) { int r; + if (__builtin_constant_p(x)) + return x ? 32 - __builtin_clz(x) : 0; + #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the @@ -401,6 +407,9 @@ static __always_inline int fls(unsigned int x) static __always_inline int fls64(__u64 x) { int bitpos = -1; + + if (__builtin_constant_p(x)) + return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its From 54cd971c6f4461fb6b178579751788bf4f64dfca Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 6 Sep 2023 20:58:44 +0200 Subject: [PATCH 2/9] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128} Define target-specific {raw,this}_cpu_try_cmpxchg64() and {raw,this}_cpu_try_cmpxchg128() macros. These definitions override the generic fallback definitions and enable target-specific optimized implementations. 
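As a caller-side illustration of the calling convention these macros implement, a minimal sketch could look like the following (the per-cpu variable and the example_inc() wrapper are hypothetical, invented for this example only; they are not part of this patch or of mm/slub.c):

  #include <linux/percpu.h>

  /* Hypothetical per-cpu counter, for illustration only. */
  static DEFINE_PER_CPU(unsigned long, example_counter);

  static void example_inc(void)
  {
          unsigned long old, new;

          old = this_cpu_read(example_counter);
          do {
                  /*
                   * Recompute from the latest "old"; on failure the
                   * try_cmpxchg macro has already stored the current
                   * value back through &old, so no re-read is needed.
                   */
                  new = old + 1;
          } while (!this_cpu_try_cmpxchg(example_counter, &old, new));
  }

Because the old value is updated in place on failure, such loops need neither an explicit compare after the CMPXCHG nor a reload of the variable, which is where the code-size win below comes from.
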
Several places in mm/slub.o improve from e.g.: 53bc: 48 8d 4f 40 lea 0x40(%rdi),%rcx 53c0: 48 89 fa mov %rdi,%rdx 53c3: 49 8b 5c 05 00 mov 0x0(%r13,%rax,1),%rbx 53c8: 4c 89 e8 mov %r13,%rax 53cb: 49 8d 30 lea (%r8),%rsi 53ce: e8 00 00 00 00 call 53d3 <...> 53cf: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4 53d3: 48 31 d7 xor %rdx,%rdi 53d6: 4c 31 e8 xor %r13,%rax 53d9: 48 09 c7 or %rax,%rdi 53dc: 75 ae jne 538c <...> to: 53bc: 48 8d 4a 40 lea 0x40(%rdx),%rcx 53c0: 49 8b 1c 07 mov (%r15,%rax,1),%rbx 53c4: 4c 89 f8 mov %r15,%rax 53c7: 48 8d 37 lea (%rdi),%rsi 53ca: e8 00 00 00 00 call 53cf <...> 53cb: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4 53cf: 75 bb jne 538c <...> reducing the size of mm/slub.o by 80 bytes: text data bss dec hex filename 39758 5337 4208 49303 c097 slub-new.o 39838 5337 4208 49383 c0e7 slub-old.o Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230906185941.53527-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 67 +++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 34734d730463..4c3641927f39 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -237,12 +237,47 @@ do { \ #define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) #define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) + +#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u64 *_oval = (u64 *)(_ovalp); \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high) \ + : "memory", "esi"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + likely(success); \ +}) + +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval) #endif #ifdef CONFIG_X86_64 #define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); #define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval); +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval); + #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \ ({ \ union { \ @@ -269,6 +304,38 @@ do { \ #define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) #define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) + +#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u128 *_oval = (u128 *)(_ovalp); \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" 
(old__.high) \ + : "b" (new__.low), \ + "c" (new__.high) \ + : "memory", "rsi"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + likely(success); \ +}) + +#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval) #endif /* From 5f863897d964e834a0da35b1e483b5bb8faca522 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 30 Aug 2023 17:13:56 +0200 Subject: [PATCH 3/9] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() Define target-specific raw_cpu_try_cmpxchg_N() and this_cpu_try_cmpxchg_N() macros. These definitions override the generic fallback definitions and enable target-specific optimized implementations. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230830151623.3900-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 4c3641927f39..a87db6140fe2 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -210,6 +210,25 @@ do { \ (typeof(_var))(unsigned long) pco_old__; \ }) +#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \ + __pcpu_type_##size pco_old__ = *pco_oval__; \ + __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ + __percpu_arg([var])) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [oval] "+a" (pco_old__), \ + [var] "+m" (_var) \ + : [nval] __pcpu_reg_##size(, pco_new__) \ + : "memory"); \ + if (unlikely(!success)) \ + *pco_oval__ = pco_old__; \ + likely(success); \ +}) + #if defined(CONFIG_X86_32) && !defined(CONFIG_UML) #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \ ({ \ @@ -410,6 +429,9 @@ do { \ #define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) #define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) #define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval) #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) @@ -417,6 +439,9 @@ do { \ #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval) /* * Per cpu atomic 64 bit operations are only available under 64 bit. 
@@ -431,6 +456,7 @@ do { \ #define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) #define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) #define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) #define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) @@ -440,6 +466,7 @@ do { \ #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval) #endif static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, From b8e3dfa16ec55f310dd95831614af3d24abf5ed5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 30 Aug 2023 17:13:57 +0200 Subject: [PATCH 4/9] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after CMPXCHG (and related MOV instruction in front of CMPXCHG). Also, raw_cpu_try_cmpxchg() implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230830151623.3900-2-ubizjak@gmail.com --- arch/x86/include/asm/preempt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 2d13f25b1bd8..4527e1430c6d 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc) { int old, new; + old = raw_cpu_read_4(pcpu_hot.preempt_count); do { - old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new)); } /* From 7c097ca50d2ba7f7989f01175f366151256bfa10 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 18 Sep 2023 17:14:10 +0200 Subject: [PATCH 5/9] x86/percpu: Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op The fallback alternative uses %rsi register to manually load pointer to the percpu variable before the call to the emulation function. This is unoptimal, because the load is hidden from the compiler. Move the load of %rsi outside inline asm, so the compiler can reuse the value. The code in slub.o improves from: 55ac: 49 8b 3c 24 mov (%r12),%rdi 55b0: 48 8d 4a 40 lea 0x40(%rdx),%rcx 55b4: 49 8b 1c 07 mov (%r15,%rax,1),%rbx 55b8: 4c 89 f8 mov %r15,%rax 55bb: 48 8d 37 lea (%rdi),%rsi 55be: e8 00 00 00 00 callq 55c3 <...> 55bf: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4 55c3: 75 a3 jne 5568 <...> 55c5: ... 
0000000000000000 <.altinstr_replacement>: 5: 65 48 0f c7 0f cmpxchg16b %gs:(%rdi) to: 55ac: 49 8b 34 24 mov (%r12),%rsi 55b0: 48 8d 4a 40 lea 0x40(%rdx),%rcx 55b4: 49 8b 1c 07 mov (%r15,%rax,1),%rbx 55b8: 4c 89 f8 mov %r15,%rax 55bb: e8 00 00 00 00 callq 55c0 <...> 55bc: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4 55c0: 75 a6 jne 5568 <...> 55c2: ... Where the alternative replacement instruction now uses %rsi: 0000000000000000 <.altinstr_replacement>: 5: 65 48 0f c7 0e cmpxchg16b %gs:(%rsi) The instruction (effectively a reg-reg move) at 55bb: in the original assembly is removed. Also, both the CALL and replacement CMPXCHG16B are 5 bytes long, removing the need for NOPs in the asm code. Suggested-by: Linus Torvalds Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230918151452.62344-1-ubizjak@gmail.com --- arch/x86/include/asm/percpu.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a87db6140fe2..20624b80f890 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -242,14 +242,15 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ : [var] "+m" (_var), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "esi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -271,7 +272,7 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ : CC_OUT(z) (success), \ @@ -279,8 +280,9 @@ do { \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "esi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ likely(success); \ @@ -309,14 +311,15 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ : [var] "+m" (_var), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "rsi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -338,7 +341,7 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ : CC_OUT(z) (success), \ @@ -346,8 +349,9 @@ do { \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "rsi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ likely(success); \ From ad424743256b0119bd60a9248db4df5d998000a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 22 Jan 2022 13:39:15 +0100 Subject: [PATCH 6/9] x86/bitops: Remove unused __sw_hweight64() assembly implementation on x86-32 Header cleanups in the fast-headers tree highlighted that we have an unused assembly implementation for __sw_hweight64(): 
WARNING: modpost: EXPORT symbol "__sw_hweight64" [vmlinux] version ... __arch_hweight64() on x86-32 is defined in the arch/x86/include/asm/arch_hweight.h header as an inline, using __arch_hweight32(): #ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); } *But* there's also a __sw_hweight64() assembly implementation: arch/x86/lib/hweight.S SYM_FUNC_START(__sw_hweight64) #ifdef CONFIG_X86_64 ... #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ pushl %ecx call __sw_hweight32 movl %eax, %ecx # stash away result movl %edx, %eax # second part of input call __sw_hweight32 addl %ecx, %eax # result popl %ecx ret #endif But this __sw_hweight64 assembly implementation is unused - and it's essentially doing the same thing that the inline wrapper does. Remove the assembly version and add a comment about it. Reported-by: Nathan Chancellor Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- arch/x86/lib/hweight.S | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 12c16c6aa44a..0a152e51d3f5 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32) SYM_FUNC_END(__sw_hweight32) EXPORT_SYMBOL(__sw_hweight32) -SYM_FUNC_START(__sw_hweight64) +/* + * No 32-bit variant, because it's implemented as an inline wrapper + * on top of __arch_hweight32(): + */ #ifdef CONFIG_X86_64 +SYM_FUNC_START(__sw_hweight64) pushq %rdi pushq %rdx @@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64) popq %rdx popq %rdi RET -#else /* CONFIG_X86_32 */ - /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ - pushl %ecx - - call __sw_hweight32 - movl %eax, %ecx # stash away result - movl %edx, %eax # second part of input - call __sw_hweight32 - addl %ecx, %eax # result - - popl %ecx - RET -#endif SYM_FUNC_END(__sw_hweight64) EXPORT_SYMBOL(__sw_hweight64) +#endif From da4aff622a7ae424a0292d7288744692fca34319 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Mon, 25 Sep 2023 23:13:19 -0700 Subject: [PATCH 7/9] x86/entry: Remove unused argument %rsi passed to exc_nmi() exc_nmi() only takes one argument of type struct pt_regs *, but asm_exc_nmi() calls it with 2 arguments. The second one passed in %rsi seems to be a leftover, so simply remove it. Signed-off-by: Xin Li (Intel) Signed-off-by: Ingo Molnar Acked-by: H. Peter Anvin (Intel) Link: https://lore.kernel.org/r/20230926061319.1929127-1-xin@zytor.com --- arch/x86/entry/entry_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 43606de22511..fb8dd5648e3a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1237,7 +1237,6 @@ SYM_CODE_START(asm_exc_nmi) */ movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* @@ -1451,7 +1450,6 @@ end_repeat_nmi: UNWIND_HINT_REGS movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ From 1882366217757d3549e48a833bf9a5799b172251 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Mon, 25 Sep 2023 23:13:19 -0700 Subject: [PATCH 8/9] x86/entry: Fix typos in comments Fix 2 typos in the comments. Signed-off-by: Xin Li (Intel) Signed-off-by: Ingo Molnar Acked-by: H. 
Peter Anvin (Intel) Link: https://lore.kernel.org/r/20230926061319.1929127-1-xin@zytor.com --- arch/x86/entry/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fb8dd5648e3a..b940e928c808 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1163,8 +1163,8 @@ SYM_CODE_START(asm_exc_nmi) * anyway. * * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. + * Check a special location on the stack that contains a + * variable that is set when NMIs are executing. * The interrupted task's stack is also checked to see if it * is an NMI stack. * If the variable is not set and the stack is not the NMI @@ -1294,8 +1294,8 @@ SYM_CODE_START(asm_exc_nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call exc_nmi() anyway, so we can just - * resume the outer NMI. + * about to call exc_nmi() anyway, so we can just resume + * the outer NMI. */ movq $repeat_nmi, %rdx From 8ae292c66dcb160b3e1e16b66c3076d5a2c63873 Mon Sep 17 00:00:00 2001 From: Zhu Wang Date: Mon, 31 Jul 2023 19:36:22 +0800 Subject: [PATCH 9/9] x86/lib: Address kernel-doc warnings Fix all kernel-doc warnings in csum-wrappers_64.c: arch/x86/lib/csum-wrappers_64.c:25: warning: Excess function parameter 'isum' description in 'csum_and_copy_from_user' arch/x86/lib/csum-wrappers_64.c:25: warning: Excess function parameter 'errp' description in 'csum_and_copy_from_user' arch/x86/lib/csum-wrappers_64.c:49: warning: Excess function parameter 'isum' description in 'csum_and_copy_to_user' arch/x86/lib/csum-wrappers_64.c:49: warning: Excess function parameter 'errp' description in 'csum_and_copy_to_user' arch/x86/lib/csum-wrappers_64.c:71: warning: Excess function parameter 'sum' description in 'csum_partial_copy_nocheck' Signed-off-by: Zhu Wang Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- arch/x86/lib/csum-wrappers_64.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 145f9a0bde29..f4df4d241526 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -14,8 +14,6 @@ * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad source address. * * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. @@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad destination address. * * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. @@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len) * @src: source address * @dst: destination address * @len: number of bytes to be copied. - * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. */
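
A note on the kernel-doc fix above: scripts/kernel-doc emits an "Excess function parameter" warning for every "@name:" line whose parameter no longer appears in the prototype that follows the comment, which is exactly what the stale isum/errp/sum lines triggered. A minimal, self-contained illustration of the rule (the function below is hypothetical, not taken from csum-wrappers_64.c):

  /**
   * add_len - add a byte count to a running total
   * @total: running total so far
   * @len: number of bytes to add
   *
   * Returns the updated total.  Had this comment also documented a
   * removed 'flags' parameter, scripts/kernel-doc would warn:
   * "Excess function parameter 'flags' description in 'add_len'".
   */
  static inline int add_len(int total, int len)
  {
          return total + len;
  }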