From dbaaabd60e1662d2659eaeab0a4fc521667737ed Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 8 Apr 2024 11:38:30 -0700
Subject: [PATCH 1/3] clang: work around asm input constraint problems

Work around clang problems with asm constraints that have multiple
possibilities, particularly "g" and "rm".

Clang seems to turn inputs like that into the most generic form, which
is the memory input - but to make matters worse, clang won't even use a
possible original memory location, but will spill the value to stack,
and use the stack for the asm input.

See

  https://github.com/llvm/llvm-project/issues/20571#issuecomment-980933442

for some explanation of why clang has this strange behavior, but the
end result is that "g" and "rm" really end up generating horrid code.

Link: https://github.com/llvm/llvm-project/issues/20571
Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Linus Torvalds
---
 include/linux/compiler-clang.h | 10 ++++++++++
 include/linux/compiler_types.h |  9 +++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 49feac0162a5..4c1a39dcb624 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -118,3 +118,13 @@
 
 #define __diag_ignore_all(option, comment) \
 	__diag_clang(13, ignore, option)
+
+/*
+ * clang has horrible behavior with "g" or "rm" constraints for asm
+ * inputs, turning them into something worse than "m". Avoid using
+ * constraints with multiple possible uses (but "ir" seems to be ok):
+ *
+ *   https://github.com/llvm/llvm-project/issues/20571
+ */
+#define ASM_INPUT_G "ir"
+#define ASM_INPUT_RM "r"
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 8f8236317d5b..f82bdac63961 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -391,6 +391,15 @@ struct ftrace_likely_data {
 #define asm_goto_output(x...) asm volatile goto(x)
 #endif
 
+/*
+ * Clang has trouble with constraints with multiple
+ * alternative behaviors (mainly "g" and "rm").
+ */
+#ifndef ASM_INPUT_G
+  #define ASM_INPUT_G "g"
+  #define ASM_INPUT_RM "rm"
+#endif
+
 #ifdef CONFIG_CC_HAS_ASM_INLINE
 #define asm_inline asm __inline
 #else
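[ Illustration, not part of the series: how the two macros are meant to
  be used at call sites. The asm_add() helper below is a made-up
  example; with gcc, ASM_INPUT_G expands to the usual "g" constraint,
  while clang gets "ir" so the operand is never bounced through the
  stack:

	static inline unsigned int asm_add(unsigned int a, unsigned int b)
	{
		/* 'b' may be an immediate, a register or, on gcc, memory */
		asm("addl %1,%0"
		    : "+r" (a)
		    : ASM_INPUT_G (b));
		return a;
	}
]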
From 7453b9485114f7ffec4a99bccee469a4d4809894 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 8 Apr 2024 11:38:30 -0700
Subject: [PATCH 2/3] x86: improve array_index_mask_nospec() code generation

Don't force the inputs to be 'unsigned long', when the comparison can
easily be done in 32-bit if that's more appropriate.

Note that while we can look at the inputs to choose an appropriate size
for the compare instruction, the output is fixed at 'unsigned long'.
That's not technically optimal either, since a 32-bit 'sbbl' would
often be sufficient.

But for the outgoing mask we don't know how the mask ends up being
used (ie we have uses that have an incoming 32-bit array index, but
end up using the mask for other things). That said, it only costs the
extra REX prefix to always generate the 64-bit mask.

[ A 'sbbl' also always technically generates a 64-bit mask, but with
  the upper 32 bits clear: that's fine for when the incoming index
  that will be masked is already 32-bit, but not if you use the mask
  to mask a pointer afterwards, like the file table lookup does ]

Cc: Peter Zijlstra
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/barrier.h | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 63bdc6b85219..7b44b3c4cce1 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -33,20 +33,16 @@
  * Returns:
  *     0 - (index < size)
  */
-static __always_inline unsigned long array_index_mask_nospec(unsigned long index,
-		unsigned long size)
-{
-	unsigned long mask;
-
-	asm volatile ("cmp %1,%2; sbb %0,%0;"
-			:"=r" (mask)
-			:"g"(size),"r" (index)
-			:"cc");
-	return mask;
-}
-
-/* Override the default implementation from linux/nospec.h. */
-#define array_index_mask_nospec array_index_mask_nospec
+#define array_index_mask_nospec(idx,sz) ({	\
+	typeof((idx)+(sz)) __idx = (idx);	\
+	typeof(__idx) __sz = (sz);		\
+	unsigned long __mask;			\
+	asm volatile ("cmp %1,%2; sbb %0,%0"	\
+			:"=r" (__mask)		\
+			:ASM_INPUT_G (__sz),	\
+			 "r" (__idx)		\
+			:"cc");			\
+	__mask; })
 
 /* Prevent speculative execution past this barrier. */
 #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)

From b9b60b3199b70fe3ce74ff493b1870ccd7554134 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 9 Apr 2024 11:55:07 -0700
Subject: [PATCH 3/3] x86: improve bitop code generation with clang

This uses the new ASM_INPUT_RM macro to avoid the bad code generation
issue that clang has with more generic asm inputs.

This ends up avoiding generating code like this:

	mov    %r10,(%rsp)
	tzcnt  (%rsp),%rcx

which now becomes just

	tzcnt  %r10,%rcx

and in the process ends up also removing a few unnecessary stack
frames when the only use was that pointless "asm uses memory location
off stack".

Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/bitops.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 990eb686ca67..b96d45944c59 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -250,7 +250,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
-		: "rm" (word));
+		: ASM_INPUT_RM (word));
 	return word;
 }
 
@@ -297,7 +297,7 @@ static __always_inline unsigned long __fls(unsigned long word)
 
 	asm("bsr %1,%0"
 		: "=r" (word)
-		: "rm" (word));
+		: ASM_INPUT_RM (word));
 	return word;
 }
 
@@ -320,7 +320,7 @@ static __always_inline int variable_ffs(int x)
 	 */
 	asm("bsfl %1,%0"
 	    : "=r" (r)
-	    : "rm" (x), "0" (-1));
+	    : ASM_INPUT_RM (x), "0" (-1));
 #elif defined(CONFIG_X86_CMOV)
 	asm("bsfl %1,%0\n\t"
 	    "cmovzl %2,%0"
@@ -377,7 +377,7 @@ static __always_inline int fls(unsigned int x)
 	 */
 	asm("bsrl %1,%0"
 	    : "=r" (r)
-	    : "rm" (x), "0" (-1));
+	    : ASM_INPUT_RM (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
@@ -416,7 +416,7 @@ static __always_inline int fls64(__u64 x)
 	 */
 	asm("bsrq %1,%q0"
 	    : "+r" (bitpos)
-	    : "rm" (x));
+	    : ASM_INPUT_RM (x));
 	return bitpos + 1;
 }
 #else
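[ Illustration, not part of the series: a hand-expanded sketch of what
  the variable__ffs() asm amounts to on each compiler after patch 3
  ("rep; bsf" is the tzcnt encoding shown in the commit message):

	static __always_inline unsigned long variable__ffs(unsigned long word)
	{
	#ifdef __clang__
		/* ASM_INPUT_RM is "r": clang must keep 'word' in a register */
		asm("rep; bsf %1,%0" : "=r" (word) : "r" (word));
	#else
		/* gcc keeps "rm" and may read 'word' straight from memory */
		asm("rep; bsf %1,%0" : "=r" (word) : "rm" (word));
	#endif
		return word;
	}
]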
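[ For context on patch 2: the intended consumer of
  array_index_mask_nospec() is array_index_nospec() from
  <linux/nospec.h>, which clamps a possibly-speculated index to 0.
  Slightly simplified - the real macro also carries BUILD_BUG_ON()
  type checks - it boils down to:

	#define array_index_nospec(index, size)				\
	({								\
		typeof(index) _i = (index);				\
		typeof(size) _s = (size);				\
		unsigned long _mask = array_index_mask_nospec(_i, _s);	\
									\
		(typeof(_i)) (_i & _mask);				\
	})
]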