From 0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 12 Nov 2024 16:34:14 +0800 Subject: [PATCH 01/10] mm, swap: fix allocation and scanning race with swapoff There are two flags used to synchronize allocation and scanning with swapoff: SWP_WRITEOK and SWP_SCANNING. SWP_WRITEOK: Swapoff will first unset this flag, at this point any further swap allocation or scanning on this device should just abort so no more new entries will be referencing this device. Swapoff will then unuse all existing swap entries. SWP_SCANNING: This flag is set when device is being scanned. Swapoff will wait for all scanner to stop before the final release of the swap device structures to avoid UAF. Note this flag is the highest used bit of si->flags so it could be added up arithmetically, if there are multiple scanner. commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") ignored SWP_SCANNING and SWP_WRITEOK flags while separating cluster allocation path from the old allocation path. Add the flags back to fix swapoff race. The race is hard to trigger as si->lock prevents most parallel operations, but si->lock could be dropped for reclaim or discard. This issue is found during code review. This commit fixes this problem. For SWP_SCANNING, Just like before, set the flag before scan and remove it afterwards. For SWP_WRITEOK, there are several places where si->lock could be dropped, it will be error-prone and make the code hard to follow if we try to cover these places one by one. So just do one check before the real allocation, which is also very similar like before. With new cluster allocator it may waste a bit of time iterating the clusters but won't take long, and swapoff is not performance sensitive. Link: https://lkml.kernel.org/r/20241112083414.78174-1-ryncsn@gmail.com Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Kairui Song Cc: Barry Song Cc: Chris Li Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c85bd46ab7f..b0a9071cfe1d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -664,12 +664,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } -static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) { unsigned int nr_pages = 1 << order; + if (!(si->flags & SWP_WRITEOK)) + return false; + if (cluster_is_free(ci)) { if (nr_pages < SWAPFILE_CLUSTER) { list_move_tail(&ci->list, &si->nonfull_clusters[order]); @@ -690,6 +693,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster list_move_tail(&ci->list, &si->full_clusters); ci->flags = CLUSTER_FLAG_FULL; } + + return true; } static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, @@ -713,7 +718,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne while (offset <= end) { if (cluster_scan_range(si, ci, offset, nr_pages)) { - cluster_alloc_range(si, ci, offset, usage, order); + if (!cluster_alloc_range(si, ci, offset, usage, order)) { + offset = SWAP_NEXT_INVALID; + goto done; + } *foundp = offset; if (ci->count == SWAPFILE_CLUSTER) { offset = SWAP_NEXT_INVALID; @@ -805,7 +813,11 @@ new_cluster: if (!list_empty(&si->free_clusters)) { ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - VM_BUG_ON(!found); + /* + * Either we didn't touch the cluster due to swapoff, + * or the allocation must success. + */ + VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); goto done; } @@ -1041,6 +1053,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, VM_BUG_ON(!si->cluster_info); + si->flags += SWP_SCANNING; + while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); @@ -1049,6 +1063,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, slots[n_ret++] = swp_entry(si->type, offset); } + si->flags -= SWP_SCANNING; + return n_ret; } From a39326767c55c00c7c313333404cbcb502cce8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C3=85=60tys?= Date: Tue, 12 Nov 2024 19:16:55 +0200 Subject: [PATCH 02/10] tools/mm: fix compile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a missing semicolon. Link: https://lkml.kernel.org/r/20241112171655.1662670-1-motiejus@jakstys.lt Fixes: ece5897e5a10 ("tools/mm: -Werror fixes in page-types/slabinfo") Signed-off-by: Motiejus JakÅ`tys Closes: https://github.com/NixOS/nixpkgs/issues/355369 Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Acked-by: Oleksandr Natalenko Cc: Wladislav Wiebe Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 6eb17cc1a06c..bcac7ebfb51f 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -420,7 +420,7 @@ static void show_page(unsigned long voffset, unsigned long offset, if (opt_file) printf("%lx\t", voffset); if (opt_list_cgroup) - printf("@%" PRIu64 "\t", cgroup) + printf("@%" PRIu64 "\t", cgroup); if (opt_list_mapcnt) printf("%" PRIu64 "\t", mapcnt); From a4a282daf1a190f03790bf163458ea3c8d28d217 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 11 Nov 2024 20:34:30 +0100 Subject: [PATCH 03/10] mm/mremap: fix address wraparound in move_page_tables() On 32-bit platforms, it is possible for the expression `len + old_addr < old_end` to be false-positive if `len + old_addr` wraps around. `old_addr` is the cursor in the old range up to which page table entries have been moved; so if the operation succeeded, `old_addr` is the *end* of the old region, and adding `len` to it can wrap. The overflow causes mremap() to mistakenly believe that PTEs have been copied; the consequence is that mremap() bails out, but doesn't move the PTEs back before the new VMA is unmapped, causing anonymous pages in the region to be lost. So basically if userspace tries to mremap() a private-anon region and hits this bug, mremap() will return an error and the private-anon region's contents appear to have been zeroed. The idea of this check is that `old_end - len` is the original start address, and writing the check that way also makes it easier to read; so fix the check by rearranging the comparison accordingly. (An alternate fix would be to refactor this function by introducing an "orig_old_start" variable or such.) Tested in a VM with a 32-bit X86 kernel; without the patch: ``` user@horn:~/big_mremap$ cat test.c #define _GNU_SOURCE #include #include #include #include #define ADDR1 ((void*)0x60000000) #define ADDR2 ((void*)0x10000000) #define SIZE 0x50000000uL int main(void) { unsigned char *p1 = mmap(ADDR1, SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED_NOREPLACE, -1, 0); if (p1 == MAP_FAILED) err(1, "mmap 1"); unsigned char *p2 = mmap(ADDR2, SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED_NOREPLACE, -1, 0); if (p2 == MAP_FAILED) err(1, "mmap 2"); *p1 = 0x41; printf("first char is 0x%02hhx\n", *p1); unsigned char *p3 = mremap(p1, SIZE, SIZE, MREMAP_MAYMOVE|MREMAP_FIXED, p2); if (p3 == MAP_FAILED) { printf("mremap() failed; first char is 0x%02hhx\n", *p1); } else { printf("mremap() succeeded; first char is 0x%02hhx\n", *p3); } } user@horn:~/big_mremap$ gcc -static -o test test.c user@horn:~/big_mremap$ setarch -R ./test first char is 0x41 mremap() failed; first char is 0x00 ``` With the patch: ``` user@horn:~/big_mremap$ setarch -R ./test first char is 0x41 mremap() succeeded; first char is 0x41 ``` Link: https://lkml.kernel.org/r/20241111-fix-mremap-32bit-wrap-v1-1-61d6be73b722@google.com Fixes: af8ca1c14906 ("mm/mremap: optimize the start addresses in move_page_tables()") Signed-off-by: Jann Horn Acked-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Acked-by: Qi Zheng Reviewed-by: Liam R. Howlett Cc: Joel Fernandes (Google) Cc: Signed-off-by: Andrew Morton --- mm/mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index dda09e957a5d..dee98ff2bbd6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -648,7 +648,7 @@ again: * Prevent negative return values when {old,new}_addr was realigned * but we broke out of the above loop for the first PMD itself. */ - if (len + old_addr < old_end) + if (old_addr < old_end - len) return 0; return len + old_addr - old_end; /* how much done */ From 31daa34315d45d3fe77f2158d889d523d78852ea Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Tue, 17 Sep 2024 12:37:20 -0400 Subject: [PATCH 04/10] crash, powerpc: default to CRASH_DUMP=n on PPC_BOOK3S_32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes boot failures on 6.9 on PPC_BOOK3S_32 machines using Open Firmware. On these machines, the kernel refuses to boot from non-zero PHYSICAL_START, which occurs when CRASH_DUMP is on. Since most PPC_BOOK3S_32 machines boot via Open Firmware, it should default to off for them. Users booting via some other mechanism can still turn it on explicitly. Does not change the default on any other architectures for the time being. Link: https://lkml.kernel.org/r/20240917163720.1644584-1-dave@vasilevsky.ca Fixes: 75bc255a7444 ("crash: clean up kdump related config items") Signed-off-by: Dave Vasilevsky Reported-by: Reimar Döffinger Closes: https://lists.debian.org/debian-powerpc/2024/07/msg00001.html Acked-by: Michael Ellerman [powerpc] Acked-by: Baoquan He Cc: "Eric W. Biederman" Cc: John Paul Adrian Glaubitz Cc: Reimar Döffinger Cc: Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 3 +++ arch/arm64/Kconfig | 3 +++ arch/loongarch/Kconfig | 3 +++ arch/mips/Kconfig | 3 +++ arch/powerpc/Kconfig | 4 ++++ arch/riscv/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sh/Kconfig | 3 +++ arch/x86/Kconfig | 3 +++ kernel/Kconfig.kexec | 2 +- 10 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 749179a1d162..202397be76d8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1598,6 +1598,9 @@ config ATAGS_PROC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config AUTO_ZRELADDR bool "Auto calculation of the decompressed kernel image address" if !ARCH_MULTIPLATFORM default !(ARCH_FOOTBRIDGE || ARCH_RPC || ARCH_SA1100) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fd9df6dcc593..22ea2705becc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1576,6 +1576,9 @@ config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION def_bool CRASH_RESERVE diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index bb35c34f86d2..d9fce0fd475a 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -604,6 +604,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SELECTS_CRASH_DUMP def_bool y depends on CRASH_DUMP diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 397edf05dd72..467b10f4361a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2876,6 +2876,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config PHYSICAL_START hex "Physical address where the kernel is loaded" default "0xffffffff84000000" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8094a01974cc..1a2ff0276365 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -684,6 +684,10 @@ config RELOCATABLE_TEST config ARCH_SUPPORTS_CRASH_DUMP def_bool PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP) +config ARCH_DEFAULT_CRASH_DUMP + bool + default y if !PPC_BOOK3S_32 + config ARCH_SELECTS_CRASH_DUMP def_bool y depends on CRASH_DUMP diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index f4c570538d55..fa8f2da87a0a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -898,6 +898,9 @@ config ARCH_SUPPORTS_KEXEC_PURGATORY config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION def_bool CRASH_RESERVE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d339fe4fdedf..cc1f9cffe2a5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -276,6 +276,9 @@ config ARCH_SUPPORTS_CRASH_DUMP This option also enables s390 zfcpdump. See also +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + menu "Processor type and features" config HAVE_MARCH_Z10_FEATURES diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e9103998cca9..04ff5fb9242e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -550,6 +550,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SUPPORTS_KEXEC_JUMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 16354dfa6d96..7b9a7e8f39ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2084,6 +2084,9 @@ config ARCH_SUPPORTS_KEXEC_JUMP config ARCH_SUPPORTS_CRASH_DUMP def_bool X86_64 || (X86_32 && HIGHMEM) +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 6c34e63c88ff..4d111f871951 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,7 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" - default y + default ARCH_DEFAULT_CRASH_DUMP depends on ARCH_SUPPORTS_CRASH_DUMP depends on KEXEC_CORE select VMCORE_INFO From fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 Mon Sep 17 00:00:00 2001 From: Qun-Wei Lin Date: Wed, 13 Nov 2024 12:25:43 +0800 Subject: [PATCH 05/10] sched/task_stack: fix object_is_on_stack() for KASAN tagged pointers When CONFIG_KASAN_SW_TAGS and CONFIG_KASAN_STACK are enabled, the object_is_on_stack() function may produce incorrect results due to the presence of tags in the obj pointer, while the stack pointer does not have tags. This discrepancy can lead to incorrect stack object detection and subsequently trigger warnings if CONFIG_DEBUG_OBJECTS is also enabled. Example of the warning: ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:557 __debug_object_init+0x330/0x364 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc5 #4 Hardware name: linux,dummy-virt (DT) pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __debug_object_init+0x330/0x364 lr : __debug_object_init+0x330/0x364 sp : ffff800082ea7b40 x29: ffff800082ea7b40 x28: 98ff0000c0164518 x27: 98ff0000c0164534 x26: ffff800082d93ec8 x25: 0000000000000001 x24: 1cff0000c00172a0 x23: 0000000000000000 x22: ffff800082d93ed0 x21: ffff800081a24418 x20: 3eff800082ea7bb0 x19: efff800000000000 x18: 0000000000000000 x17: 00000000000000ff x16: 0000000000000047 x15: 206b63617473206e x14: 0000000000000018 x13: ffff800082ea7780 x12: 0ffff800082ea78e x11: 0ffff800082ea790 x10: 0ffff800082ea79d x9 : 34d77febe173e800 x8 : 34d77febe173e800 x7 : 0000000000000001 x6 : 0000000000000001 x5 : feff800082ea74b8 x4 : ffff800082870a90 x3 : ffff80008018d3c4 x2 : 0000000000000001 x1 : ffff800082858810 x0 : 0000000000000050 Call trace: __debug_object_init+0x330/0x364 debug_object_init_on_stack+0x30/0x3c schedule_hrtimeout_range_clock+0xac/0x26c schedule_hrtimeout+0x1c/0x30 wait_task_inactive+0x1d4/0x25c kthread_bind_mask+0x28/0x98 init_rescuer+0x1e8/0x280 workqueue_init+0x1a0/0x3cc kernel_init_freeable+0x118/0x200 kernel_init+0x28/0x1f0 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. ------------[ cut here ]------------ Link: https://lkml.kernel.org/r/20241113042544.19095-1-qun-wei.lin@mediatek.com Signed-off-by: Qun-Wei Lin Cc: Andrew Yang Cc: AngeloGioacchino Del Regno Cc: Casper Li Cc: Catalin Marinas Cc: Chinwen Chang Cc: Kent Overstreet Cc: Matthias Brugger Cc: Pasha Tatashin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- include/linux/sched/task_stack.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index bf10bdb487dd..6c2fef89a4fd 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -89,6 +90,7 @@ static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); + obj = kasan_reset_tag(obj); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } From 669b0cb81e4e4e78cff77a5b367c7f70c0c6c05e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Nov 2024 11:59:32 +0300 Subject: [PATCH 06/10] fs/proc/task_mmu: prevent integer overflow in pagemap_scan_get_args() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "arg->vec_len" variable is a u64 that comes from the user at the start of the function. The "arg->vec_len * sizeof(struct page_region))" multiplication can lead to integer wrapping. Use size_mul() to avoid that. Also the size_add/mul() functions work on unsigned long so for 32bit systems we need to ensure that "arg->vec_len" fits in an unsigned long. Link: https://lkml.kernel.org/r/39d41335-dd4d-48ed-8a7f-402c57d8ea84@stanley.mountain Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs") Signed-off-by: Dan Carpenter Cc: Andrei Vagin Cc: Andrii Nakryiko Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Michał Mirosław Cc: Muhammad Usama Anjum Cc: Oscar Salvador Cc: Peter Xu Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e52bd96137a6..7eb010de39fe 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2665,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg, return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, - arg->vec_len * sizeof(struct page_region))) + size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ From 0740e54304dcd11cf2a8edb6764423eb2fed1c61 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 13 Nov 2024 23:07:11 +0800 Subject: [PATCH 07/10] mm, doc: update read_ahead_kb for MADV_HUGEPAGE MADV_HUGEPAGE is a new addition to readahead with behavior distinct from normal pages. To prevent confusion, we should update the documentation accordingly. Link: https://lkml.kernel.org/r/20241113150711.1685-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- Documentation/ABI/stable/sysfs-block | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index cea8856f798d..7a820a7d53aa 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -594,6 +594,9 @@ Description: [RW] Maximum number of kilobytes to read-ahead for filesystems on this block device. + For MADV_HUGEPAGE, the readahead size may exceed this setting + since its granularity is based on the hugepage size. + What: /sys/block//queue/rotational Date: January 2009 From 8ce41b0f9d77cca074df25afd39b86e2ee3aa68e Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Wed, 13 Nov 2024 16:32:35 +0800 Subject: [PATCH 08/10] mm: fix NULL pointer dereference in alloc_pages_bulk_noprof We triggered a NULL pointer dereference for ac.preferred_zoneref->zone in alloc_pages_bulk_noprof() when the task is migrated between cpusets. When cpuset is enabled, in prepare_alloc_pages(), ac->nodemask may be ¤t->mems_allowed. when first_zones_zonelist() is called to find preferred_zoneref, the ac->nodemask may be modified concurrently if the task is migrated between different cpusets. Assuming we have 2 NUMA Node, when traversing Node1 in ac->zonelist, the nodemask is 2, and when traversing Node2 in ac->zonelist, the nodemask is 1. As a result, the ac->preferred_zoneref points to NULL zone. In alloc_pages_bulk_noprof(), for_each_zone_zonelist_nodemask() finds a allowable zone and calls zonelist_node_idx(ac.preferred_zoneref), leading to NULL pointer dereference. __alloc_pages_noprof() fixes this issue by checking NULL pointer in commit ea57485af8f4 ("mm, page_alloc: fix check for NULL preferred_zone") and commit df76cee6bbeb ("mm, page_alloc: remove redundant checks from alloc fastpath"). To fix it, check NULL pointer for preferred_zoneref->zone. Link: https://lkml.kernel.org/r/20241113083235.166798-1-tujinjiang@huawei.com Fixes: 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") Signed-off-by: Jinjiang Tu Reviewed-by: Vlastimil Babka Cc: Alexander Lobakin Cc: David Hildenbrand Cc: Kefeng Wang Cc: Mel Gorman Cc: Nanyong Sun Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 216fbbfbedcf..b6958333054d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4607,7 +4607,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. */ - for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + z = ac.preferred_zoneref; + for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) { unsigned long mark; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && From 737f34137844d6572ab7d473c998c7f977ff30eb Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 14 Nov 2024 07:38:44 +0300 Subject: [PATCH 09/10] ocfs2: uncache inode which has failed entering the group Syzbot has reported the following BUG: kernel BUG at fs/ocfs2/uptodate.c:509! ... Call Trace: ? __die_body+0x5f/0xb0 ? die+0x9e/0xc0 ? do_trap+0x15a/0x3a0 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? do_error_trap+0x1dc/0x2c0 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? __pfx_do_error_trap+0x10/0x10 ? handle_invalid_op+0x34/0x40 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? exc_invalid_op+0x38/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? ocfs2_set_new_buffer_uptodate+0x2e/0x160 ? ocfs2_set_new_buffer_uptodate+0x144/0x160 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ocfs2_group_add+0x39f/0x15a0 ? __pfx_ocfs2_group_add+0x10/0x10 ? __pfx_lock_acquire+0x10/0x10 ? mnt_get_write_access+0x68/0x2b0 ? __pfx_lock_release+0x10/0x10 ? rcu_read_lock_any_held+0xb7/0x160 ? __pfx_rcu_read_lock_any_held+0x10/0x10 ? smack_log+0x123/0x540 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x226/0x2b0 ocfs2_ioctl+0x65e/0x7d0 ? __pfx_ocfs2_ioctl+0x10/0x10 ? smack_file_ioctl+0x29e/0x3a0 ? __pfx_smack_file_ioctl+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x43d/0x780 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10 ? __pfx_ocfs2_ioctl+0x10/0x10 __se_sys_ioctl+0xfb/0x170 do_syscall_64+0xf3/0x230 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... When 'ioctl(OCFS2_IOC_GROUP_ADD, ...)' has failed for the particular inode in 'ocfs2_verify_group_and_input()', corresponding buffer head remains cached and subsequent call to the same 'ioctl()' for the same inode issues the BUG() in 'ocfs2_set_new_buffer_uptodate()' (trying to cache the same buffer head of that inode). Fix this by uncaching the buffer head with 'ocfs2_remove_from_cache()' on error path in 'ocfs2_group_add()'. Link: https://lkml.kernel.org/r/20241114043844.111847-1-dmantipov@yandex.ru Fixes: 7909f2bf8353 ("[PATCH 2/2] ocfs2: Implement group add for online resize") Signed-off-by: Dmitry Antipov Reported-by: syzbot+453873f1588c2d75b447@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=453873f1588c2d75b447 Reviewed-by: Joseph Qi Cc: Dmitry Antipov Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/resize.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index c4a4016d3866..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -574,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: From d1aa0c04294e29883d65eac6c2f72fe95cc7c049 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2024 16:57:24 -0800 Subject: [PATCH 10/10] mm: revert "mm: shmem: fix data-race in shmem_getattr()" Revert d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") as suggested by Chuck [1]. It is causing deadlocks when accessing tmpfs over NFS. As Hugh commented, "added just to silence a syzbot sanitizer splat: added where there has never been any practical problem". Link: https://lkml.kernel.org/r/ZzdxKF39VEmXSSyN@tissot.1015granger.net [1] Fixes: d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") Acked-by: Hugh Dickins Cc: Chuck Lever Cc: Jeongjun Park Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e87f5d6799a7..568bb290bdce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,9 +1166,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - inode_lock_shared(inode); generic_fillattr(idmap, request_mask, inode, stat); - inode_unlock_shared(inode); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE;