// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec_load system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#include "kexec_internal.h"

static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
			     unsigned long nr_segments,
			     struct kexec_segment *segments,
			     unsigned long flags)
{
	int ret;
	struct kimage *image;
	bool kexec_on_panic = flags & KEXEC_ON_CRASH;

#ifdef CONFIG_CRASH_DUMP
	if (kexec_on_panic) {
		/* Verify we have a valid entry point */
		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
		    (entry > phys_to_boot_phys(crashk_res.end)))
			return -EADDRNOTAVAIL;
	}
#endif

	/* Allocate and initialize a controlling structure */
	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->start = entry;
	image->nr_segments = nr_segments;
	memcpy(image->segment, segments, nr_segments * sizeof(*segments));

#ifdef CONFIG_CRASH_DUMP
	if (kexec_on_panic) {
		/* Enable the special crash kernel control page allocation policy. */
		image->control_page = crashk_res.start;
		image->type = KEXEC_TYPE_CRASH;
	}
#endif

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_image;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_image;
	}

	if (!kexec_on_panic) {
		image->swap_page = kimage_alloc_control_pages(image, 0);
		if (!image->swap_page) {
			pr_err("Could not allocate swap buffer\n");
			goto out_free_control_pages;
		}
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_image:
	kfree(image);
	return ret;
}

static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
			 struct kexec_segment *segments, unsigned long flags)
{
	struct kimage **dest_image, *image;
	unsigned long i;
	int ret;

	/*
	 * Because we write directly to the reserved memory region when loading
	 * crash kernels we need a serialization here to prevent multiple crash
	 * kernels from attempting to load simultaneously.
	 */
	if (!kexec_trylock())
		return -EBUSY;

#ifdef CONFIG_CRASH_DUMP
	if (flags & KEXEC_ON_CRASH) {
		dest_image = &kexec_crash_image;
		if (kexec_crash_image)
			arch_kexec_unprotect_crashkres();
	} else
#endif
		dest_image = &kexec_image;

	if (nr_segments == 0) {
		/* Uninstall image */
		kimage_free(xchg(dest_image, NULL));
		ret = 0;
		goto out_unlock;
	}
	if (flags & KEXEC_ON_CRASH) {
		/*
		 * Loading another kernel to switch to if this one
		 * crashes.  Free any current crash dump kernel before
		 * we corrupt it.
		 */
		kimage_free(xchg(&kexec_crash_image, NULL));
	}

	ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
	if (ret)
		goto out_unlock;

	if (flags & KEXEC_PRESERVE_CONTEXT)
		image->preserve_context = 1;

#ifdef CONFIG_CRASH_HOTPLUG
	if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags))
		image->hotplug_support = 1;
#endif

	ret = machine_kexec_prepare(image);
	if (ret)
		goto out;

	/*
	 * Some architectures (like s390) may touch the crash memory before
	 * machine_kexec_prepare(), so we must copy the vmcoreinfo data after it.
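	 *
	 * The safe copy made by kimage_crash_copy_vmcoreinfo() lives inside
	 * the crash-reserved region, so the dump-capture kernel can still
	 * read vmcoreinfo even if the first kernel corrupts its own copy
	 * while crashing.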
	 */
	ret = kimage_crash_copy_vmcoreinfo(image);
	if (ret)
		goto out;

	for (i = 0; i < nr_segments; i++) {
		ret = kimage_load_segment(image, &image->segment[i]);
		if (ret)
			goto out;
	}

	kimage_terminate(image);

	ret = machine_kexec_post_load(image);
	if (ret)
		goto out;

	/* Install the new kernel and uninstall the old */
	image = xchg(dest_image, image);

out:
#ifdef CONFIG_CRASH_DUMP
	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
		arch_kexec_protect_crashkres();
#endif

	kimage_free(image);
out_unlock:
	kexec_unlock();
	return ret;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination, and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */

static inline int kexec_load_check(unsigned long nr_segments,
				   unsigned long flags)
{
	int image_type = (flags & KEXEC_ON_CRASH) ?
			 KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!kexec_load_permitted(image_type))
		return -EPERM;

	/* Permit LSMs and IMA to fail the kexec */
	result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
	if (result < 0)
		return result;

	/*
	 * kexec can be used to circumvent module loading restrictions, so
	 * prevent loading in that case
	 */
	result = security_locked_down(LOCKDOWN_KEXEC);
	if (result)
		return result;

	/*
	 * Verify we have a legal set of flags.
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/*
	 * Put an artificial cap on the number of segments passed to
	 * kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	return 0;
}

SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kexec_segment *ksegments;
	unsigned long result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	ksegments = memdup_array_user(segments, nr_segments,
				      sizeof(ksegments[0]));
	if (IS_ERR(ksegments))
		return PTR_ERR(ksegments);

	result = do_kexec_load(entry, nr_segments, ksegments, flags);
	kfree(ksegments);

	return result;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment *ksegments;
	unsigned long i, result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
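	 *
	 * For a 32-bit caller on a 64-bit kernel, KEXEC_ARCH_DEFAULT would
	 * resolve to the 64-bit native architecture, which the caller by
	 * definition does not speak, so an explicit KEXEC_ARCH_* value is
	 * required here.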
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
				  GFP_KERNEL);
	if (!ksegments)
		return -ENOMEM;

	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			goto fail;
		ksegments[i].buf   = compat_ptr(in.buf);
		ksegments[i].bufsz = in.bufsz;
		ksegments[i].mem   = in.mem;
		ksegments[i].memsz = in.memsz;
	}

	result = do_kexec_load(entry, nr_segments, ksegments, flags);

fail:
	kfree(ksegments);
	return result;
}
#endif
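
/*
 * Illustrative only, and not part of the kernel tree: a minimal userspace
 * sketch of invoking the kexec_load syscall defined above.  There is no
 * glibc wrapper, so syscall(2) is used directly; the payload, addresses
 * and sizes below are placeholder assumptions (a real loader such as
 * kexec-tools derives them from the kernel image being loaded), and the
 * example is kept inside this comment so it does not affect compilation
 * of this file.
 *
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/kexec.h>
 *
 *	int main(void)
 *	{
 *		static char buf[4096];			// hypothetical image data
 *		struct kexec_segment seg = {
 *			.buf   = buf,			// source in our address space
 *			.bufsz = sizeof(buf),		// must be <= memsz
 *			.mem   = (void *)0x100000,	// placeholder destination
 *			.memsz = 4096,			// start and end page aligned
 *		};
 *
 *		// One segment, native architecture; needs CAP_SYS_BOOT.
 *		if (syscall(SYS_kexec_load, 0x100000UL, 1UL, &seg,
 *			    KEXEC_ARCH_DEFAULT))
 *			perror("kexec_load");
 *		return 0;
 *	}
 *
 * Calling with nr_segments == 0 uninstalls a previously loaded image, and
 * adding KEXEC_ON_CRASH to the flags loads into the crash-reserved region
 * instead, both handled by do_kexec_load() above.
 */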