diff --git a/fs/file.c b/fs/file.c index 5125607d040a..eb093e736972 100644 --- a/fs/file.c +++ b/fs/file.c @@ -272,59 +272,45 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) return test_bit(fd, fdt->open_fds); } -static unsigned int count_open_files(struct fdtable *fdt) -{ - unsigned int size = fdt->max_fds; - unsigned int i; - - /* Find the last open fd */ - for (i = size / BITS_PER_LONG; i > 0; ) { - if (fdt->open_fds[--i]) - break; - } - i = (i + 1) * BITS_PER_LONG; - return i; -} - /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * - * 'max_fds' will normally already be properly aligned, but it - * turns out that in the close_range() -> __close_range() -> - * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end - * up having a 'max_fds' value that isn't already aligned. - * - * Rather than make close_range() have to worry about this, - * just make that BITS_PER_LONG alignment be part of a sane - * fdtable size. Becuase that's really what it is. + * punch_hole is optional - when close_range() is asked to unshare + * and close, we don't need to copy descriptors in that range, so + * a smaller cloned descriptor table might suffice if the last + * currently opened descriptor falls into that range. */ -static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { - unsigned int count; + unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); - count = count_open_files(fdt); - if (max_fds < NR_OPEN_DEFAULT) - max_fds = NR_OPEN_DEFAULT; - return ALIGN(min(count, max_fds), BITS_PER_LONG); + if (last == fdt->max_fds) + return NR_OPEN_DEFAULT; + if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { + last = find_last_bit(fdt->open_fds, punch_hole->from); + if (last == punch_hole->from) + return NR_OPEN_DEFAULT; + } + return ALIGN(last + 1, BITS_PER_LONG); } /* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. + * Allocate a new descriptor table and copy contents from the passed in + * instance. Returns a pointer to cloned table on success, ERR_PTR() + * on failure. For 'punch_hole' see sane_fdtable_size(). */ -struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; + int error; - *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) - goto out; + return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); @@ -341,7 +327,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. @@ -354,14 +340,14 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { - *errorp = -ENOMEM; + error = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); - *errorp = -EMFILE; + error = -EMFILE; goto out_release; } @@ -372,7 +358,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); @@ -406,8 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int out_release: kmem_cache_free(files_cachep, newf); -out: - return NULL; + return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) @@ -748,37 +733,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - if (flags & CLOSE_RANGE_UNSHARE) { - int ret; - unsigned int max_unshare_fds = NR_OPEN_MAX; + if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { + struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC)) { - /* - * If the requested range is greater than the current - * maximum, we're closing everything so only copy all - * file descriptors beneath the lowest file descriptor. - */ - rcu_read_lock(); - if (max_fd >= last_fd(files_fdtable(cur_fds))) - max_unshare_fds = fd; - rcu_read_unlock(); - } - - ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); - if (ret) - return ret; + if (flags & CLOSE_RANGE_CLOEXEC) + punch_hole = NULL; + fds = dup_fd(cur_fds, punch_hole); + if (IS_ERR(fds)) + return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ - if (fds) - swap(cur_fds, fds); + swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 2944d4aa413b..b1c5722f2b3c 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,7 +22,6 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG -#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -106,7 +105,10 @@ struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); -struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; +struct fd_range { + unsigned int from, to; +}; +struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -115,8 +117,6 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); -extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/kernel/fork.c b/kernel/fork.c index 60c0b4868fd4..89ceb4a68af2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1756,33 +1756,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -3238,17 +3235,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3306,7 +3302,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3398,7 +3394,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); + error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error;