1

vfs-6.12.file

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZuQEwAAKCRCRxhvAZXjc
 osS0AQCgIpvey9oW5DMyMw6Bv0hFMRv95gbNQZfHy09iK+NMNAD9GALhb/4cMIVB
 7YrZGXEz454lpgcs8AnrOVjVNfctOQg=
 =e9s9
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs file updates from Christian Brauner:
 "This is the work to cleanup and shrink struct file significantly.

  Right now, (focusing on x86) struct file is 232 bytes. After this
  series struct file will be 184 bytes aka 3 cachelines and a spare 8
  bytes for future extensions at the end of the struct.

  With struct file being as ubiquitous as it is this should make a
  difference for file heavy workloads and allow further optimizations in
  the future.

   - struct fown_struct was embedded into struct file letting it take up
     32 bytes in total when really it shouldn't even be embedded in
     struct file in the first place. Instead, actual users of struct
     fown_struct now allocate the struct on demand. This frees up 24
     bytes.

   - Move struct file_ra_state into the union containing the cleanup hooks
     and move f_iocb_flags out of the union. This closes a 4 byte hole
     we created earlier and brings struct file to 192 bytes. Which means
     struct file is 3 cachelines and we managed to shrink it by 40
     bytes.

   - Reorder struct file so that nothing crosses a cacheline.

     I suspect that in the future we will end up reordering some members
     to mitigate false sharing issues or just because someone does
     actually provide really good perf data.

   - Shrinking struct file to 192 bytes is only part of the work.

     Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache
     is created with SLAB_TYPESAFE_BY_RCU the free pointer must be
     located outside of the object because the cache doesn't know what
     part of the memory can safely be overwritten as it may be needed to
     prevent object recycling.

     That has the consequence that SLAB_TYPESAFE_BY_RCU may end up
     adding a new cacheline.

     So this also contains work to add a new kmem_cache_create_rcu()
     function that allows the caller to specify an offset where the
     freelist pointer is supposed to be placed. Thus avoiding the
     implicit addition of a fourth cacheline.

   - And finally this removes the f_version member in struct file.

     The f_version member isn't particularly well-defined. It is mainly
     used as a cookie to detect concurrent seeks when iterating
     directories. But it is also abused by some subsystems for
     completely unrelated things.

     It is mostly a directory and filesystem specific thing that doesn't
     really need to live in struct file and with its wonky semantics it
     really lacks a specific function.

     For pipes, f_version is (ab)used to defer poll notifications until
     a write has happened. And struct pipe_inode_info is used by
     multiple struct files in their ->private_data so there's no chance
     of pushing that down into file->private_data without introducing
     another pointer indirection.

     But pipes don't rely on f_pos_lock so this adds a union into struct
     file encompassing f_pos_lock and a pipe specific f_pipe member that
     pipes can use. This union of course can be extended to other file
     types and is similar to what we do in struct inode already"

* tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits)
  fs: remove f_version
  pipe: use f_pipe
  fs: add f_pipe
  ubifs: store cookie in private data
  ufs: store cookie in private data
  udf: store cookie in private data
  proc: store cookie in private data
  ocfs2: store cookie in private data
  input: remove f_version abuse
  ext4: store cookie in private data
  ext2: store cookie in private data
  affs: store cookie in private data
  fs: add generic_llseek_cookie()
  fs: use must_set_pos()
  fs: add must_set_pos()
  fs: add vfs_setpos_cookie()
  s390: remove unused f_version
  ceph: remove unused f_version
  adi: remove unused f_version
  mm: Removed @freeptr_offset to prevent doc warning
  ...
This commit is contained in:
Linus Torvalds 2024-09-16 09:14:02 +02:00
commit 3352633ce6
33 changed files with 754 additions and 281 deletions

View File

@ -190,7 +190,6 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence)
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
ret = offset;
}

View File

@ -1079,33 +1079,31 @@ static inline void input_wakeup_procfs_readers(void)
wake_up(&input_devices_poll_wait);
}
/*
 * Per-open state used by the input procfs seq_file handlers; it is
 * reached through seq->private.
 */
struct input_seq_state {
/* Entry number: input_handlers_seq_next() stores *pos + 1 here and
 * input_handlers_seq_show() prints it as "Number=%u". */
unsigned short pos;
/* Cleared when mutex_lock_interruptible(&input_mutex) fails in a
 * ->start handler; input_seq_stop() only unlocks when this is set. */
bool mutex_acquired;
/* Last value of the global input_devices_state seen by
 * input_proc_devices_poll(); a mismatch reports EPOLLIN | EPOLLRDNORM. */
int input_devices_state;
};
static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait)
{
struct seq_file *seq = file->private_data;
struct input_seq_state *state = seq->private;
poll_wait(file, &input_devices_poll_wait, wait);
if (file->f_version != input_devices_state) {
file->f_version = input_devices_state;
if (state->input_devices_state != input_devices_state) {
state->input_devices_state = input_devices_state;
return EPOLLIN | EPOLLRDNORM;
}
return 0;
}
union input_seq_state {
struct {
unsigned short pos;
bool mutex_acquired;
};
void *p;
};
static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
int error;
/* We need to fit into seq->private pointer */
BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private));
error = mutex_lock_interruptible(&input_mutex);
if (error) {
state->mutex_acquired = false;
@ -1124,7 +1122,7 @@ static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void input_seq_stop(struct seq_file *seq, void *v)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
if (state->mutex_acquired)
mutex_unlock(&input_mutex);
@ -1210,7 +1208,8 @@ static const struct seq_operations input_devices_seq_ops = {
static int input_proc_devices_open(struct inode *inode, struct file *file)
{
return seq_open(file, &input_devices_seq_ops);
return seq_open_private(file, &input_devices_seq_ops,
sizeof(struct input_seq_state));
}
static const struct proc_ops input_devices_proc_ops = {
@ -1218,17 +1217,14 @@ static const struct proc_ops input_devices_proc_ops = {
.proc_poll = input_proc_devices_poll,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
.proc_release = seq_release_private,
};
static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
int error;
/* We need to fit into seq->private pointer */
BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private));
error = mutex_lock_interruptible(&input_mutex);
if (error) {
state->mutex_acquired = false;
@ -1243,7 +1239,7 @@ static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos)
static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
state->pos = *pos + 1;
return seq_list_next(v, &input_handler_list, pos);
@ -1252,7 +1248,7 @@ static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int input_handlers_seq_show(struct seq_file *seq, void *v)
{
struct input_handler *handler = container_of(v, struct input_handler, node);
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name);
if (handler->filter)
@ -1273,14 +1269,15 @@ static const struct seq_operations input_handlers_seq_ops = {
static int input_proc_handlers_open(struct inode *inode, struct file *file)
{
return seq_open(file, &input_handlers_seq_ops);
return seq_open_private(file, &input_handlers_seq_ops,
sizeof(struct input_seq_state));
}
static const struct proc_ops input_handlers_proc_ops = {
.proc_open = input_proc_handlers_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
.proc_release = seq_release_private,
};
static int __init input_proc_init(void)

View File

@ -3452,6 +3452,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on)
struct tun_file *tfile = file->private_data;
int ret;
if (on) {
ret = file_f_owner_allocate(file);
if (ret)
goto out;
}
if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
goto out;

View File

@ -186,9 +186,6 @@ static loff_t hmcdrv_dev_seek(struct file *fp, loff_t pos, int whence)
if (pos < 0)
return -EINVAL;
if (fp->f_pos != pos)
++fp->f_version;
fp->f_pos = pos;
return pos;
}

View File

@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on)
if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync"))
goto out;
if (on) {
retval = file_f_owner_allocate(filp);
if (retval)
goto out;
}
retval = fasync_helper(fd, filp, on, &tty->fasync);
if (retval <= 0)
goto out;

View File

@ -17,13 +17,44 @@
#include <linux/iversion.h>
#include "affs.h"
/* Per-open directory state kept in file->private_data for affs. */
struct affs_dir_data {
/* Value of ino saved at the end of affs_readdir() so the next call can
 * jump back "inside"; reset to 0 when ctx->pos < 2. */
unsigned long ino;
/* inode_query_iversion() snapshot taken by affs_readdir(); compared via
 * inode_eq_iversion() and handed to generic_llseek_cookie() on seek. */
u64 cookie;
};
static int affs_readdir(struct file *, struct dir_context *);
/*
 * llseek for affs directories: delegate to generic_llseek_cookie(),
 * passing the cookie stored in this open file's private data.
 */
static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct affs_dir_data *dir_data = file->private_data;
	u64 *cookie = &dir_data->cookie;

	return generic_llseek_cookie(file, offset, whence, cookie);
}
static int affs_dir_open(struct inode *inode, struct file *file)
{
struct affs_dir_data *data;
data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL);
if (!data)
return -ENOMEM;
file->private_data = data;
return 0;
}
/* Release an affs directory: free the state hanging off file->private_data. */
static int affs_dir_release(struct inode *inode, struct file *file)
{
	void *dir_data = file->private_data;

	kfree(dir_data);
	return 0;
}
const struct file_operations affs_dir_operations = {
.open = affs_dir_open,
.read = generic_read_dir,
.llseek = generic_file_llseek,
.llseek = affs_dir_llseek,
.iterate_shared = affs_readdir,
.fsync = affs_file_fsync,
.release = affs_dir_release,
};
/*
@ -45,6 +76,7 @@ static int
affs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct affs_dir_data *data = file->private_data;
struct super_block *sb = inode->i_sb;
struct buffer_head *dir_bh = NULL;
struct buffer_head *fh_bh = NULL;
@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
if (ctx->pos < 2) {
file->private_data = (void *)0;
data->ino = 0;
if (!dir_emit_dots(file, ctx))
return 0;
}
@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
ino = (u32)(long)file->private_data;
if (ino && inode_eq_iversion(inode, file->f_version)) {
ino = data->ino;
if (ino && inode_eq_iversion(inode, data->cookie)) {
pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@ -131,8 +163,8 @@ inside:
} while (ino);
}
done:
file->f_version = inode_query_iversion(inode);
file->private_data = (void *)(long)ino;
data->cookie = inode_query_iversion(inode);
data->ino = ino;
affs_brelse(fh_bh);
out_brelse_dir:

View File

@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
dfi->file_info.flags &= ~CEPH_F_ATEND;
}
retval = offset;

View File

@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data);
bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode_query_iversion(inode);
*(u64 *)file->private_data = inode_query_iversion(inode);
need_revalidate = false;
}
de = (ext2_dirent *)(kaddr+offset);
@ -703,8 +703,30 @@ not_empty:
return 0;
}
static int ext2_dir_open(struct inode *inode, struct file *file)
{
file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
return 0;
}
/* Release an ext2 directory: free the cookie stored in file->private_data. */
static int ext2_dir_release(struct inode *inode, struct file *file)
{
	void *cookie = file->private_data;

	kfree(cookie);
	return 0;
}
/*
 * llseek for ext2 directories: delegate to generic_llseek_cookie() with
 * the u64 cookie that file->private_data points at.
 */
static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	u64 *cookie = (u64 *)file->private_data;

	return generic_llseek_cookie(file, offset, whence, cookie);
}
const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.open = ext2_dir_open,
.release = ext2_dir_release,
.llseek = ext2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,

View File

@ -133,6 +133,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
struct dir_private_info *info = file->private_data;
err = fscrypt_prepare_readdir(inode);
if (err)
@ -229,7 +230,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
if (!inode_eq_iversion(inode, file->f_version)) {
if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@ -249,7 +250,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
file->f_version = inode_query_iversion(inode);
info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < inode->i_size
@ -384,6 +385,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
struct dir_private_info *info = file->private_data;
int dx_dir = is_dx_dir(inode);
loff_t ret, htree_max = ext4_get_htree_eof(file);
@ -392,7 +394,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
htree_max, htree_max);
else
ret = ext4_llseek(file, offset, whence);
file->f_version = inode_peek_iversion(inode) - 1;
info->cookie = inode_peek_iversion(inode) - 1;
return ret;
}
@ -429,18 +431,15 @@ static void free_rb_tree_fname(struct rb_root *root)
*root = RB_ROOT;
}
static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
loff_t pos)
static void ext4_htree_init_dir_info(struct file *filp, loff_t pos)
{
struct dir_private_info *p;
struct dir_private_info *p = filp->private_data;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
p->curr_hash = pos2maj_hash(filp, pos);
p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
if (is_dx_dir(file_inode(filp)) && !p->initialized) {
p->curr_hash = pos2maj_hash(filp, pos);
p->curr_minor_hash = pos2min_hash(filp, pos);
p->initialized = true;
}
}
void ext4_htree_free_dir_info(struct dir_private_info *p)
@ -552,12 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct fname *fname;
int ret = 0;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
file->private_data = info;
}
ext4_htree_init_dir_info(file, ctx->pos);
if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
@ -590,10 +584,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
* cached entries.
*/
if ((!info->curr_node) ||
!inode_eq_iversion(inode, file->f_version)) {
!inode_eq_iversion(inode, info->cookie)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
file->f_version = inode_query_iversion(inode);
info->cookie = inode_query_iversion(inode);
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
@ -664,7 +658,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
return 0;
}
static int ext4_dir_open(struct inode *inode, struct file *file)
{
struct dir_private_info *info;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return -ENOMEM;
file->private_data = info;
return 0;
}
const struct file_operations ext4_dir_operations = {
.open = ext4_dir_open,
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext4_readdir,

View File

@ -2553,6 +2553,8 @@ struct dir_private_info {
__u32 curr_hash;
__u32 curr_minor_hash;
__u32 next_hash;
u64 cookie;
bool initialized;
};
/* calculate the first block number of the group */

View File

@ -1460,6 +1460,7 @@ int ext4_read_inline_dir(struct file *file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
struct dir_private_info *info = file->private_data;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@ -1503,12 +1504,12 @@ int ext4_read_inline_dir(struct file *file,
extra_size = extra_offset + inline_size;
/*
* If the version has changed since the last call to
* If the cookie has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the inline
* dir to make sure.
*/
if (!inode_eq_iversion(inode, file->f_version)) {
if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
@ -1540,7 +1541,7 @@ int ext4_read_inline_dir(struct file *file,
}
offset = i;
ctx->pos = offset;
file->f_version = inode_query_iversion(inode);
info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < extra_size) {

View File

@ -33,6 +33,8 @@
#include <asm/siginfo.h>
#include <linux/uaccess.h>
#include "internal.h"
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned int arg)
@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg)
return error;
}
/*
 * Ensure @file has a file->f_owner struct, allocating one if it is
 * missing and coping with a concurrent allocation.
 *
 * Returns 0 if an owner struct is present on return, -ENOMEM if the
 * allocation failed.
 */
int file_f_owner_allocate(struct file *file)
{
	struct fown_struct *new_owner;

	/* Nothing to do when an owner struct is already attached. */
	if (file_f_owner(file))
		return 0;

	new_owner = kzalloc(sizeof(*new_owner), GFP_KERNEL);
	if (!new_owner)
		return -ENOMEM;

	new_owner->file = file;
	rwlock_init(&new_owner->lock);

	/*
	 * Install only if the slot is still NULL. A non-NULL previous
	 * value means someone else raced us, so drop our allocation.
	 */
	if (unlikely(cmpxchg(&file->f_owner, NULL, new_owner)))
		kfree(new_owner);

	return 0;
}
EXPORT_SYMBOL(file_f_owner_allocate);
/*
 * Tear down the owner struct of @file, if there is one: drop the pid
 * reference it holds and free the struct itself.
 */
void file_f_owner_release(struct file *file)
{
	struct fown_struct *owner = file_f_owner(file);

	if (!owner)
		return;

	put_pid(owner->pid);
	kfree(owner);
}
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
write_lock_irq(&filp->f_owner.lock);
if (force || !filp->f_owner.pid) {
put_pid(filp->f_owner.pid);
filp->f_owner.pid = get_pid(pid);
filp->f_owner.pid_type = type;
struct fown_struct *f_owner;
f_owner = file_f_owner(filp);
if (WARN_ON_ONCE(!f_owner))
return;
write_lock_irq(&f_owner->lock);
if (force || !f_owner->pid) {
put_pid(f_owner->pid);
f_owner->pid = get_pid(pid);
f_owner->pid_type = type;
if (pid) {
const struct cred *cred = current_cred();
filp->f_owner.uid = cred->uid;
filp->f_owner.euid = cred->euid;
f_owner->uid = cred->uid;
f_owner->euid = cred->euid;
}
}
write_unlock_irq(&filp->f_owner.lock);
write_unlock_irq(&f_owner->lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force)
struct pid *pid = NULL;
int ret = 0;
might_sleep();
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force)
who = -who;
}
ret = file_f_owner_allocate(filp);
if (ret)
return ret;
rcu_read_lock();
if (who) {
pid = find_vpid(who);
@ -152,16 +202,21 @@ void f_delown(struct file *filp)
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
struct fown_struct *f_owner;
read_lock_irq(&filp->f_owner.lock);
f_owner = file_f_owner(filp);
if (!f_owner)
return pid;
read_lock_irq(&f_owner->lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
pid = pid_vnr(filp->f_owner.pid);
if (filp->f_owner.pid_type == PIDTYPE_PGID)
if (pid_task(f_owner->pid, f_owner->pid_type)) {
pid = pid_vnr(f_owner->pid);
if (f_owner->pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
read_unlock_irq(&filp->f_owner.lock);
read_unlock_irq(&f_owner->lock);
return pid;
}
@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
return -EINVAL;
}
ret = file_f_owner_allocate(filp);
if (ret)
return ret;
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
struct fown_struct *f_owner;
enum pid_type pid_type = PIDTYPE_PID;
read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
owner.pid = pid_vnr(filp->f_owner.pid);
rcu_read_unlock();
switch (filp->f_owner.pid_type) {
f_owner = file_f_owner(filp);
if (f_owner) {
read_lock_irq(&f_owner->lock);
rcu_read_lock();
if (pid_task(f_owner->pid, f_owner->pid_type))
owner.pid = pid_vnr(f_owner->pid);
rcu_read_unlock();
pid_type = f_owner->pid_type;
}
switch (pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
ret = -EINVAL;
break;
}
read_unlock_irq(&filp->f_owner.lock);
if (f_owner)
read_unlock_irq(&f_owner->lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@ -248,14 +315,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
struct fown_struct *f_owner;
uid_t __user *dst = (void __user *)arg;
uid_t src[2];
uid_t src[2] = {0, 0};
int err;
read_lock_irq(&filp->f_owner.lock);
src[0] = from_kuid(user_ns, filp->f_owner.uid);
src[1] = from_kuid(user_ns, filp->f_owner.euid);
read_unlock_irq(&filp->f_owner.lock);
f_owner = file_f_owner(filp);
if (f_owner) {
read_lock_irq(&f_owner->lock);
src[0] = from_kuid(user_ns, f_owner->uid);
src[1] = from_kuid(user_ns, f_owner->euid);
read_unlock_irq(&f_owner->lock);
}
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
@ -349,6 +420,30 @@ static long f_created_query(const struct file *filp)
return !!(filp->f_mode & FMODE_CREATED);
}
/*
 * Get or set the signal number stored in the file's owner struct.
 *
 * @filp:   file to operate on
 * @signum: signal number to store (only used when @setsig is true)
 * @setsig: true to store @signum, false to read the current value
 *
 * Setting validates @signum and allocates the owner struct on demand;
 * it returns 0, -EINVAL for an invalid signal, or the allocation error.
 * Getting returns the stored signum, or 0 when no owner struct exists.
 */
static int f_owner_sig(struct file *filp, int signum, bool setsig)
{
	struct fown_struct *owner;
	int err;

	might_sleep();

	if (!setsig) {
		/* Read path never allocates. */
		owner = file_f_owner(filp);
		return owner ? owner->signum : 0;
	}

	if (!valid_signal(signum))
		return -EINVAL;

	err = file_f_owner_allocate(filp);
	if (err)
		return err;

	owner = file_f_owner(filp);
	owner->signum = signum;
	return 0;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@ -430,15 +525,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
err = filp->f_owner.signum;
err = f_owner_sig(filp, 0, false);
break;
case F_SETSIG:
/* arg == 0 restores default behaviour. */
if (!valid_signal(argi)) {
break;
}
err = 0;
filp->f_owner.signum = argi;
err = f_owner_sig(filp, argi, true);
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
@ -854,14 +944,19 @@ static void send_sigurg_to_task(struct task_struct *p,
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
int send_sigurg(struct fown_struct *fown)
int send_sigurg(struct file *file)
{
struct fown_struct *fown;
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
fown = file_f_owner(file);
if (!fown)
return 0;
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
@ -1037,13 +1132,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
fown = file_f_owner(fa->fa_file);
if (!fown)
goto next;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
next:
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}

View File

@ -156,8 +156,14 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
/*
* Note that f_pos_lock is only used for files raising
* FMODE_ATOMIC_POS and directories. Other files such as pipes
* don't need it and since f_pos_lock is in a union may reuse
* the space for other purposes. They are expected to initialize
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags);
@ -428,7 +434,7 @@ static void __fput(struct file *file)
cdev_put(inode->i_cdev);
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
file_f_owner_release(file);
put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
@ -515,9 +521,9 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
SLAB_PANIC | SLAB_ACCOUNT, NULL);
filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file),
offsetof(struct file, f_freeptr),
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

View File

@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path)
{
return path->mnt->mnt_root == path->dentry;
}
void file_f_owner_release(struct file *file);

View File

@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
struct file *filp = fl->c.flc_file;
f_delown(filp);
filp->f_owner.signum = 0;
file_f_owner(filp)->signum = 0;
fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr
lease = *flp;
trace_generic_add_lease(inode, lease);
error = file_f_owner_allocate(filp);
if (error)
return error;
/* Note that arg is never F_UNLCK here */
ctx = locks_get_lock_context(inode, arg);
if (!ctx)

View File

@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
prev = &dn->dn_next;
continue;
}
fown = &dn->dn_filp->f_owner;
fown = file_f_owner(dn->dn_filp);
send_sigio(fown, dn->dn_fd, POLL_MSG);
if (dn->dn_mask & FS_DN_MULTISHOT)
prev = &dn->dn_next;
@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
error = file_f_owner_allocate(filp);
if (error)
goto out_err;
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);

View File

@ -1932,6 +1932,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
struct inode *inode = file_inode(file);
struct ocfs2_file_private *fp = file->private_data;
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
@ -1952,7 +1953,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
goto bail_nolock;
}
error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
if (error)

View File

@ -2751,6 +2751,13 @@ out_unlock:
return remapped > 0 ? remapped : ret;
}
/*
 * llseek for ocfs2 directories: delegate to generic_llseek_cookie(),
 * passing the cookie kept in the file's struct ocfs2_file_private.
 */
static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct ocfs2_file_private *priv = file->private_data;
	u64 *cookie = &priv->cookie;

	return generic_llseek_cookie(file, offset, whence, cookie);
}
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@ -2798,7 +2805,7 @@ const struct file_operations ocfs2_fops = {
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
@ -2844,7 +2851,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
};
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,

View File

@ -20,6 +20,7 @@ struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
u64 cookie;
struct file *fp_file;
struct mutex fp_mutex;
struct ocfs2_lock_res fp_flock;

View File

@ -686,7 +686,7 @@ pipe_poll(struct file *filp, poll_table *wait)
if (filp->f_mode & FMODE_READ) {
if (!pipe_empty(head, tail))
mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_version != pipe->w_counter)
if (!pipe->writers && filp->f_pipe != pipe->w_counter)
mask |= EPOLLHUP;
}
@ -945,6 +945,7 @@ int create_pipe_files(struct file **res, int flags)
}
f->private_data = inode->i_pipe;
f->f_pipe = 0;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
&pipefifo_fops);
@ -954,6 +955,7 @@ int create_pipe_files(struct file **res, int flags)
return PTR_ERR(res[0]);
}
res[0]->private_data = inode->i_pipe;
res[0]->f_pipe = 0;
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
@ -1108,7 +1110,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
int ret;
filp->f_version = 0;
filp->f_pipe = 0;
spin_lock(&inode->i_lock);
if (inode->i_pipe) {
@ -1155,7 +1157,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
if ((filp->f_flags & O_NONBLOCK)) {
/* suppress EPOLLHUP until we have
* seen a writer */
filp->f_version = pipe->w_counter;
filp->f_pipe = pipe->w_counter;
} else {
if (wait_for_partner(pipe, &pipe->w_counter))
goto err_rd;

View File

@ -3868,12 +3868,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
/* We cache the tgid value that the last readdir call couldn't
* return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
tid = (int)file->f_version;
file->f_version = 0;
tid = (int)(intptr_t)file->private_data;
file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
@ -3888,7 +3888,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
file->f_version = (u64)tid;
file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@ -3913,6 +3913,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
return 0;
}
/*
* proc_task_readdir() set @file->private_data to a positive integer
* value, so casting that to u64 is safe. generic_llseek_cookie() will
* set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
* here to catch any unexpected change in behavior either in
* proc_task_readdir() or generic_llseek_cookie().
*/
static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
{
	/* Unpack the integer that is stored directly in the pointer. */
	intptr_t cached = (intptr_t)file->private_data;
	u64 cookie = (u64)cached;
	loff_t ret;

	ret = generic_llseek_cookie(file, offset, whence, &cookie);
	WARN_ON_ONCE(cookie > INT_MAX);

	/* serialized by f_pos_lock */
	file->private_data = (void *)(intptr_t)cookie;
	return ret;
}
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@ -3923,7 +3941,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
.llseek = generic_file_llseek,
.llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)

View File

@ -39,6 +39,35 @@ static inline bool unsigned_offsets(struct file *file)
return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}
/**
 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
 * @file:	file structure in question
 * @offset:	file offset to seek to
 * @maxsize:	maximum file size
 * @cookie:	cookie to reset, may be NULL
 *
 * Validate @offset and, when it differs from the current file position,
 * store it in @file->f_pos and zero *@cookie (if @cookie is non-NULL) to
 * record that a seek happened. Seeking to the current position leaves
 * both the position and the cookie untouched.
 *
 * Return: @offset on success; -EINVAL if @offset is negative on a file
 * without FOP_UNSIGNED_OFFSET or if @offset exceeds @maxsize.
 */
static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
				loff_t maxsize, u64 *cookie)
{
	if ((offset < 0 && !unsigned_offsets(file)) || offset > maxsize)
		return -EINVAL;

	/* Nothing moves: keep f_pos and the cookie as they are. */
	if (offset == file->f_pos)
		return offset;

	file->f_pos = offset;
	if (cookie)
		*cookie = 0;

	return offset;
}
/**
* vfs_setpos - update the file offset for lseek
* @file: file structure in question
@ -53,19 +82,63 @@ static inline bool unsigned_offsets(struct file *file)
*/
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
if (offset > maxsize)
return -EINVAL;
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
return offset;
return vfs_setpos_cookie(file, offset, maxsize, NULL);
}
EXPORT_SYMBOL(vfs_setpos);
/**
 * must_set_pos - check whether f_pos has to be updated
 * @file:	file to seek on
 * @offset:	in/out offset; rewritten according to @whence
 * @whence:	type of seek operation
 * @eof:	end of file
 *
 * Translate the caller's (@offset, @whence) pair into the value the
 * caller should work with and report whether f_pos needs to be written:
 *
 *  - SEEK_END adds @eof to *@offset.
 *  - SEEK_CUR with a zero *@offset is a pure position query: *@offset
 *    becomes the current f_pos and no update is required. A non-zero
 *    SEEK_CUR offset is left relative for the caller to resolve.
 *  - SEEK_DATA and SEEK_HOLE fail with -ENXIO when *@offset is at or
 *    beyond @eof; SEEK_HOLE otherwise moves *@offset to @eof.
 *  - Any other @whence leaves *@offset unchanged.
 *
 * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
 * updated, and negative error code on failure.
 */
static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
	if (whence == SEEK_END) {
		*offset += eof;
		return 1;
	}

	if (whence == SEEK_CUR) {
		if (*offset != 0)
			return 1;
		/*
		 * lseek(fd, 0, SEEK_CUR) only queries the position. Do not
		 * write the "same" f_pos value back to the file because a
		 * concurrent read(), write() or lseek() might have altered it.
		 */
		*offset = file->f_pos;
		return 0;
	}

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		/*
		 * In the generic case the whole file is data followed by a
		 * virtual hole at the end, so any offset below eof is data
		 * and the next hole is at eof.
		 */
		if ((unsigned long long)*offset >= eof)
			return -ENXIO;
		if (whence == SEEK_HOLE)
			*offset = eof;
	}

	return 1;
}
/**
* generic_file_llseek_size - generic llseek implementation for regular files
* @file: file structure to seek on
@ -86,51 +159,73 @@ loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
loff_t maxsize, loff_t eof)
{
switch (whence) {
case SEEK_END:
offset += eof;
break;
case SEEK_CUR:
/*
* Here we special-case the lseek(fd, 0, SEEK_CUR)
* position-querying operation. Avoid rewriting the "same"
* f_pos value back to the file because a concurrent read(),
* write() or lseek() might have altered it
*/
if (offset == 0)
return file->f_pos;
/*
* f_lock protects against read/modify/write race with other
* SEEK_CURs. Note that parallel writes and reads behave
* like SEEK_SET.
*/
spin_lock(&file->f_lock);
offset = vfs_setpos(file, file->f_pos + offset, maxsize);
spin_unlock(&file->f_lock);
int ret;
ret = must_set_pos(file, &offset, whence, eof);
if (ret < 0)
return ret;
if (ret == 0)
return offset;
case SEEK_DATA:
if (whence == SEEK_CUR) {
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
* f_lock protects against read/modify/write race with
* other SEEK_CURs. Note that parallel writes and reads
* behave like SEEK_SET.
*/
if ((unsigned long long)offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
/*
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
if ((unsigned long long)offset >= eof)
return -ENXIO;
offset = eof;
break;
guard(spinlock)(&file->f_lock);
return vfs_setpos(file, file->f_pos + offset, maxsize);
}
return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
/**
 * generic_llseek_cookie - versioned llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @cookie:	cookie to update, must not be NULL
 *
 * See generic_file_llseek for a general description and locking assumptions.
 *
 * Unlike generic_file_llseek, a seek that actually changes the file
 * position also resets *@cookie to 0 so that the caller can tell a seek
 * took place.
 *
 * Return: the resulting file offset, or a negative error code. -EINVAL
 * is returned (with a one-time warning) if @cookie is NULL or @file
 * lacks FMODE_ATOMIC_POS.
 */
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
			     u64 *cookie)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxsize = inode->i_sb->s_maxbytes;
	loff_t eof = i_size_read(inode);
	int status;

	if (WARN_ON_ONCE(!cookie))
		return -EINVAL;

	/*
	 * Only directories that guarantee synchronization between readdir
	 * and seek may use this helper; otherwise the update to @cookie
	 * would race with a concurrent readdir.
	 */
	if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
		return -EINVAL;

	status = must_set_pos(file, &offset, whence, eof);
	if (status < 0)
		return status;
	if (!status)
		return offset;

	/* No need to hold f_lock because we know that f_pos_lock is held. */
	if (whence == SEEK_CUR)
		offset += file->f_pos;

	return vfs_setpos_cookie(file, offset, maxsize, cookie);
}
EXPORT_SYMBOL(generic_llseek_cookie);
/**
* generic_file_llseek - generic llseek implementation for regular files
* @file: file structure to seek on
@ -270,10 +365,8 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
}
retval = -EINVAL;
if (offset >= 0 || unsigned_offsets(file)) {
if (offset != file->f_pos) {
if (offset != file->f_pos)
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
out:

View File

@ -555,6 +555,11 @@ static unsigned int vfs_dent_type(uint8_t type)
return 0;
}
/*
 * Per-open state of a UBIFS directory. ubifs_dir_open() allocates it zeroed
 * and stores it in @file->private_data; ubifs_dir_release() frees it.
 */
struct ubifs_dir_data {
/* Directory entry saved by ubifs_readdir(); freed there and on release. */
struct ubifs_dent_node *dent;
/*
 * Seek detector: ubifs_dir_llseek() hands this to generic_llseek_cookie(),
 * and ubifs_readdir() drops @dent when it finds 0 here, then sets it to 1.
 */
u64 cookie;
};
/*
* The classical Unix view for directory is that it is a linear array of
* (name, inode number) entries. Linux/VFS assumes this model as well.
@ -582,6 +587,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
bool encrypted = IS_ENCRYPTED(dir);
struct ubifs_dir_data *data = file->private_data;
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@ -604,27 +610,27 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
fstr_real_len = fstr.len;
}
if (file->f_version == 0) {
if (data->cookie == 0) {
/*
* The file was seek'ed, which means that @file->private_data
* The file was seek'ed, which means that @data->dent
* is now invalid. This may also be just the first
* 'ubifs_readdir()' invocation, in which case
* @file->private_data is NULL, and the below code is
* @data->dent is NULL, and the below code is
* basically a no-op.
*/
kfree(file->private_data);
file->private_data = NULL;
kfree(data->dent);
data->dent = NULL;
}
/*
* 'generic_file_llseek()' unconditionally sets @file->f_version to
* zero, and we use this for detecting whether the file was seek'ed.
* 'ubifs_dir_llseek()' sets @data->cookie to zero, and we use this
* for detecting whether the file was seek'ed.
*/
file->f_version = 1;
data->cookie = 1;
/* File positions 0 and 1 correspond to "." and ".." */
if (ctx->pos < 2) {
ubifs_assert(c, !file->private_data);
ubifs_assert(c, !data->dent);
if (!dir_emit_dots(file, ctx)) {
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
@ -641,10 +647,10 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
data->dent = dent;
}
dent = file->private_data;
dent = data->dent;
if (!dent) {
/*
* The directory was seek'ed to and is now readdir'ed.
@ -658,7 +664,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
goto out;
}
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
data->dent = dent;
}
while (1) {
@ -701,15 +707,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
goto out;
}
kfree(file->private_data);
kfree(data->dent);
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
data->dent = dent;
cond_resched();
}
out:
kfree(file->private_data);
file->private_data = NULL;
kfree(data->dent);
data->dent = NULL;
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
@ -733,7 +739,10 @@ out:
/* Free saved readdir() state when the directory is closed */
static int ubifs_dir_release(struct inode *dir, struct file *file)
{
kfree(file->private_data);
struct ubifs_dir_data *data = file->private_data;
kfree(data->dent);
kfree(data);
file->private_data = NULL;
return 0;
}
@ -1712,6 +1721,24 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
/*
 * Allocate the per-open directory state and attach it to the file.
 *
 * The state is zero-initialised, so ->dent starts out NULL and ->cookie
 * starts out 0. It is freed again in ubifs_dir_release().
 *
 * Return: 0 on success, -ENOMEM if the allocation fails.
 */
static int ubifs_dir_open(struct inode *inode, struct file *file)
{
	struct ubifs_dir_data *data;

	/* sizeof(*data) keeps the size tied to the pointer's type. */
	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	file->private_data = data;
	return 0;
}
/*
 * llseek for UBIFS directories: delegate to generic_llseek_cookie() with
 * the cookie embedded in the per-open struct ubifs_dir_data.
 */
static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	u64 *cookie = &((struct ubifs_dir_data *)file->private_data)->cookie;

	return generic_llseek_cookie(file, offset, whence, cookie);
}
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
@ -1732,7 +1759,8 @@ const struct inode_operations ubifs_dir_inode_operations = {
};
const struct file_operations ubifs_dir_operations = {
.llseek = generic_file_llseek,
.open = ubifs_dir_open,
.llseek = ubifs_dir_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
.iterate_shared = ubifs_readdir,

View File

@ -60,7 +60,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
* identifying beginning of dir entry (names are under user control),
* we need to scan the directory from the beginning.
*/
if (!inode_eq_iversion(dir, file->f_version)) {
if (!inode_eq_iversion(dir, *(u64 *)file->private_data)) {
emit_pos = nf_pos;
nf_pos = 0;
} else {
@ -122,15 +122,37 @@ out_iter:
udf_fiiter_release(&iter);
out:
if (pos_valid)
file->f_version = inode_query_iversion(dir);
*(u64 *)file->private_data = inode_query_iversion(dir);
kfree(fname);
return ret;
}
static int udf_dir_open(struct inode *inode, struct file *file)
{
file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
return 0;
}
/* Free the per-open u64 allocated by udf_dir_open(). Always returns 0. */
static int udf_dir_release(struct inode *inode, struct file *file)
{
	void *iversion = file->private_data;

	kfree(iversion);
	return 0;
}
/*
 * llseek for UDF directories: seek through generic_llseek_cookie(), using
 * the u64 stored in @file->private_data as the cookie.
 */
static loff_t udf_dir_llseek(struct file *file, loff_t offset, int whence)
{
	u64 *iversion = file->private_data;

	return generic_llseek_cookie(file, offset, whence, iversion);
}
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
.llseek = generic_file_llseek,
.open = udf_dir_open,
.release = udf_dir_release,
.llseek = udf_dir_llseek,
.read = generic_read_dir,
.iterate_shared = udf_readdir,
.unlocked_ioctl = udf_ioctl,

View File

@ -416,7 +416,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data);
unsigned flags = UFS_SB(sb)->s_flags;
UFSD("BEGIN\n");
@ -442,7 +442,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode_query_iversion(inode);
*(u64 *)file->private_data = inode_query_iversion(inode);
need_revalidate = false;
}
de = (struct ufs_dir_entry *)(kaddr+offset);
@ -627,9 +627,31 @@ not_empty:
return 0;
}
static int ufs_dir_open(struct inode *inode, struct file *file)
{
file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
return 0;
}
/* Release the per-open u64 set up by ufs_dir_open(). Always returns 0. */
static int ufs_dir_release(struct inode *inode, struct file *file)
{
	void *cached_version = file->private_data;

	kfree(cached_version);
	return 0;
}
/*
 * llseek for UFS directories: hand the u64 kept in @file->private_data to
 * generic_llseek_cookie() as the seek cookie.
 */
static loff_t ufs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	u64 *cached_version = file->private_data;

	return generic_llseek_cookie(file, offset, whence, cached_version);
}
const struct file_operations ufs_dir_operations = {
.open = ufs_dir_open,
.release = ufs_dir_release,
.read = generic_read_dir,
.iterate_shared = ufs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
.llseek = ufs_dir_llseek,
};

View File

@ -963,6 +963,7 @@ static inline unsigned imajor(const struct inode *inode)
}
struct fown_struct {
struct file *file; /* backpointer for security modules */
rwlock_t lock; /* protects pid, uid, euid fields */
struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
@ -1002,52 +1003,69 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
index < ra->start + ra->size);
}
/*
* f_{lock,count,pos_lock} members can be highly contended and share
* the same cacheline. f_{lock,mode} are very frequently used together
* and so share the same cacheline as well. The read-mostly
* f_{path,inode,op} are kept on a separate cacheline.
/**
* struct file - Represents a file
* @f_count: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
* @f_mapping: Contents of a cacheable, mappable object.
* @private_data: filesystem or driver specific data
* @f_inode: cached inode
* @f_flags: file flags
* @f_iocb_flags: iocb flags
* @f_cred: stashed credentials of creator/opener
* @f_path: path of the file
* @f_pos_lock: lock protecting file position
* @f_pipe: specific to pipes
* @f_pos: file position
* @f_security: LSM security context of this file
* @f_owner: file owner
* @f_wb_err: writeback error
* @f_sb_err: per sb writeback errors
* @f_ep: link of all epoll hooks for this file
* @f_task_work: task work entry point
* @f_llist: work queue entrypoint
* @f_ra: file's readahead state
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/
struct file {
union {
/* fput() uses task work when closing and freeing file (default). */
struct callback_head f_task_work;
/* fput() must use workqueue (most kernel threads). */
struct llist_node f_llist;
unsigned int f_iocb_flags;
};
/*
* Protects f_ep, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
fmode_t f_mode;
atomic_long_t f_count;
struct mutex f_pos_lock;
loff_t f_pos;
unsigned int f_flags;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
struct path f_path;
struct inode *f_inode; /* cached value */
atomic_long_t f_count;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
u64 f_version;
struct address_space *f_mapping;
void *private_data;
struct inode *f_inode;
unsigned int f_flags;
unsigned int f_iocb_flags;
const struct cred *f_cred;
/* --- cacheline 1 boundary (64 bytes) --- */
struct path f_path;
union {
/* regular files (with FMODE_ATOMIC_POS) and directories */
struct mutex f_pos_lock;
/* pipes */
u64 f_pipe;
};
loff_t f_pos;
#ifdef CONFIG_SECURITY
void *f_security;
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
/* --- cacheline 2 boundary (128 bytes) --- */
struct fown_struct *f_owner;
errseq_t f_wb_err;
errseq_t f_sb_err;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct hlist_head *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
struct hlist_head *f_ep;
#endif
union {
struct callback_head f_task_work;
struct llist_node f_llist;
struct file_ra_state f_ra;
freeptr_t f_freeptr;
};
/* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@ -1092,6 +1110,12 @@ struct file_lease;
#define OFFT_OFFSET_MAX type_max(off_t)
#endif
int file_f_owner_allocate(struct file *file);
/*
 * file_f_owner - fetch the fown_struct pointer attached to @file
 *
 * Reads @file->f_owner once via READ_ONCE() and returns it unchanged.
 * NOTE(review): presumably NULL until file_f_owner_allocate() has been
 * called for @file - confirm against its definition.
 */
static inline struct fown_struct *file_f_owner(const struct file *file)
{
	struct fown_struct *owner = READ_ONCE(file->f_owner);

	return owner;
}
extern void send_sigio(struct fown_struct *fown, int fd, int band);
static inline struct inode *file_inode(const struct file *f)
@ -1140,7 +1164,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
extern int send_sigurg(struct file *file);
/*
* sb->s_flags. Note that these mirror the equivalent MS_* flags where
@ -3207,6 +3231,8 @@ extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
int whence, loff_t maxsize, loff_t eof);
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
u64 *cookie);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);

View File

@ -212,6 +212,12 @@ enum _slab_flag_bits {
#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
#endif
/*
* freeptr_t represents a SLUB freelist pointer, which might be encoded
* and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
*/
typedef struct { unsigned long v; } freeptr_t;
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *));
struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
unsigned int freeptr_offset,
slab_flags_t flags);
void kmem_cache_destroy(struct kmem_cache *s);
int kmem_cache_shrink(struct kmem_cache *s);

View File

@ -261,6 +261,8 @@ struct kmem_cache {
unsigned int object_size; /* Object size without metadata */
struct reciprocal_value reciprocal_size;
unsigned int offset; /* Free pointer offset */
/* Specific free pointer requested (if not UINT_MAX) */
unsigned int rcu_freeptr_offset;
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Number of per cpu partial objects to keep around */
unsigned int cpu_partial;

View File

@ -202,10 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
}
static struct kmem_cache *create_cache(const char *name,
unsigned int object_size, unsigned int align,
slab_flags_t flags, unsigned int useroffset,
unsigned int usersize, void (*ctor)(void *),
struct kmem_cache *root_cache)
unsigned int object_size, unsigned int freeptr_offset,
unsigned int align, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *))
{
struct kmem_cache *s;
int err;
@ -213,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name,
if (WARN_ON(useroffset + usersize > object_size))
useroffset = usersize = 0;
/* If a custom freelist pointer is requested make sure it's sane. */
err = -EINVAL;
if (freeptr_offset != UINT_MAX &&
(freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) ||
!IS_ALIGNED(freeptr_offset, sizeof(freeptr_t))))
goto out;
err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
@ -220,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name,
s->name = name;
s->size = s->object_size = object_size;
s->rcu_freeptr_offset = freeptr_offset;
s->align = align;
s->ctor = ctor;
#ifdef CONFIG_HARDENED_USERCOPY
s->useroffset = useroffset;
s->usersize = usersize;
#endif
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
@ -241,38 +248,10 @@ out:
return ERR_PTR(err);
}
/**
* kmem_cache_create_usercopy - Create a cache with a region suitable
* for copying to userspace
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
* @flags: SLAB flags
* @useroffset: Usercopy region offset
* @usersize: Usercopy region size
* @ctor: A constructor for the objects.
*
* Cannot be called within an interrupt, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
* %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*
* Return: a pointer to the cache on success, NULL on failure.
*/
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
unsigned int size, unsigned int align,
slab_flags_t flags,
static struct kmem_cache *
do_kmem_cache_create_usercopy(const char *name,
unsigned int size, unsigned int freeptr_offset,
unsigned int align, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *))
{
@ -332,9 +311,9 @@ kmem_cache_create_usercopy(const char *name,
goto out_unlock;
}
s = create_cache(cache_name, size,
s = create_cache(cache_name, size, freeptr_offset,
calculate_alignment(flags, align, size),
flags, useroffset, usersize, ctor, NULL);
flags, useroffset, usersize, ctor);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@ -356,6 +335,44 @@ out_unlock:
}
return s;
}
/**
 * kmem_cache_create_usercopy - Create a cache with a region suitable
 * for copying to userspace
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
 *
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * This is a thin wrapper that requests no custom free pointer offset.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name, unsigned int size,
			   unsigned int align, slab_flags_t flags,
			   unsigned int useroffset, unsigned int usersize,
			   void (*ctor)(void *))
{
	/* UINT_MAX means "no specific free pointer offset requested". */
	const unsigned int freeptr_offset = UINT_MAX;

	return do_kmem_cache_create_usercopy(name, size, freeptr_offset, align,
					     flags, useroffset, usersize, ctor);
}
EXPORT_SYMBOL(kmem_cache_create_usercopy);
/**
@ -387,11 +404,50 @@ struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
ctor);
return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags,
0, 0, ctor);
}
EXPORT_SYMBOL(kmem_cache_create);
/**
 * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @freeptr_offset: The offset into the memory to the free pointer
 * @flags: SLAB flags; SLAB_TYPESAFE_BY_RCU is always added
 *
 * Cannot be called within an interrupt, but can be interrupted.
 *
 * See kmem_cache_create() for an explanation of possible @flags.
 *
 * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside
 * of the object. This might cause the object to grow in size. Callers
 * that have a reason to avoid this can specify a custom free pointer
 * offset in their struct where the free pointer will be placed.
 *
 * Note that placing the free pointer inside the object requires the
 * caller to ensure that no fields are invalidated that are required to
 * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for
 * details.).
 *
 * Using zero as a value for @freeptr_offset is valid. To request no
 * offset UINT_MAX must be specified.
 *
 * No constructor can be supplied through this interface (a NULL ctor is
 * always passed on), because a constructor requires an external free
 * pointer.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
					 unsigned int freeptr_offset,
					 slab_flags_t flags)
{
	const slab_flags_t rcu_flags = flags | SLAB_TYPESAFE_BY_RCU;

	return do_kmem_cache_create_usercopy(name, size, freeptr_offset,
					     0,		/* align */
					     rcu_flags,
					     0,		/* useroffset */
					     0,		/* usersize */
					     NULL);	/* ctor */
}
EXPORT_SYMBOL(kmem_cache_create_rcu);
static struct kmem_cache *kmem_buckets_cache __ro_after_init;
/**

View File

@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq;
* Core slab cache functions
*******************************************************************/
/*
* freeptr_t represents a SLUB freelist pointer, which might be encoded
* and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
*/
typedef struct { unsigned long v; } freeptr_t;
/*
* Returns freelist pointer (ptr). With hardening, this is obfuscated
* with an XOR of the address where the pointer is held and a per-cache
@ -3925,6 +3919,9 @@ static void *__slab_alloc_node(struct kmem_cache *s,
/*
* If the object has been wiped upon free, make sure it's fully initialized by
* zeroing out freelist pointer.
*
* Note that we also wipe custom freelist pointers specified via
* s->rcu_freeptr_offset.
*/
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
void *obj)
@ -5148,6 +5145,12 @@ static void set_cpu_partial(struct kmem_cache *s)
#endif
}
/*
 * Tell whether the cache was created with a caller-chosen free pointer
 * offset. UINT_MAX in ->rcu_freeptr_offset means none was requested.
 */
static inline bool has_freeptr_offset(const struct kmem_cache *s)
{
	const unsigned int requested = s->rcu_freeptr_offset;

	return requested != UINT_MAX;
}
/*
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
@ -5193,7 +5196,8 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->inuse = size;
if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) ||
(flags & SLAB_POISON) || s->ctor ||
((flags & SLAB_RED_ZONE) &&
(s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
/*
@ -5214,6 +5218,8 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->offset = size;
size += sizeof(void *);
} else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) {
s->offset = s->rcu_freeptr_offset;
} else {
/*
* Store freelist pointer near middle of object to keep

View File

@ -3497,7 +3497,7 @@ static void sock_def_destruct(struct sock *sk)
void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
if (send_sigurg(&sk->sk_socket->file->f_owner))
if (send_sigurg(sk->sk_socket->file))
sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

View File

@ -3950,7 +3950,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk,
struct file_security_struct *fsec;
/* struct fown_struct is never outside the context of a struct file */
file = container_of(fown, struct file, f_owner);
file = fown->file;
fsec = selinux_file(file);

View File

@ -1950,7 +1950,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
/*
* struct fown_struct is never outside the context of a struct file
*/
file = container_of(fown, struct file, f_owner);
file = fown->file;
/* we don't log here as rc can be overridden */
blob = smack_file(file);