From ff2c570ef7eaa9ded58e7a02dd7a68874a897508 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 7 Jun 2024 16:55:35 +0200
Subject: [PATCH 01/20] path: add cleanup helper

Add a simple cleanup helper so we can cleanup struct path easily.
No need for any extra machinery. Avoid DEFINE_FREE() as it causes a
local copy of struct path to be used. Just rely on path_put() directly
called from a cleanup helper.

Link: https://lore.kernel.org/r/20240607-vfs-listmount-reverse-v1-2-7877a2bfa5e5@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/path.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/path.h b/include/linux/path.h
index 475225a03d0d..ca073e70decd 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -24,4 +24,13 @@ static inline void path_put_init(struct path *path)
 	*path = (struct path) { };
 }
 
+/*
+ * Cleanup macro for use with __free(path_put). Avoids dereference and
+ * copying @path unlike DEFINE_FREE(). path_put() will handle the empty
+ * path correctly just ensure @path is initialized:
+ *
+ * struct path path __free(path_put) = {};
+ */
+#define __free_path_put path_put
+
 #endif  /* _LINUX_PATH_H */

From cb54ef4f050e7c504ed87114276a296d727e918a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 10 Jun 2024 14:49:54 +0200
Subject: [PATCH 02/20] fs: don't copy to userspace under namespace semaphore

Don't copy mount ids to userspace while holding the namespace semaphore.
We really shouldn't do that and I've gone through lenghts avoiding that
in statmount() already.

Limit the number of mounts that can be retrieved in one go to 1 million
mount ids. That's effectively 10 times the default limt of 100000 mounts
that we put on each mount namespace by default. Since listmount() is an
iterator limiting the number of mounts retrievable in one go isn't a
problem as userspace can just pick up where they left off.

Karel menti_ned that libmount will probably be reading the mount table
in "in small steps, 512 nodes per request. Nobody likes a tool that
takes too long in the kernel, and huge servers are unusual use cases.
Libmount will very probably provide API to define size of the step (IDs
per request)."

Reported-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240610-frettchen-liberal-a9a5c53865f8@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 98 ++++++++++++++++++++++++++++----------------------
 1 file changed, 56 insertions(+), 42 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..57311ecbdf5a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5047,55 +5047,81 @@ static struct mount *listmnt_next(struct mount *curr)
 	return node_to_mount(rb_next(&curr->mnt_node));
 }
 
-static ssize_t do_listmount(struct mount *first, struct path *orig,
-			    u64 mnt_parent_id, u64 __user *mnt_ids,
-			    size_t nr_mnt_ids, const struct path *root)
+static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
+			    size_t nr_mnt_ids)
 {
-	struct mount *r;
+	struct path root;
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct path orig;
+	struct mount *r, *first;
 	ssize_t ret;
 
+	rwsem_assert_held(&namespace_sem);
+
+	get_fs_root(current->fs, &root);
+	if (mnt_parent_id == LSMT_ROOT) {
+		orig = root;
+	} else {
+		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
+		if (!orig.mnt) {
+			ret = -ENOENT;
+			goto err;
+		}
+		orig.dentry = orig.mnt->mnt_root;
+	}
+
 	/*
 	 * Don't trigger audit denials. We just want to determine what
 	 * mounts to show users.
 	 */
-	if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
-	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
+	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto err;
+	}
 
-	ret = security_sb_statfs(orig->dentry);
+	ret = security_sb_statfs(orig.dentry);
 	if (ret)
-		return ret;
+		goto err;
+
+	if (!last_mnt_id)
+		first = node_to_mount(rb_first(&ns->mounts));
+	else
+		first = mnt_find_id_at(ns, last_mnt_id + 1);
 
 	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
 		if (r->mnt_id_unique == mnt_parent_id)
 			continue;
-		if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+		if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
 			continue;
-		if (put_user(r->mnt_id_unique, mnt_ids))
-			return -EFAULT;
+		*mnt_ids = r->mnt_id_unique;
 		mnt_ids++;
 		nr_mnt_ids--;
 		ret++;
 	}
+err:
+	path_put(&root);
 	return ret;
 }
 
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
-		mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
+		u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
 {
-	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	u64 *kmnt_ids __free(kvfree) = NULL;
+	const size_t maxcount = 1000000;
 	struct mnt_id_req kreq;
-	struct mount *first;
-	struct path root, orig;
-	u64 mnt_parent_id, last_mnt_id;
-	const size_t maxcount = (size_t)-1 >> 3;
 	ssize_t ret;
 
 	if (flags)
 		return -EINVAL;
 
+	/*
+	 * If the mount namespace really has more than 1 million mounts the
+	 * caller must iterate over the mount namespace (and reconsider their
+	 * system design...).
+	 */
 	if (unlikely(nr_mnt_ids > maxcount))
-		return -EFAULT;
+		return -EOVERFLOW;
 
 	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
 		return -EFAULT;
@@ -5103,33 +5129,21 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
 	ret = copy_mnt_id_req(req, &kreq);
 	if (ret)
 		return ret;
-	mnt_parent_id = kreq.mnt_id;
-	last_mnt_id = kreq.param;
 
-	down_read(&namespace_sem);
-	get_fs_root(current->fs, &root);
-	if (mnt_parent_id == LSMT_ROOT) {
-		orig = root;
-	} else {
-		ret = -ENOENT;
-		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
-		if (!orig.mnt)
-			goto err;
-		orig.dentry = orig.mnt->mnt_root;
-	}
-	if (!last_mnt_id)
-		first = node_to_mount(rb_first(&ns->mounts));
-	else
-		first = mnt_find_id_at(ns, last_mnt_id + 1);
+	kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
+				  GFP_KERNEL_ACCOUNT);
+	if (!kmnt_ids)
+		return -ENOMEM;
+
+	scoped_guard(rwsem_read, &namespace_sem)
+		ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids, nr_mnt_ids);
+
+	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+		return -EFAULT;
 
-	ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
-err:
-	path_put(&root);
-	up_read(&namespace_sem);
 	return ret;
 }
 
-
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;

From 17e70161281bb66316e94e63a15d1a8498bf6f01 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 7 Jun 2024 16:55:36 +0200
Subject: [PATCH 03/20] fs: simplify error handling

Rely on cleanup helper and simplify error handling

Link: https://lore.kernel.org/r/20240607-vfs-listmount-reverse-v1-3-7877a2bfa5e5@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 57311ecbdf5a..98671cd89511 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5050,7 +5050,7 @@ static struct mount *listmnt_next(struct mount *curr)
 static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 			    size_t nr_mnt_ids)
 {
-	struct path root;
+	struct path root __free(path_put) = {};
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct path orig;
 	struct mount *r, *first;
@@ -5063,10 +5063,8 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 		orig = root;
 	} else {
 		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
-		if (!orig.mnt) {
-			ret = -ENOENT;
-			goto err;
-		}
+		if (!orig.mnt)
+			return -ENOENT;
 		orig.dentry = orig.mnt->mnt_root;
 	}
 
@@ -5075,14 +5073,12 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 	 * mounts to show users.
 	 */
 	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
-	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) {
-		ret = -EPERM;
-		goto err;
-	}
+	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
 
 	ret = security_sb_statfs(orig.dentry);
 	if (ret)
-		goto err;
+		return ret;
 
 	if (!last_mnt_id)
 		first = node_to_mount(rb_first(&ns->mounts));
@@ -5099,8 +5095,6 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 		nr_mnt_ids--;
 		ret++;
 	}
-err:
-	path_put(&root);
 	return ret;
 }
 

From dd7cb142f467c4660698bcaa4a48c688b443ab81 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 24 Jun 2024 11:49:44 -0400
Subject: [PATCH 04/20] fs: relax permissions for listmount()

It is sufficient to have capabilities in the owning user namespace of
the mount namespace to list all mounts regardless of whether they are
reachable or not.

Link: https://lore.kernel.org/r/8adc0d3f4f7495faacc6a7c63095961f7f1637c7.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 02a697287da5..22274f74beb0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5104,7 +5104,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 	 * mounts to show users.
 	 */
 	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
-	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	ret = security_sb_statfs(orig.dentry);

From d04bccd8c19d601232ed3e3c9e248c0040167d47 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 7 Jun 2024 16:55:37 +0200
Subject: [PATCH 05/20] listmount: allow listing in reverse order

util-linux is about to implement listmount() and statmount() support.
Karel requested the ability to scan the mount table in backwards order
because that's what libmount currently does in order to get the latest
mount first. We currently don't support this in listmount(). Add a new
LISTMOUNT_REVERSE flag to allow listing mounts in reverse order. For
example, listing all child mounts of /sys without LISTMOUNT_REVERSE
gives:

    /sys/kernel/security @ mnt_id: 4294968369
    /sys/fs/cgroup @ mnt_id: 4294968370
    /sys/firmware/efi/efivars @ mnt_id: 4294968371
    /sys/fs/bpf @ mnt_id: 4294968372
    /sys/kernel/tracing @ mnt_id: 4294968373
    /sys/kernel/debug @ mnt_id: 4294968374
    /sys/fs/fuse/connections @ mnt_id: 4294968375
    /sys/kernel/config @ mnt_id: 4294968376

whereas with LISTMOUNT_REVERSE it gives:

    /sys/kernel/config @ mnt_id: 4294968376
    /sys/fs/fuse/connections @ mnt_id: 4294968375
    /sys/kernel/debug @ mnt_id: 4294968374
    /sys/kernel/tracing @ mnt_id: 4294968373
    /sys/fs/bpf @ mnt_id: 4294968372
    /sys/firmware/efi/efivars @ mnt_id: 4294968371
    /sys/fs/cgroup @ mnt_id: 4294968370
    /sys/kernel/security @ mnt_id: 4294968369

Link: https://lore.kernel.org/r/20240607-vfs-listmount-reverse-v1-4-7877a2bfa5e5@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 59 +++++++++++++++++++++++++++++++-------
 include/uapi/linux/mount.h |  1 +
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 98671cd89511..02a697287da5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1448,6 +1448,30 @@ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
 	return ret;
 }
 
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * greater id before the specified one.
+ */
+static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
+{
+	struct rb_node *node = ns->mounts.rb_node;
+	struct mount *ret = NULL;
+
+	while (node) {
+		struct mount *m = node_to_mount(node);
+
+		if (mnt_id >= m->mnt_id_unique) {
+			ret = node_to_mount(node);
+			if (mnt_id == m->mnt_id_unique)
+				break;
+			node = node->rb_right;
+		} else {
+			node = node->rb_left;
+		}
+	}
+	return ret;
+}
+
 #ifdef CONFIG_PROC_FS
 
 /* iterator; we want it to have access to namespace_sem, thus here... */
@@ -5042,13 +5066,20 @@ retry:
 	return ret;
 }
 
-static struct mount *listmnt_next(struct mount *curr)
+static struct mount *listmnt_next(struct mount *curr, bool reverse)
 {
-	return node_to_mount(rb_next(&curr->mnt_node));
+	struct rb_node *node;
+
+	if (reverse)
+		node = rb_prev(&curr->mnt_node);
+	else
+		node = rb_next(&curr->mnt_node);
+
+	return node_to_mount(node);
 }
 
 static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
-			    size_t nr_mnt_ids)
+			    size_t nr_mnt_ids, bool reverse)
 {
 	struct path root __free(path_put) = {};
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
@@ -5080,12 +5111,19 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
 	if (ret)
 		return ret;
 
-	if (!last_mnt_id)
-		first = node_to_mount(rb_first(&ns->mounts));
-	else
-		first = mnt_find_id_at(ns, last_mnt_id + 1);
+	if (!last_mnt_id) {
+		if (reverse)
+			first = node_to_mount(rb_last(&ns->mounts));
+		else
+			first = node_to_mount(rb_first(&ns->mounts));
+	} else {
+		if (reverse)
+			first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
+		else
+			first = mnt_find_id_at(ns, last_mnt_id + 1);
+	}
 
-	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
 		if (r->mnt_id_unique == mnt_parent_id)
 			continue;
 		if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
@@ -5106,7 +5144,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	struct mnt_id_req kreq;
 	ssize_t ret;
 
-	if (flags)
+	if (flags & ~LISTMOUNT_REVERSE)
 		return -EINVAL;
 
 	/*
@@ -5130,7 +5168,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 		return -ENOMEM;
 
 	scoped_guard(rwsem_read, &namespace_sem)
-		ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids, nr_mnt_ids);
+		ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
+				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
 
 	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
 		return -EFAULT;
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index ad5478dbad00..88d78de1519f 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -207,5 +207,6 @@ struct mnt_id_req {
  * Special @mnt_id values that can be passed to listmount
  */
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
+#define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */
 
 #endif /* _UAPI_LINUX_MOUNT_H */

From f3107df39df123328a9d3c8f40c006834b37287d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 24 Jun 2024 11:49:45 -0400
Subject: [PATCH 06/20] fs: relax permissions for statmount()

It is sufficient to have capabilities in the owning user namespace of
the mount namespace to stat a mount regardless of whether it's reachable
or not.

Link: https://lore.kernel.org/r/bf5961d71ec479ba85806766b0d8d96043e67bba.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 22274f74beb0..352e8c90b9f7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4930,6 +4930,7 @@ static int copy_statmount_to_user(struct kstatmount *s)
 static int do_statmount(struct kstatmount *s)
 {
 	struct mount *m = real_mount(s->mnt);
+	struct mnt_namespace *ns = m->mnt_ns;
 	int err;
 
 	/*
@@ -4937,7 +4938,7 @@ static int do_statmount(struct kstatmount *s)
 	 * mounts to show users.
 	 */
 	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
-	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	err = security_sb_statfs(s->mnt->mnt_root);

From 1901c92497bd90caf608a474f1bf4d8795b372a2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 11:49:46 -0400
Subject: [PATCH 07/20] fs: keep an index of current mount namespaces

In order to allow for listmount() to be used on different namespaces we
need a way to lookup a mount ns by its id.  Keep a rbtree of the current
!anonymous mount name spaces indexed by ID that we can use to look up
the namespace.

Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/e5fdd78a90f5b00a75bd893962a70f52a2c015cd.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h     |   2 +
 fs/namespace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 4a42fc68f4cc..5b980c262c0c 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,6 +16,8 @@ struct mnt_namespace {
 	u64 event;
 	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
+	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
+	refcount_t		passive; /* number references not pinning @mounts */
 } __randomize_layout;
 
 struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c
index 352e8c90b9f7..eebe9d912a71 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
 	unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+	u64 seq_b = ns->seq;
+
+	if (seq < seq_b)
+		return -1;
+	if (seq > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+	u64 seq_a = ns_a->seq;
+
+	return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+	guard(write_lock)(&mnt_ns_tree_lock);
+	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+	lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+	/* keep alive for {list,stat}mount() */
+	if (refcount_dec_and_test(&ns->passive)) {
+		put_user_ns(ns->user_ns);
+		kfree(ns);
+	}
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+	/* remove from global mount namespace list */
+	if (!is_anon_ns(ns)) {
+		guard(write_lock)(&mnt_ns_tree_lock);
+		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+	}
+
+	mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * next smallest id afer the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+	struct rb_node *node = mnt_ns_tree.rb_node;
+	struct mnt_namespace *ret = NULL;
+
+	lockdep_assert_held(&mnt_ns_tree_lock);
+
+	while (node) {
+		struct mnt_namespace *n = node_to_mnt_ns(node);
+
+		if (mnt_ns_id <= n->seq) {
+			ret = node_to_mnt_ns(node);
+			if (mnt_ns_id == n->seq)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+       struct mnt_namespace *ns;
+
+       guard(read_lock)(&mnt_ns_tree_lock);
+       ns = mnt_ns_find_id_at(mnt_ns_id);
+       if (!ns || ns->seq != mnt_ns_id)
+               return NULL;
+
+       refcount_inc(&ns->passive);
+       return ns;
+}
+
 static inline void lock_mount_hash(void)
 {
 	write_seqlock(&mount_lock);
@@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
 	if (!is_anon_ns(ns))
 		ns_free_inum(&ns->ns);
 	dec_mnt_namespaces(ns->ucounts);
-	put_user_ns(ns->user_ns);
-	kfree(ns);
+	mnt_ns_tree_remove(ns);
 }
 
 /*
@@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	if (!anon)
 		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	refcount_set(&new_ns->ns.count, 1);
+	refcount_set(&new_ns->passive, 1);
 	new_ns->mounts = RB_ROOT;
+	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
@@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
 			p = next_mnt(skip_mnt_tree(p), old);
 	}
+	mnt_ns_tree_add(new_ns);
 	namespace_unlock();
 
 	if (rootmnt)
@@ -5205,6 +5312,8 @@ static void __init init_mount_tree(void)
 
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
+
+	mnt_ns_tree_add(ns);
 }
 
 void __init mnt_init(void)

From 09b31295f833031c88419550172703d45c5401e3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 11:49:47 -0400
Subject: [PATCH 08/20] fs: export the mount ns id via statmount

In order to allow users to iterate through children mount namespaces via
listmount we need a way for them to know what the ns id for the mount.
Add a new field to statmount called mnt_ns_id which will carry the ns id
for the given mount entry.

Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/6dabf437331fb7415d886f7c64b21cb2a50b1c66.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 11 +++++++++++
 include/uapi/linux/mount.h |  4 +++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index eebe9d912a71..ed2d9353e4be 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4974,6 +4974,14 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
 	return 0;
 }
 
+static void statmount_mnt_ns_id(struct kstatmount *s)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+
+	s->sm.mask |= STATMOUNT_MNT_NS_ID;
+	s->sm.mnt_ns_id = ns->seq;
+}
+
 static int statmount_string(struct kstatmount *s, u64 flag)
 {
 	int ret;
@@ -5070,6 +5078,9 @@ static int do_statmount(struct kstatmount *s)
 	if (!err && s->mask & STATMOUNT_MNT_POINT)
 		err = statmount_string(s, STATMOUNT_MNT_POINT);
 
+	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
+		statmount_mnt_ns_id(s);
+
 	if (err)
 		return err;
 
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 88d78de1519f..a07508aee518 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -172,7 +172,8 @@ struct statmount {
 	__u64 propagate_from;	/* Propagation from in current namespace */
 	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
 	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
-	__u64 __spare2[50];
+	__u64 mnt_ns_id;	/* ID of the mount namespace */
+	__u64 __spare2[49];
 	char str[];		/* Variable size part containing strings */
 };
 
@@ -202,6 +203,7 @@ struct mnt_id_req {
 #define STATMOUNT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
 #define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
 #define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
+#define STATMOUNT_MNT_NS_ID		0x00000040U	/* Want/got mnt_ns_id */
 
 /*
  * Special @mnt_id values that can be passed to listmount

From 0a3deb11858ae8a0b3849b5fda45512ad383f0e1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 24 Jun 2024 11:49:48 -0400
Subject: [PATCH 09/20] fs: Allow listmount() in foreign mount namespace

Expand struct mnt_id_req to add an optional mnt_ns_id field.  When this
field is populated, listmount() will be performed on the specified mount
namespace, provided the currently application has CAP_SYS_ADMIN in its
user namespace and the mount namespace is a child of the current
namespace.

Co-developed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/49930bdce29a8367a213eb14c1e68e7e49284f86.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 88 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/mount.h |  2 +
 2 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index ed2d9353e4be..a54d68f822a8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5122,7 +5122,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 	int ret;
 	size_t usize;
 
-	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
 
 	ret = get_user(usize, &req->size);
 	if (ret)
@@ -5140,6 +5140,58 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 	return 0;
 }
 
+static struct mount *listmnt_next(struct mount *curr, bool reverse)
+{
+	struct rb_node *node;
+
+	if (reverse)
+		node = rb_prev(&curr->mnt_node);
+	else
+		node = rb_next(&curr->mnt_node);
+
+	return node_to_mount(node);
+}
+
+static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
+{
+	struct mount *first;
+
+	rwsem_assert_held(&namespace_sem);
+
+	/* We're looking at our own ns, just use get_fs_root. */
+	if (ns == current->nsproxy->mnt_ns) {
+		get_fs_root(current->fs, root);
+		return 0;
+	}
+
+	/*
+	 * We have to find the first mount in our ns and use that, however it
+	 * may not exist, so handle that properly.
+	 */
+	if (RB_EMPTY_ROOT(&ns->mounts))
+		return -ENOENT;
+
+	first = listmnt_next(ns->root, false);
+	if (!first)
+		return -ENOENT;
+	root->mnt = mntget(&first->mnt);
+	root->dentry = dget(root->mnt->mnt_root);
+	return 0;
+}
+
+/*
+ * If the user requested a specific mount namespace id, look that up and return
+ * that, or if not simply grab a passive reference on our mount namespace and
+ * return that.
+ */
+static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+{
+	if (mnt_ns_id)
+		return lookup_mnt_ns(mnt_ns_id);
+	refcount_inc(&current->nsproxy->mnt_ns->passive);
+	return current->nsproxy->mnt_ns;
+}
+
 SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 		struct statmount __user *, buf, size_t, bufsize,
 		unsigned int, flags)
@@ -5185,30 +5237,21 @@ retry:
 	return ret;
 }
 
-static struct mount *listmnt_next(struct mount *curr, bool reverse)
-{
-	struct rb_node *node;
-
-	if (reverse)
-		node = rb_prev(&curr->mnt_node);
-	else
-		node = rb_next(&curr->mnt_node);
-
-	return node_to_mount(node);
-}
-
-static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
-			    size_t nr_mnt_ids, bool reverse)
+static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
+			    u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
+			    bool reverse)
 {
 	struct path root __free(path_put) = {};
-	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct path orig;
 	struct mount *r, *first;
 	ssize_t ret;
 
 	rwsem_assert_held(&namespace_sem);
 
-	get_fs_root(current->fs, &root);
+	ret = grab_requested_root(ns, &root);
+	if (ret)
+		return ret;
+
 	if (mnt_parent_id == LSMT_ROOT) {
 		orig = root;
 	} else {
@@ -5260,6 +5303,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 {
 	u64 *kmnt_ids __free(kvfree) = NULL;
 	const size_t maxcount = 1000000;
+	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
 	struct mnt_id_req kreq;
 	ssize_t ret;
 
@@ -5286,8 +5330,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	if (!kmnt_ids)
 		return -ENOMEM;
 
+	ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+	if (!ns)
+		return -ENOENT;
+
+	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+		return -ENOENT;
+
 	scoped_guard(rwsem_read, &namespace_sem)
-		ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
+		ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
 				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
 
 	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index a07508aee518..ee1559cd6764 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -189,10 +189,12 @@ struct mnt_id_req {
 	__u32 spare;
 	__u64 mnt_id;
 	__u64 param;
+	__u64 mnt_ns_id;
 };
 
 /* List of all mnt_id_req versions. */
 #define MNT_ID_REQ_SIZE_VER0	24 /* sizeof first published struct */
+#define MNT_ID_REQ_SIZE_VER1	32 /* sizeof second published struct */
 
 /*
  * @mask bits for statmount(2)

From 71aacb4c8c3d19da053363a5fe7538a8af082d56 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 24 Jun 2024 11:49:49 -0400
Subject: [PATCH 10/20] fs: Allow statmount() in foreign mount namespace

This patch makes use of the new mnt_ns_id field in struct mnt_id_req to
allow users to stat mount entries not in their mount namespace.  The
rules are the same as listmount(), the user must have CAP_SYS_ADMIN in
their user namespace and the target mount namespace must be a child of
the current namespace.

Co-developed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/52a2e17e50ba7aa420bc8bae1d9e88ff593395c1.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index a54d68f822a8..e871f73c4c8c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4974,10 +4974,8 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
 	return 0;
 }
 
-static void statmount_mnt_ns_id(struct kstatmount *s)
+static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
 {
-	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-
 	s->sm.mask |= STATMOUNT_MNT_NS_ID;
 	s->sm.mnt_ns_id = ns->seq;
 }
@@ -5079,7 +5077,7 @@ static int do_statmount(struct kstatmount *s)
 		err = statmount_string(s, STATMOUNT_MNT_POINT);
 
 	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
-		statmount_mnt_ns_id(s);
+		statmount_mnt_ns_id(s, ns);
 
 	if (err)
 		return err;
@@ -5196,6 +5194,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 		struct statmount __user *, buf, size_t, bufsize,
 		unsigned int, flags)
 {
+	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
 	struct vfsmount *mnt;
 	struct mnt_id_req kreq;
 	struct kstatmount ks;
@@ -5210,13 +5209,28 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 	if (ret)
 		return ret;
 
+	ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+	if (!ns)
+		return -ENOENT;
+
+	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+		return -ENOENT;
+
 retry:
 	ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
 	if (ret)
 		return ret;
 
 	down_read(&namespace_sem);
-	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+	/* Has the namespace already been emptied? */
+	if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
+		up_read(&namespace_sem);
+		kvfree(ks.seq.buf);
+		return -ENOENT;
+	}
+
+	mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
 	if (!mnt) {
 		up_read(&namespace_sem);
 		kvfree(ks.seq.buf);
@@ -5224,7 +5238,12 @@ retry:
 	}
 
 	ks.mnt = mnt;
-	get_fs_root(current->fs, &ks.root);
+	ret = grab_requested_root(ns, &ks.root);
+	if (ret) {
+		up_read(&namespace_sem);
+		kvfree(ks.seq.buf);
+		return ret;
+	}
 	ret = do_statmount(&ks);
 	path_put(&ks.root);
 	up_read(&namespace_sem);

From e8e43a1fcc5c07575f37e40f8a2cd78aee46f9a0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 11:49:50 -0400
Subject: [PATCH 11/20] fs: add an ioctl to get the mnt ns id from nsfs

In order to utilize the listmount() and statmount() extensions that
allow us to call them on different namespaces we need a way to get the
mnt namespace id from user space.  Add an ioctl to nsfs that will allow
us to extract the mnt namespace id in order to make these new extensions
usable.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/180449959d5a756af7306d6bda55f41b9d53e3cb.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                 | 14 ++++++++++++++
 include/uapi/linux/nsfs.h |  2 ++
 2 files changed, 16 insertions(+)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 07e22a15ef02..af352dadffe1 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -12,6 +12,7 @@
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
 
+#include "mount.h"
 #include "internal.h"
 
 static struct vfsmount *nsfs_mnt;
@@ -143,6 +144,19 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		argp = (uid_t __user *) arg;
 		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
 		return put_user(uid, argp);
+	case NS_GET_MNTNS_ID: {
+		struct mnt_namespace *mnt_ns;
+		__u64 __user *idp;
+		__u64 id;
+
+		if (ns->ops->type != CLONE_NEWNS)
+			return -EINVAL;
+
+		mnt_ns = container_of(ns, struct mnt_namespace, ns);
+		idp = (__u64 __user *)arg;
+		id = mnt_ns->seq;
+		return put_user(id, idp);
+	}
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index a0c8552b64ee..56e8b1639b98 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -15,5 +15,7 @@
 #define NS_GET_NSTYPE		_IO(NSIO, 0x3)
 /* Get owner UID (in the caller's user namespace) for a user namespace */
 #define NS_GET_OWNER_UID	_IO(NSIO, 0x4)
+/* Get the id for a mount namespace */
+#define NS_GET_MNTNS_ID		_IO(NSIO, 0x5)
 
 #endif /* __LINUX_NSFS_H */

From d896f71ce1f2e73813dc6f639eb0cf6f4beefdaa Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 11:49:51 -0400
Subject: [PATCH 12/20] selftests: add a test for the foreign mnt ns extensions

This tests both statmount and listmount to make sure they work with the
extensions that allow us to specify a mount ns to enter in order to find
the mount entries.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/2d1a35bc9ab94b4656c056c420f25e429e7eb0b1.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/statmount/Makefile  |   2 +-
 .../filesystems/statmount/statmount.h         |  46 +++
 .../filesystems/statmount/statmount_test.c    |  53 +--
 .../filesystems/statmount/statmount_test_ns.c | 364 ++++++++++++++++++
 4 files changed, 424 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c

diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile
index 07a0d5b545ca..3af3136e35a4 100644
--- a/tools/testing/selftests/filesystems/statmount/Makefile
+++ b/tools/testing/selftests/filesystems/statmount/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-or-later
 
 CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
-TEST_GEN_PROGS := statmount_test
+TEST_GEN_PROGS := statmount_test statmount_test_ns
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
new file mode 100644
index 000000000000..f4294bab9d73
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __STATMOUNT_H
+#define __STATMOUNT_H
+
+#include <stdint.h>
+#include <linux/mount.h>
+#include <asm/unistd.h>
+
+static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
+			    struct statmount *buf, size_t bufsize,
+			    unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size = MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = mask,
+	};
+
+	if (mnt_ns_id) {
+		req.size = MNT_ID_REQ_SIZE_VER1;
+		req.mnt_ns_id = mnt_ns_id;
+	}
+
+	return syscall(__NR_statmount, &req, buf, bufsize, flags);
+}
+
+static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
+			 uint64_t last_mnt_id, uint64_t list[], size_t num,
+			 unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size = MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = last_mnt_id,
+	};
+
+	if (mnt_ns_id) {
+		req.size = MNT_ID_REQ_SIZE_VER1;
+		req.mnt_ns_id = mnt_ns_id;
+	}
+
+	return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+#endif /* __STATMOUNT_H */
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index e6d7c4f1c85b..4f7023c2de77 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -4,17 +4,15 @@
 
 #include <assert.h>
 #include <stddef.h>
-#include <stdint.h>
 #include <sched.h>
 #include <fcntl.h>
 #include <sys/param.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/statfs.h>
-#include <linux/mount.h>
 #include <linux/stat.h>
-#include <asm/unistd.h>
 
+#include "statmount.h"
 #include "../../kselftest.h"
 
 static const char *const known_fs[] = {
@@ -36,18 +34,6 @@ static const char *const known_fs[] = {
 	"ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs",
 	"zonefs", NULL };
 
-static int statmount(uint64_t mnt_id, uint64_t mask, struct statmount *buf,
-		     size_t bufsize, unsigned int flags)
-{
-	struct mnt_id_req req = {
-		.size = MNT_ID_REQ_SIZE_VER0,
-		.mnt_id = mnt_id,
-		.param = mask,
-	};
-
-	return syscall(__NR_statmount, &req, buf, bufsize, flags);
-}
-
 static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags)
 {
 	size_t bufsize = 1 << 15;
@@ -56,7 +42,7 @@ static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigne
 	int ret;
 
 	for (;;) {
-		ret = statmount(mnt_id, mask, tmp, bufsize, flags);
+		ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags);
 		if (ret != -1)
 			break;
 		if (tofree)
@@ -122,7 +108,6 @@ static int orig_root;
 static uint64_t root_id, parent_id;
 static uint32_t old_root_id, old_parent_id;
 
-
 static void cleanup_namespace(void)
 {
 	fchdir(orig_root);
@@ -138,7 +123,7 @@ static void setup_namespace(void)
 	uid_t uid = getuid();
 	gid_t gid = getgid();
 
-	ret = unshare(CLONE_NEWNS|CLONE_NEWUSER);
+	ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
 	if (ret == -1)
 		ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
 				   strerror(errno));
@@ -208,25 +193,13 @@ static int setup_mount_tree(int log2_num)
 	return 0;
 }
 
-static ssize_t listmount(uint64_t mnt_id, uint64_t last_mnt_id,
-			 uint64_t list[], size_t num, unsigned int flags)
-{
-	struct mnt_id_req req = {
-		.size = MNT_ID_REQ_SIZE_VER0,
-		.mnt_id = mnt_id,
-		.param = last_mnt_id,
-	};
-
-	return syscall(__NR_listmount, &req, list, num, flags);
-}
-
 static void test_listmount_empty_root(void)
 {
 	ssize_t res;
 	const unsigned int size = 32;
 	uint64_t list[size];
 
-	res = listmount(LSMT_ROOT, 0, list, size, 0);
+	res = listmount(LSMT_ROOT, 0, 0, list, size, 0);
 	if (res == -1) {
 		ksft_test_result_fail("listmount: %s\n", strerror(errno));
 		return;
@@ -251,7 +224,7 @@ static void test_statmount_zero_mask(void)
 	struct statmount sm;
 	int ret;
 
-	ret = statmount(root_id, 0, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount zero mask: %s\n",
 				      strerror(errno));
@@ -277,7 +250,7 @@ static void test_statmount_mnt_basic(void)
 	int ret;
 	uint64_t mask = STATMOUNT_MNT_BASIC;
 
-	ret = statmount(root_id, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount mnt basic: %s\n",
 				      strerror(errno));
@@ -337,7 +310,7 @@ static void test_statmount_sb_basic(void)
 	struct statx sx;
 	struct statfs sf;
 
-	ret = statmount(root_id, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount sb basic: %s\n",
 				      strerror(errno));
@@ -498,14 +471,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 	exactsize = sm->size;
 	shortsize = sizeof(*sm) + i;
 
-	ret = statmount(root_id, mask, sm, exactsize, 0);
+	ret = statmount(root_id, 0, mask, sm, exactsize, 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount exact size: %s\n",
 				      strerror(errno));
 		goto out;
 	}
 	errno = 0;
-	ret = statmount(root_id, mask, sm, shortsize, 0);
+	ret = statmount(root_id, 0, mask, sm, shortsize, 0);
 	if (ret != -1 || errno != EOVERFLOW) {
 		ksft_test_result_fail("should have failed with EOVERFLOW: %s\n",
 				      strerror(errno));
@@ -533,7 +506,7 @@ static void test_listmount_tree(void)
 	if (res == -1)
 		return;
 
-	num = res = listmount(LSMT_ROOT, 0, list, size, 0);
+	num = res = listmount(LSMT_ROOT, 0, 0, list, size, 0);
 	if (res == -1) {
 		ksft_test_result_fail("listmount: %s\n", strerror(errno));
 		return;
@@ -545,7 +518,7 @@ static void test_listmount_tree(void)
 	}
 
 	for (i = 0; i < size - step;) {
-		res = listmount(LSMT_ROOT, i ? list2[i - 1] : 0, list2 + i, step, 0);
+		res = listmount(LSMT_ROOT, 0, i ? list2[i - 1] : 0, list2 + i, step, 0);
 		if (res == -1)
 			ksft_test_result_fail("short listmount: %s\n",
 					      strerror(errno));
@@ -577,11 +550,11 @@ int main(void)
 	int ret;
 	uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |
 		STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT |
-		STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE;
+		STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID;
 
 	ksft_print_header();
 
-	ret = statmount(0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, NULL, 0, 0);
 	assert(ret == -1);
 	if (errno == ENOSYS)
 		ksft_exit_skip("statmount() syscall not supported\n");
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
new file mode 100644
index 000000000000..e044f5fc57fd
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <linux/nsfs.h>
+#include <linux/stat.h>
+
+#include "statmount.h"
+#include "../../kselftest.h"
+
+#define NSID_PASS 0
+#define NSID_FAIL 1
+#define NSID_SKIP 2
+#define NSID_ERROR 3
+
+static void handle_result(int ret, const char *testname)
+{
+	if (ret == NSID_PASS)
+		ksft_test_result_pass("%s\n", testname);
+	else if (ret == NSID_FAIL)
+		ksft_test_result_fail("%s\n", testname);
+	else if (ret == NSID_ERROR)
+		ksft_exit_fail_msg("%s\n", testname);
+	else
+		ksft_test_result_skip("%s\n", testname);
+}
+
+static inline int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
+		return -1;
+	}
+
+	if (!WIFEXITED(status)) {
+		ksft_print_msg(
+		       "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
+		       WIFSIGNALED(status), WTERMSIG(status));
+		return -1;
+	}
+
+	ret = WEXITSTATUS(status);
+	return ret;
+}
+
+static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id)
+{
+	int fd = open(mnt_ns, O_RDONLY);
+
+	if (fd < 0) {
+		ksft_print_msg("failed to open for ns %s: %s\n",
+			       mnt_ns, strerror(errno));
+		sleep(60);
+		return NSID_ERROR;
+	}
+
+	if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) {
+		ksft_print_msg("failed to get the nsid for ns %s: %s\n",
+			       mnt_ns, strerror(errno));
+		return NSID_ERROR;
+	}
+	close(fd);
+	return NSID_PASS;
+}
+
+static int get_mnt_id(const char *path, uint64_t *mnt_id)
+{
+	struct statx sx;
+	int ret;
+
+	ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx);
+	if (ret == -1) {
+		ksft_print_msg("retrieving unique mount ID for %s: %s\n", path,
+			       strerror(errno));
+		return NSID_ERROR;
+	}
+
+	if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) {
+		ksft_print_msg("no unique mount ID available for %s\n", path);
+		return NSID_ERROR;
+	}
+
+	*mnt_id = sx.stx_mnt_id;
+	return NSID_PASS;
+}
+
+static int write_file(const char *path, const char *val)
+{
+	int fd = open(path, O_WRONLY);
+	size_t len = strlen(val);
+	int ret;
+
+	if (fd == -1) {
+		ksft_print_msg("opening %s for write: %s\n", path, strerror(errno));
+		return NSID_ERROR;
+	}
+
+	ret = write(fd, val, len);
+	if (ret == -1) {
+		ksft_print_msg("writing to %s: %s\n", path, strerror(errno));
+		return NSID_ERROR;
+	}
+	if (ret != len) {
+		ksft_print_msg("short write to %s\n", path);
+		return NSID_ERROR;
+	}
+
+	ret = close(fd);
+	if (ret == -1) {
+		ksft_print_msg("closing %s\n", path);
+		return NSID_ERROR;
+	}
+
+	return NSID_PASS;
+}
+
+static int setup_namespace(void)
+{
+	int ret;
+	char buf[32];
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+
+	ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+	if (ret == -1)
+		ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
+				   strerror(errno));
+
+	sprintf(buf, "0 %d 1", uid);
+	ret = write_file("/proc/self/uid_map", buf);
+	if (ret != NSID_PASS)
+		return ret;
+	ret = write_file("/proc/self/setgroups", "deny");
+	if (ret != NSID_PASS)
+		return ret;
+	sprintf(buf, "0 %d 1", gid);
+	ret = write_file("/proc/self/gid_map", buf);
+	if (ret != NSID_PASS)
+		return ret;
+
+	ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+	if (ret == -1) {
+		ksft_print_msg("making mount tree private: %s\n",
+			       strerror(errno));
+		return NSID_ERROR;
+	}
+
+	return NSID_PASS;
+}
+
+static int _test_statmount_mnt_ns_id(void)
+{
+	struct statmount sm;
+	uint64_t mnt_ns_id;
+	uint64_t root_id;
+	int ret;
+
+	ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id);
+	if (ret != NSID_PASS)
+		return ret;
+
+	ret = get_mnt_id("/", &root_id);
+	if (ret != NSID_PASS)
+		return ret;
+
+	ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
+	if (ret == -1) {
+		ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
+		return NSID_ERROR;
+	}
+
+	if (sm.size != sizeof(sm)) {
+		ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+			       (uint32_t)sizeof(sm));
+		return NSID_FAIL;
+	}
+	if (sm.mask != STATMOUNT_MNT_NS_ID) {
+		ksft_print_msg("statmount mnt ns id unavailable\n");
+		return NSID_SKIP;
+	}
+
+	if (sm.mnt_ns_id != mnt_ns_id) {
+		ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n",
+			       (unsigned long long)sm.mnt_ns_id,
+			       (unsigned long long)mnt_ns_id);
+		return NSID_FAIL;
+	}
+
+	return NSID_PASS;
+}
+
+static void test_statmount_mnt_ns_id(void)
+{
+	pid_t pid;
+	int ret;
+
+	pid = fork();
+	if (pid < 0)
+		ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno));
+
+	/* We're the original pid, wait for the result. */
+	if (pid != 0) {
+		ret = wait_for_pid(pid);
+		handle_result(ret, "test statmount ns id");
+		return;
+	}
+
+	ret = setup_namespace();
+	if (ret != NSID_PASS)
+		exit(ret);
+	ret = _test_statmount_mnt_ns_id();
+	exit(ret);
+}
+
+static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts)
+{
+	uint64_t list[256];
+	uint64_t mnt_ns_id;
+	uint64_t nr_mounts;
+	char buf[256];
+	int ret;
+
+	/* Get the mount ns id for our child. */
+	snprintf(buf, sizeof(buf), "/proc/%lu/ns/mnt", (unsigned long)pid);
+	ret = get_mnt_ns_id(buf, &mnt_ns_id);
+
+	nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0);
+	if (nr_mounts == (uint64_t)-1) {
+		ksft_print_msg("listmount: %s\n", strerror(errno));
+		return NSID_ERROR;
+	}
+
+	if (nr_mounts != child_nr_mounts) {
+		ksft_print_msg("listmount results is %zi != %zi\n", nr_mounts,
+			       child_nr_mounts);
+		return NSID_FAIL;
+	}
+
+	/* Validate that all of our entries match our mnt_ns_id. */
+	for (int i = 0; i < nr_mounts; i++) {
+		struct statmount sm;
+
+		ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm,
+				sizeof(sm), 0);
+		if (ret < 0) {
+			ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
+			return NSID_ERROR;
+		}
+
+		if (sm.mask != STATMOUNT_MNT_NS_ID) {
+			ksft_print_msg("statmount mnt ns id unavailable\n");
+			return NSID_SKIP;
+		}
+
+		if (sm.mnt_ns_id != mnt_ns_id) {
+			ksft_print_msg("listmount gave us the wrong ns id: 0x%llx != 0x%llx\n",
+				       (unsigned long long)sm.mnt_ns_id,
+				       (unsigned long long)mnt_ns_id);
+			return NSID_FAIL;
+		}
+	}
+
+	return NSID_PASS;
+}
+
+static void test_listmount_ns(void)
+{
+	uint64_t nr_mounts;
+	char pval;
+	int child_ready_pipe[2];
+	int parent_ready_pipe[2];
+	pid_t pid;
+	int ret, child_ret;
+
+	if (pipe(child_ready_pipe) < 0)
+		ksft_exit_fail_msg("failed to create the child pipe: %s\n",
+				   strerror(errno));
+	if (pipe(parent_ready_pipe) < 0)
+		ksft_exit_fail_msg("failed to create the parent pipe: %s\n",
+				   strerror(errno));
+
+	pid = fork();
+	if (pid < 0)
+		ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno));
+
+	if (pid == 0) {
+		char cval;
+		uint64_t list[256];
+
+		close(child_ready_pipe[0]);
+		close(parent_ready_pipe[1]);
+
+		ret = setup_namespace();
+		if (ret != NSID_PASS)
+			exit(ret);
+
+		nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 256, 0);
+		if (nr_mounts == (uint64_t)-1) {
+			ksft_print_msg("listmount: %s\n", strerror(errno));
+			exit(NSID_FAIL);
+		}
+
+		/*
+		 * Tell our parent how many mounts we have, and then wait for it
+		 * to tell us we're done.
+		 */
+		write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts));
+		read(parent_ready_pipe[0], &cval, sizeof(cval));
+		exit(NSID_PASS);
+	}
+
+	close(child_ready_pipe[1]);
+	close(parent_ready_pipe[0]);
+
+	/* Wait until the child has created everything. */
+	if (read(child_ready_pipe[0], &nr_mounts, sizeof(nr_mounts)) !=
+	    sizeof(nr_mounts))
+		ret = NSID_ERROR;
+
+	ret = validate_external_listmount(pid, nr_mounts);
+
+	if (write(parent_ready_pipe[1], &pval, sizeof(pval)) != sizeof(pval))
+		ret = NSID_ERROR;
+
+	child_ret = wait_for_pid(pid);
+	if (child_ret != NSID_PASS)
+		ret = child_ret;
+	handle_result(ret, "test listmount ns id");
+}
+
+int main(void)
+{
+	int ret;
+
+	ksft_print_header();
+	ret = statmount(0, 0, 0, NULL, 0, 0);
+	assert(ret == -1);
+	if (errno == ENOSYS)
+		ksft_exit_skip("statmount() syscall not supported\n");
+
+	ksft_set_plan(2);
+	test_statmount_mnt_ns_id();
+	test_listmount_ns();
+
+	if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+}

From c72b6b72240508f2ed9308f0d845e3cd35a92759 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 15:40:50 -0400
Subject: [PATCH 13/20] fs: rename show_mnt_opts -> show_vfsmnt_opts

This name is more consistent with what the helper does, which is to just
show the vfsmount options.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/fb363c62ffbf78a18095d596a19b8412aa991251.1719257716.git.josef@toxicpanda.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/proc_namespace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0a808951b7d3..e133b507ddf3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -61,7 +61,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 	return security_sb_show_options(m, sb);
 }
 
-static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt)
 {
 	static const struct proc_fs_opts mnt_opts[] = {
 		{ MNT_NOSUID, ",nosuid" },
@@ -124,7 +124,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 	err = show_sb_opts(m, sb);
 	if (err)
 		goto out;
-	show_mnt_opts(m, mnt);
+	show_vfsmnt_opts(m, mnt);
 	if (sb->s_op->show_options)
 		err = sb->s_op->show_options(m, mnt_path.dentry);
 	seq_puts(m, " 0 0\n");
@@ -153,7 +153,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 		goto out;
 
 	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
-	show_mnt_opts(m, mnt);
+	show_vfsmnt_opts(m, mnt);
 
 	/* Tagged fields ("foo:X" or "bar") */
 	if (IS_MNT_SHARED(r))

From f9af549d1fd31487bbbc666b5b158cfc940ccc17 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 15:40:52 -0400
Subject: [PATCH 14/20] fs: export mount options via statmount()

statmount() can export arbitrary strings, so utilize the __spare1 slot
for a mnt_opts string pointer, and then support asking for and setting
the mount options during statmount().  This calls into the helper for
showing mount options, which already uses a seq_file, so fits in nicely
with our existing mechanism for exporting strings via statmount().

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/3aa6bf8bd5d0a21df9ebd63813af8ab532c18276.1719257716.git.josef@toxicpanda.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
[brauner: only call sb->s_op->show_options()]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 37 ++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/mount.h |  3 ++-
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index a989e89b0a10..c53a0ee748c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4980,6 +4980,34 @@ static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
 	s->sm.mnt_ns_id = ns->seq;
 }
 
+static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct super_block *sb = mnt->mnt_sb;
+	int err;
+
+	if (sb->s_op->show_options) {
+		size_t start = seq->count;
+
+		err = sb->s_op->show_options(seq, mnt->mnt_root);
+		if (err)
+			return err;
+
+		if (unlikely(seq_has_overflowed(seq)))
+			return -EAGAIN;
+
+		if (seq->count == start)
+			return 0;
+
+		/* skip leading comma */
+		memmove(seq->buf + start, seq->buf + start + 1,
+			seq->count - start - 1);
+		seq->count--;
+	}
+
+	return 0;
+}
+
 static int statmount_string(struct kstatmount *s, u64 flag)
 {
 	int ret;
@@ -5000,6 +5028,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
 		sm->mnt_point = seq->count;
 		ret = statmount_mnt_point(s, seq);
 		break;
+	case STATMOUNT_MNT_OPTS:
+		sm->mnt_opts = seq->count;
+		ret = statmount_mnt_opts(s, seq);
+		break;
 	default:
 		WARN_ON_ONCE(true);
 		return -EINVAL;
@@ -5130,6 +5162,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 	if (!err && s->mask & STATMOUNT_MNT_POINT)
 		err = statmount_string(s, STATMOUNT_MNT_POINT);
 
+	if (!err && s->mask & STATMOUNT_MNT_OPTS)
+		err = statmount_string(s, STATMOUNT_MNT_OPTS);
+
 	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
 		statmount_mnt_ns_id(s, ns);
 
@@ -5151,7 +5186,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
 }
 
 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
-			      STATMOUNT_FS_TYPE)
+			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
 
 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 			      struct statmount __user *buf, size_t bufsize,
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index ee1559cd6764..225bc366ffcb 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -154,7 +154,7 @@ struct mount_attr {
  */
 struct statmount {
 	__u32 size;		/* Total size, including strings */
-	__u32 __spare1;
+	__u32 mnt_opts;		/* [str] Mount options of the mount */
 	__u64 mask;		/* What results were written */
 	__u32 sb_dev_major;	/* Device ID */
 	__u32 sb_dev_minor;
@@ -206,6 +206,7 @@ struct mnt_id_req {
 #define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
 #define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
 #define STATMOUNT_MNT_NS_ID		0x00000040U	/* Want/got mnt_ns_id */
+#define STATMOUNT_MNT_OPTS		0x00000080U	/* Want/got mnt_opts */
 
 /*
  * Special @mnt_id values that can be passed to listmount

From d842379313a2c205dae64dbfd0aa13dba142a867 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 25 Jun 2024 14:33:45 +0200
Subject: [PATCH 15/20] fs: use guard for namespace_sem in statmount()

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 262 +++++++++++++++++++++++++------------------------
 1 file changed, 134 insertions(+), 128 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index e871f73c4c8c..a989e89b0a10 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5040,104 +5040,6 @@ static int copy_statmount_to_user(struct kstatmount *s)
 	return 0;
 }
 
-static int do_statmount(struct kstatmount *s)
-{
-	struct mount *m = real_mount(s->mnt);
-	struct mnt_namespace *ns = m->mnt_ns;
-	int err;
-
-	/*
-	 * Don't trigger audit denials. We just want to determine what
-	 * mounts to show users.
-	 */
-	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
-	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = security_sb_statfs(s->mnt->mnt_root);
-	if (err)
-		return err;
-
-	if (s->mask & STATMOUNT_SB_BASIC)
-		statmount_sb_basic(s);
-
-	if (s->mask & STATMOUNT_MNT_BASIC)
-		statmount_mnt_basic(s);
-
-	if (s->mask & STATMOUNT_PROPAGATE_FROM)
-		statmount_propagate_from(s);
-
-	if (s->mask & STATMOUNT_FS_TYPE)
-		err = statmount_string(s, STATMOUNT_FS_TYPE);
-
-	if (!err && s->mask & STATMOUNT_MNT_ROOT)
-		err = statmount_string(s, STATMOUNT_MNT_ROOT);
-
-	if (!err && s->mask & STATMOUNT_MNT_POINT)
-		err = statmount_string(s, STATMOUNT_MNT_POINT);
-
-	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
-		statmount_mnt_ns_id(s, ns);
-
-	if (err)
-		return err;
-
-	return 0;
-}
-
-static inline bool retry_statmount(const long ret, size_t *seq_size)
-{
-	if (likely(ret != -EAGAIN))
-		return false;
-	if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
-		return false;
-	if (unlikely(*seq_size > MAX_RW_COUNT))
-		return false;
-	return true;
-}
-
-static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
-			      struct statmount __user *buf, size_t bufsize,
-			      size_t seq_size)
-{
-	if (!access_ok(buf, bufsize))
-		return -EFAULT;
-
-	memset(ks, 0, sizeof(*ks));
-	ks->mask = kreq->param;
-	ks->buf = buf;
-	ks->bufsize = bufsize;
-	ks->seq.size = seq_size;
-	ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
-	if (!ks->seq.buf)
-		return -ENOMEM;
-	return 0;
-}
-
-static int copy_mnt_id_req(const struct mnt_id_req __user *req,
-			   struct mnt_id_req *kreq)
-{
-	int ret;
-	size_t usize;
-
-	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
-
-	ret = get_user(usize, &req->size);
-	if (ret)
-		return -EFAULT;
-	if (unlikely(usize > PAGE_SIZE))
-		return -E2BIG;
-	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
-		return -EINVAL;
-	memset(kreq, 0, sizeof(*kreq));
-	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
-	if (ret)
-		return ret;
-	if (kreq->spare != 0)
-		return -EINVAL;
-	return 0;
-}
-
 static struct mount *listmnt_next(struct mount *curr, bool reverse)
 {
 	struct rb_node *node;
@@ -5177,6 +5079,130 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 	return 0;
 }
 
+static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
+			struct mnt_namespace *ns)
+{
+	struct path root __free(path_put) = {};
+	struct mount *m;
+	int err;
+
+	/* Has the namespace already been emptied? */
+	if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+		return -ENOENT;
+
+	s->mnt = lookup_mnt_in_ns(mnt_id, ns);
+	if (!s->mnt)
+		return -ENOENT;
+
+	err = grab_requested_root(ns, &root);
+	if (err)
+		return err;
+
+	/*
+	 * Don't trigger audit denials. We just want to determine what
+	 * mounts to show users.
+	 */
+	m = real_mount(s->mnt);
+	if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = security_sb_statfs(s->mnt->mnt_root);
+	if (err)
+		return err;
+
+	s->root = root;
+	if (s->mask & STATMOUNT_SB_BASIC)
+		statmount_sb_basic(s);
+
+	if (s->mask & STATMOUNT_MNT_BASIC)
+		statmount_mnt_basic(s);
+
+	if (s->mask & STATMOUNT_PROPAGATE_FROM)
+		statmount_propagate_from(s);
+
+	if (s->mask & STATMOUNT_FS_TYPE)
+		err = statmount_string(s, STATMOUNT_FS_TYPE);
+
+	if (!err && s->mask & STATMOUNT_MNT_ROOT)
+		err = statmount_string(s, STATMOUNT_MNT_ROOT);
+
+	if (!err && s->mask & STATMOUNT_MNT_POINT)
+		err = statmount_string(s, STATMOUNT_MNT_POINT);
+
+	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
+		statmount_mnt_ns_id(s, ns);
+
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static inline bool retry_statmount(const long ret, size_t *seq_size)
+{
+	if (likely(ret != -EAGAIN))
+		return false;
+	if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
+		return false;
+	if (unlikely(*seq_size > MAX_RW_COUNT))
+		return false;
+	return true;
+}
+
+#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
+			      STATMOUNT_FS_TYPE)
+
+static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
+			      struct statmount __user *buf, size_t bufsize,
+			      size_t seq_size)
+{
+	if (!access_ok(buf, bufsize))
+		return -EFAULT;
+
+	memset(ks, 0, sizeof(*ks));
+	ks->mask = kreq->param;
+	ks->buf = buf;
+	ks->bufsize = bufsize;
+
+	if (ks->mask & STATMOUNT_STRING_REQ) {
+		if (bufsize == sizeof(ks->sm))
+			return -EOVERFLOW;
+
+		ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
+		if (!ks->seq.buf)
+			return -ENOMEM;
+
+		ks->seq.size = seq_size;
+	}
+
+	return 0;
+}
+
+static int copy_mnt_id_req(const struct mnt_id_req __user *req,
+			   struct mnt_id_req *kreq)
+{
+	int ret;
+	size_t usize;
+
+	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
+
+	ret = get_user(usize, &req->size);
+	if (ret)
+		return -EFAULT;
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
+		return -EINVAL;
+	memset(kreq, 0, sizeof(*kreq));
+	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+	if (ret)
+		return ret;
+	if (kreq->spare != 0)
+		return -EINVAL;
+	return 0;
+}
+
 /*
  * If the user requested a specific mount namespace id, look that up and return
  * that, or if not simply grab a passive reference on our mount namespace and
@@ -5195,9 +5221,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 		unsigned int, flags)
 {
 	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
-	struct vfsmount *mnt;
+	struct kstatmount *ks __free(kfree) = NULL;
 	struct mnt_id_req kreq;
-	struct kstatmount ks;
 	/* We currently support retrieval of 3 strings. */
 	size_t seq_size = 3 * PATH_MAX;
 	int ret;
@@ -5217,40 +5242,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
 		return -ENOENT;
 
+	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
+	if (!ks)
+		return -ENOMEM;
+
 retry:
-	ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
+	ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
 	if (ret)
 		return ret;
 
-	down_read(&namespace_sem);
-	/* Has the namespace already been emptied? */
-	if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
-		up_read(&namespace_sem);
-		kvfree(ks.seq.buf);
-		return -ENOENT;
-	}
-
-	mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
-	if (!mnt) {
-		up_read(&namespace_sem);
-		kvfree(ks.seq.buf);
-		return -ENOENT;
-	}
-
-	ks.mnt = mnt;
-	ret = grab_requested_root(ns, &ks.root);
-	if (ret) {
-		up_read(&namespace_sem);
-		kvfree(ks.seq.buf);
-		return ret;
-	}
-	ret = do_statmount(&ks);
-	path_put(&ks.root);
-	up_read(&namespace_sem);
+	scoped_guard(rwsem_read, &namespace_sem)
+		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
 
 	if (!ret)
-		ret = copy_statmount_to_user(&ks);
-	kvfree(ks.seq.buf);
+		ret = copy_statmount_to_user(ks);
+	kvfree(ks->seq.buf);
 	if (retry_statmount(ret, &seq_size))
 		goto retry;
 	return ret;

From e2f718e25537d3567ababbe4276306efd3f23f47 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 24 Jun 2024 15:40:53 -0400
Subject: [PATCH 16/20] sefltests: extend the statmount test for mount options

Now that we support exporting mount options, via statmount(), add a test
to validate that it works.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/cabe09f0933d9c522da6e7b6cc160254f4f6c3b9.1719257716.git.josef@toxicpanda.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
[brauner: simplify and fix]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/statmount/statmount_test.c    | 91 ++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index 4f7023c2de77..ece005864863 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -107,6 +107,7 @@ static char root_mntpoint[] = "/tmp/statmount_test_root.XXXXXX";
 static int orig_root;
 static uint64_t root_id, parent_id;
 static uint32_t old_root_id, old_parent_id;
+static FILE *f_mountinfo;
 
 static void cleanup_namespace(void)
 {
@@ -134,6 +135,11 @@ static void setup_namespace(void)
 	sprintf(buf, "0 %d 1", gid);
 	write_file("/proc/self/gid_map", buf);
 
+	f_mountinfo = fopen("/proc/self/mountinfo", "re");
+	if (!f_mountinfo)
+		ksft_exit_fail_msg("failed to open mountinfo: %s\n",
+				   strerror(errno));
+
 	ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
 	if (ret == -1)
 		ksft_exit_fail_msg("making mount tree private: %s\n",
@@ -435,6 +441,88 @@ static void test_statmount_fs_type(void)
 	free(sm);
 }
 
+static void test_statmount_mnt_opts(void)
+{
+	struct statmount *sm;
+	const char *statmount_opts;
+	char *line = NULL;
+	size_t len = 0;
+
+	sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
+			     0);
+	if (!sm) {
+		ksft_test_result_fail("statmount mnt opts: %s\n",
+				      strerror(errno));
+		return;
+	}
+
+	while (getline(&line, &len, f_mountinfo) != -1) {
+		int i;
+		char *p, *p2;
+		unsigned int old_mnt_id;
+
+		old_mnt_id = atoi(line);
+		if (old_mnt_id != sm->mnt_id_old)
+			continue;
+
+		for (p = line, i = 0; p && i < 5; i++)
+			p = strchr(p + 1, ' ');
+		if (!p)
+			continue;
+
+		p2 = strchr(p + 1, ' ');
+		if (!p2)
+			continue;
+		*p2 = '\0';
+		p = strchr(p2 + 1, '-');
+		if (!p)
+			continue;
+		for (p++, i = 0; p && i < 2; i++)
+			p = strchr(p + 1, ' ');
+		if (!p)
+			continue;
+		p++;
+
+		/* skip generic superblock options */
+		if (strncmp(p, "ro", 2) == 0)
+			p += 2;
+		else if (strncmp(p, "rw", 2) == 0)
+			p += 2;
+		if (*p == ',')
+			p++;
+		if (strncmp(p, "sync", 4) == 0)
+			p += 4;
+		if (*p == ',')
+			p++;
+		if (strncmp(p, "dirsync", 7) == 0)
+			p += 7;
+		if (*p == ',')
+			p++;
+		if (strncmp(p, "lazytime", 8) == 0)
+			p += 8;
+		if (*p == ',')
+			p++;
+		p2 = strrchr(p, '\n');
+		if (p2)
+			*p2 = '\0';
+
+		statmount_opts = sm->str + sm->mnt_opts;
+		if (strcmp(statmount_opts, p) != 0)
+			ksft_test_result_fail(
+				"unexpected mount options: '%s' != '%s'\n",
+				statmount_opts, p);
+		else
+			ksft_test_result_pass("statmount mount options\n");
+		free(sm);
+		free(line);
+		return;
+	}
+
+	ksft_test_result_fail("didnt't find mount entry\n");
+	free(sm);
+	free(line);
+}
+
 static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 {
 	struct statmount *sm;
@@ -561,7 +649,7 @@ int main(void)
 
 	setup_namespace();
 
-	ksft_set_plan(14);
+	ksft_set_plan(15);
 	test_listmount_empty_root();
 	test_statmount_zero_mask();
 	test_statmount_mnt_basic();
@@ -569,6 +657,7 @@ int main(void)
 	test_statmount_mnt_root();
 	test_statmount_mnt_point();
 	test_statmount_fs_type();
+	test_statmount_mnt_opts();
 	test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root");
 	test_statmount_string(STATMOUNT_MNT_POINT, str_off(mnt_point), "mount point");
 	test_statmount_string(STATMOUNT_FS_TYPE, str_off(fs_type), "fs type");

From 8d42877ad65b02741c9099392a001b7209baa5d4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 4 Jul 2024 17:00:19 +0200
Subject: [PATCH 17/20] fs: only copy to userspace on success in listmount()

Avoid copying when we failed to, or didn't have any mounts to list.

Fixes: cb54ef4f050e ("fs: don't copy to userspace under namespace semaphore") # mainline only
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index c53a0ee748c6..f44e5448c8a0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5401,6 +5401,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	scoped_guard(rwsem_read, &namespace_sem)
 		ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
 				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+	if (ret <= 0)
+		return ret;
 
 	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
 		return -EFAULT;

From 5e8a9cebc5580ca7c01d6c151017187785dc0dfe Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 5 Jul 2024 13:08:54 +0200
Subject: [PATCH 18/20] fs: find rootfs mount of the mount namespace

The method we used was predicated on the assumption that the mount
immediately following the root mount of the mount namespace would be the
rootfs mount of the namespace. That's not always the case though. For
example:

ID  PARENT ID
408 412       0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hosts /etc/hosts rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
409 414       0:61 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=64000k,uid=1000,gid=1000,inode64
410 412       0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/.containerenv /run/.containerenv rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
411 412       0:60 /containers/overlay-containers/bc391117192b32071b22ef2083ebe7735d5c390f87a5779e02faf79ba0746ceb/userdata/hostname /etc/hostname rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=954664k,nr_inodes=238666,mode=700,uid=1000,gid=1000,inode64
412 363       0:65 / / rw,relatime - overlay overlay rw,lowerdir=/home/user1/.local/share/containers/storage/overlay/l/JS65SUCGTPCP2EEBHLRP4UCFI5:/home/user1/.local/share/containers/storage/overlay/l/DLW22KVDWUNI4242D6SDJ5GKCL [...]
413 412       0:68 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
414 412       0:69 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,uid=1000,gid=1000,inode64
415 412       0:70 / /sys ro,nosuid,nodev,noexec,relatime - sysfs sysfs rw
416 414       0:71 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666
417 414       0:67 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
418 415       0:27 / /sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
419 414       0:6 /null /dev/null rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
420 414       0:6 /zero /dev/zero rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
422 414       0:6 /full /dev/full rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
423 414       0:6 /tty /dev/tty rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
430 414       0:6 /random /dev/random rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
431 414       0:6 /urandom /dev/urandom rw,nosuid,noexec - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
433 413       0:72 / /proc/acpi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
440 413       0:6 /null /proc/kcore ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
441 413       0:6 /null /proc/keys ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
442 413       0:6 /null /proc/timer_list ro,nosuid - devtmpfs devtmpfs rw,size=4096k,nr_inodes=1179282,mode=755,inode64
443 413       0:73 / /proc/scsi ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
444 415       0:74 / /sys/firmware ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
445 415       0:75 / /sys/dev/block ro,relatime - tmpfs tmpfs rw,size=0k,uid=1000,gid=1000,inode64
446 413       0:68 /bus /proc/bus ro,nosuid,nodev,noexec,relatime - proc proc rw
447 413       0:68 /fs /proc/fs ro,nosuid,nodev,noexec,relatime - proc proc rw
448 413       0:68 /irq /proc/irq ro,nosuid,nodev,noexec,relatime - proc proc rw
449 413       0:68 /sys /proc/sys ro,nosuid,nodev,noexec,relatime - proc proc rw
450 413       0:68 /sysrq-trigger /proc/sysrq-trigger ro,nosuid,nodev,noexec,relatime - proc proc rw
364 414       0:71 /0 /dev/console rw,relatime - devpts devpts rw,gid=100004,mode=620,ptmxmode=666

In this mount table the root mount of the mount namespace is the mount
with id 363 (It isn't visible because it's literally just what the
rootfs mount is mounted upon and usually it's just a copy of the real
rootfs).

The rootfs mount that's mounted on the root mount of the mount namespace
is the mount with id 412. But the mount namespace contains mounts that
were created before the rootfs mount and thus have earlier mount ids. So
the first call to listmnt_next() would return the mount with the mount
id 408 and not the rootfs mount.

So we need to find the actual rootfs mount mounted on the root mount of
the mount namespace. This logic is also present in mntns_install() where
vfs_path_lookup() is used. We can't use this though as we're holding the
namespace semaphore. We could look at the children of the root mount of
the mount namespace directly but that also seems a bit out of place
while we have the rbtree. So let's just iterate through the rbtree
starting from the root mount of the mount namespace and find the mount
whose parent is the root mount of the mount namespace. That mount will
usually appear very early in the rbtree and afaik there can only be one.
IOW, it would be very strange if we ended up with a root mount of a
mount namespace that has shadow mounts.

Fixes: 0a3deb11858a ("fs: Allow listmount() in foreign mount namespace") # mainline only
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index f44e5448c8a0..56c1dcffb4dc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5086,7 +5086,7 @@ static struct mount *listmnt_next(struct mount *curr, bool reverse)
 
 static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 {
-	struct mount *first;
+	struct mount *first, *child;
 
 	rwsem_assert_held(&namespace_sem);
 
@@ -5103,10 +5103,16 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 	if (RB_EMPTY_ROOT(&ns->mounts))
 		return -ENOENT;
 
-	first = listmnt_next(ns->root, false);
-	if (!first)
-		return -ENOENT;
-	root->mnt = mntget(&first->mnt);
+	first = child = ns->root;
+	for (;;) {
+		child = listmnt_next(child, false);
+		if (!child)
+			return -ENOENT;
+		if (child->mnt_parent == first)
+			break;
+	}
+
+	root->mnt = mntget(&child->mnt);
 	root->dentry = dget(root->mnt->mnt_root);
 	return 0;
 }

From 80744d0e7a81c35795a2754049eafff76abbe371 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 4 Jul 2024 10:58:34 +0200
Subject: [PATCH 19/20] fs: refuse mnt id requests with invalid ids early

Unique mount ids start past the last valid old mount id value to not
confuse the two so reject invalid values early in copy_mnt_id_req().

Link: https://lore.kernel.org/r/20240704-work-mount-fixes-v1-1-d007c990de5f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 56c1dcffb4dc..8e3603558e59 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -70,7 +70,8 @@ static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 
 /* Don't allow confusion with old 32bit mount ID */
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+#define MNT_UNIQUE_ID_OFFSET (1ULL << 32)
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
 
 static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -5241,6 +5242,9 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 		return ret;
 	if (kreq->spare != 0)
 		return -EINVAL;
+	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
+		return -EINVAL;
 	return 0;
 }
 

From 4bed843b10004d9101b49ac7270131051c39a92b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 4 Jul 2024 10:58:35 +0200
Subject: [PATCH 20/20] fs: reject invalid last mount id early

Unique mount ids start past the last valid old mount id value to not
confuse the two. If a last mount id has been specified, reject any
invalid values early.

Link: https://lore.kernel.org/r/20240704-work-mount-fixes-v1-2-d007c990de5f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 8e3603558e59..ade356c7f14a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5375,6 +5375,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	const size_t maxcount = 1000000;
 	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
 	struct mnt_id_req kreq;
+	u64 last_mnt_id;
 	ssize_t ret;
 
 	if (flags & ~LISTMOUNT_REVERSE)
@@ -5395,6 +5396,11 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	if (ret)
 		return ret;
 
+	last_mnt_id = kreq.param;
+	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+		return -EINVAL;
+
 	kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
 				  GFP_KERNEL_ACCOUNT);
 	if (!kmnt_ids)
@@ -5409,7 +5415,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 		return -ENOENT;
 
 	scoped_guard(rwsem_read, &namespace_sem)
-		ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
+		ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
 				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
 	if (ret <= 0)
 		return ret;