linux/kernel/ns_cgroup.c

/*
 * ns_cgroup.c - namespace cgroup subsystem
 *
 * Copyright 2006, 2007 IBM Corp
 */

#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>

struct ns_cgroup {
	struct cgroup_subsys_state css;
};

struct cgroup_subsys ns_subsys;

static inline struct ns_cgroup *cgroup_to_ns(
		struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
			    struct ns_cgroup, css);
}

int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
{
	char name[PROC_NUMBUF];

	snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
	return cgroup_clone(task, &ns_subsys, name);
}

/*
 * Rules:
 *   1. you can only enter a cgroup which is a child of your current
 *     cgroup
 *   2. you can only place another process into a cgroup if
 *     a. you have CAP_SYS_ADMIN
 *     b. your cgroup is an ancestor of task's destination cgroup
 *       (hence either you are in the same cgroup as task, or in an
 *        ancestor cgroup thereof)
 */
static int ns_can_attach(struct cgroup_subsys *ss,
		struct cgroup *new_cgroup, struct task_struct *task)
{
	struct cgroup *orig;

	if (current != task) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;

		if (!cgroup_is_descendant(new_cgroup))
			return -EPERM;
	}

	if (atomic_read(&new_cgroup->count) != 0)
		return -EPERM;

	orig = task_cgroup(task, ns_subsys_id);
	if (orig && orig != new_cgroup->parent)
		return -EPERM;

	return 0;
}

/*
 * Rules: you can only create a cgroup if
 *     1. you are capable(CAP_SYS_ADMIN)
 *     2. the target cgroup is a descendant of your own cgroup
 */
static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
						struct cgroup *cgroup)
{
	struct ns_cgroup *ns_cgroup;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);
	if (!cgroup_is_descendant(cgroup))
		return ERR_PTR(-EPERM);

	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
	if (!ns_cgroup)
		return ERR_PTR(-ENOMEM);
	return &ns_cgroup->css;
}

static void ns_destroy(struct cgroup_subsys *ss,
			struct cgroup *cgroup)
{
	struct ns_cgroup *ns_cgroup;

	ns_cgroup = cgroup_to_ns(cgroup);
	kfree(ns_cgroup);
}

struct cgroup_subsys ns_subsys = {
	.name = "ns",
	.can_attach = ns_can_attach,
	.create = ns_create,
	.destroy  = ns_destroy,
	.subsys_id = ns_subsys_id,
};
cgroups: implement namespace tracking subsystem When a task enters a new namespace via a clone() or unshare(), a new cgroup is created and the task moves into it. This version names cgroups which are automatically created using cgroup_clone() as "node_<pid>" where pid is the pid of the unsharing or cloned process. (Thanks Pavel for the idea) This is safe because if the process unshares again, it will create /cgroups/(...)/node_<pid>/node_<pid> The only possibilities (AFAICT) for a -EEXIST on unshare are 1. pid wraparound 2. a process fails an unshare, then tries again. Case 1 is unlikely enough that I ignore it (at least for now). In case 2, the node_<pid> will be empty and can be rmdir'ed to make the subsequent unshare() succeed. Changelog: Name cloned cgroups as "node_<pid>". [clg@fr.ibm.com: fix order of cgroup subsystems in init/Kconfig] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2007-10-18 23:39:45 -07:00			`/*`
			`* ns_cgroup.c - namespace cgroup subsystem`
			`*`
			`* Copyright 2006, 2007 IBM Corp`
			`*/`

			`#include <linux/module.h>`
			`#include <linux/cgroup.h>`
			`#include <linux/fs.h>`
cgroup_clone: use pid of newly created task for new cgroup cgroup_clone creates a new cgroup with the pid of the task. This works correctly for unshare, but for clone cgroup_clone is called from copy_namespaces inside copy_process, which happens before the new pid is created. As a result, the new cgroup was created with current's pid. This patch: 1. Moves the call inside copy_process to after the new pid is created 2. Passes the struct pid into ns_cgroup_clone (as it is not yet attached to the task) 3. Passes a name from ns_cgroup_clone() into cgroup_clone() so as to keep cgroup_clone() itself simpler 4. Uses pid_vnr() to get the process id value, so that the pid used to name the new cgroup is always the pid as it would be known to the task which did the cloning or unsharing. I think that is the most intuitive thing to do. This way, task t1 does clone(CLONE_NEWPID) to get t2, which does clone(CLONE_NEWPID) to get t3, then the cgroup for t3 will be named for the pid by which t2 knows t3. (Thanks to Dan Smith for finding the main bug) Changelog: June 11: Incorporate Paul Menage's feedback: don't pass NULL to ns_cgroup_clone from unshare, and reduce patch size by using 'nodename' in cgroup_clone. June 10: Original version [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge Hallyn <serge@us.ibm.com> Acked-by: Paul Menage <menage@google.com> Tested-by: Dan Smith <danms@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-25 01:47:06 -07:00			`#include <linux/proc_fs.h>`
kernel: explicitly include required header files under kernel/ Following an experimental deletion of the unnecessary directive #include <linux/slab.h> from the header file <linux/percpu.h>, these files under kernel/ were exposed as needing to include one of <linux/slab.h> or <linux/gfp.h>, so explicit includes were added where necessary. Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-29 00:59:25 -07:00			`#include <linux/slab.h>`
cgroups: kernel/ns_cgroup.c should #include <linux/nsproxy.h> Every file should include the headers containing the externs its global functions (in this case for ns_cgroup_clone()). Signed-off-by: Adrian Bunk <bunk@kernel.org> Acked-by: Serge Hallyn <serue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-29 00:59:55 -07:00			`#include <linux/nsproxy.h>`
cgroups: implement namespace tracking subsystem When a task enters a new namespace via a clone() or unshare(), a new cgroup is created and the task moves into it. This version names cgroups which are automatically created using cgroup_clone() as "node_<pid>" where pid is the pid of the unsharing or cloned process. (Thanks Pavel for the idea) This is safe because if the process unshares again, it will create /cgroups/(...)/node_<pid>/node_<pid> The only possibilities (AFAICT) for a -EEXIST on unshare are 1. pid wraparound 2. a process fails an unshare, then tries again. Case 1 is unlikely enough that I ignore it (at least for now). In case 2, the node_<pid> will be empty and can be rmdir'ed to make the subsequent unshare() succeed. Changelog: Name cloned cgroups as "node_<pid>". [clg@fr.ibm.com: fix order of cgroup subsystems in init/Kconfig] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2007-10-18 23:39:45 -07:00
			`struct ns_cgroup {`
			`struct cgroup_subsys_state css;`
			`};`

			`struct cgroup_subsys ns_subsys;`

			`static inline struct ns_cgroup *cgroup_to_ns(`
			`struct cgroup *cgroup)`
			`{`
			`return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),`
			`struct ns_cgroup, css);`
			`}`

cgroup_clone: use pid of newly created task for new cgroup cgroup_clone creates a new cgroup with the pid of the task. This works correctly for unshare, but for clone cgroup_clone is called from copy_namespaces inside copy_process, which happens before the new pid is created. As a result, the new cgroup was created with current's pid. This patch: 1. Moves the call inside copy_process to after the new pid is created 2. Passes the struct pid into ns_cgroup_clone (as it is not yet attached to the task) 3. Passes a name from ns_cgroup_clone() into cgroup_clone() so as to keep cgroup_clone() itself simpler 4. Uses pid_vnr() to get the process id value, so that the pid used to name the new cgroup is always the pid as it would be known to the task which did the cloning or unsharing. I think that is the most intuitive thing to do. This way, task t1 does clone(CLONE_NEWPID) to get t2, which does clone(CLONE_NEWPID) to get t3, then the cgroup for t3 will be named for the pid by which t2 knows t3. (Thanks to Dan Smith for finding the main bug) Changelog: June 11: Incorporate Paul Menage's feedback: don't pass NULL to ns_cgroup_clone from unshare, and reduce patch size by using 'nodename' in cgroup_clone. June 10: Original version [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge Hallyn <serge@us.ibm.com> Acked-by: Paul Menage <menage@google.com> Tested-by: Dan Smith <danms@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-25 01:47:06 -07:00			`int ns_cgroup_clone(struct task_struct task, struct pid pid)`
cgroups: implement namespace tracking subsystem When a task enters a new namespace via a clone() or unshare(), a new cgroup is created and the task moves into it. This version names cgroups which are automatically created using cgroup_clone() as "node_<pid>" where pid is the pid of the unsharing or cloned process. (Thanks Pavel for the idea) This is safe because if the process unshares again, it will create /cgroups/(...)/node_<pid>/node_<pid> The only possibilities (AFAICT) for a -EEXIST on unshare are 1. pid wraparound 2. a process fails an unshare, then tries again. Case 1 is unlikely enough that I ignore it (at least for now). In case 2, the node_<pid> will be empty and can be rmdir'ed to make the subsequent unshare() succeed. Changelog: Name cloned cgroups as "node_<pid>". [clg@fr.ibm.com: fix order of cgroup subsystems in init/Kconfig] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2007-10-18 23:39:45 -07:00			`{`
cgroup_clone: use pid of newly created task for new cgroup cgroup_clone creates a new cgroup with the pid of the task. This works correctly for unshare, but for clone cgroup_clone is called from copy_namespaces inside copy_process, which happens before the new pid is created. As a result, the new cgroup was created with current's pid. This patch: 1. Moves the call inside copy_process to after the new pid is created 2. Passes the struct pid into ns_cgroup_clone (as it is not yet attached to the task) 3. Passes a name from ns_cgroup_clone() into cgroup_clone() so as to keep cgroup_clone() itself simpler 4. Uses pid_vnr() to get the process id value, so that the pid used to name the new cgroup is always the pid as it would be known to the task which did the cloning or unsharing. I think that is the most intuitive thing to do. This way, task t1 does clone(CLONE_NEWPID) to get t2, which does clone(CLONE_NEWPID) to get t3, then the cgroup for t3 will be named for the pid by which t2 knows t3. (Thanks to Dan Smith for finding the main bug) Changelog: June 11: Incorporate Paul Menage's feedback: don't pass NULL to ns_cgroup_clone from unshare, and reduce patch size by using 'nodename' in cgroup_clone. June 10: Original version [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge Hallyn <serge@us.ibm.com> Acked-by: Paul Menage <menage@google.com> Tested-by: Dan Smith <danms@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-25 01:47:06 -07:00			`char name[PROC_NUMBUF];`

			`snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));`
			`return cgroup_clone(task, &ns_subsys, name);`
cgroups: implement namespace tracking subsystem When a task enters a new namespace via a clone() or unshare(), a new cgroup is created and the task moves into it. This version names cgroups which are automatically created using cgroup_clone() as "node_<pid>" where pid is the pid of the unsharing or cloned process. (Thanks Pavel for the idea) This is safe because if the process unshares again, it will create /cgroups/(...)/node_<pid>/node_<pid> The only possibilities (AFAICT) for a -EEXIST on unshare are 1. pid wraparound 2. a process fails an unshare, then tries again. Case 1 is unlikely enough that I ignore it (at least for now). In case 2, the node_<pid> will be empty and can be rmdir'ed to make the subsequent unshare() succeed. Changelog: Name cloned cgroups as "node_<pid>". [clg@fr.ibm.com: fix order of cgroup subsystems in init/Kconfig] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2007-10-18 23:39:45 -07:00			`}`

			`/*`
			`* Rules:`
			`* 1. you can only enter a cgroup which is a child of your current`
			`* cgroup`
			`* 2. you can only place another process into a cgroup if`
			`* a. you have CAP_SYS_ADMIN`
			`* b. your cgroup is an ancestor of task's destination cgroup`
			`* (hence either you are in the same cgroup as task, or in an`
			`* ancestor cgroup thereof)`
			`*/`
			`static int ns_can_attach(struct cgroup_subsys *ss,`
			`struct cgroup new_cgroup, struct task_struct task)`
			`{`
			`struct cgroup *orig;`

			`if (current != task) {`
			`if (!capable(CAP_SYS_ADMIN))`
			`return -EPERM;`

			`if (!cgroup_is_descendant(new_cgroup))`
			`return -EPERM;`
			`}`

			`if (atomic_read(&new_cgroup->count) != 0)`
			`return -EPERM;`

			`orig = task_cgroup(task, ns_subsys_id);`
			`if (orig && orig != new_cgroup->parent)`
			`return -EPERM;`

			`return 0;`
			`}`

			`/*`
			`* Rules: you can only create a cgroup if`
			`* 1. you are capable(CAP_SYS_ADMIN)`
			`* 2. the target cgroup is a descendant of your own cgroup`
			`*/`
			`static struct cgroup_subsys_state ns_create(struct cgroup_subsys ss,`
			`struct cgroup *cgroup)`
			`{`
			`struct ns_cgroup *ns_cgroup;`

			`if (!capable(CAP_SYS_ADMIN))`
			`return ERR_PTR(-EPERM);`
			`if (!cgroup_is_descendant(cgroup))`
			`return ERR_PTR(-EPERM);`

			`ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);`
			`if (!ns_cgroup)`
			`return ERR_PTR(-ENOMEM);`
			`return &ns_cgroup->css;`
			`}`

			`static void ns_destroy(struct cgroup_subsys *ss,`
			`struct cgroup *cgroup)`
			`{`
			`struct ns_cgroup *ns_cgroup;`

			`ns_cgroup = cgroup_to_ns(cgroup);`
			`kfree(ns_cgroup);`
			`}`

			`struct cgroup_subsys ns_subsys = {`
			`.name = "ns",`
			`.can_attach = ns_can_attach,`
			`.create = ns_create,`
			`.destroy = ns_destroy,`
			`.subsys_id = ns_subsys_id,`
			`};`