1cb6f0bae5
Pedro Pinto and later independently also Hyunwoo Kim and Wongi Lee reported
an issue that the tcx_entry can be released too early leading to a use
after free (UAF) when an active old-style ingress or clsact qdisc with a
shared tc block is later replaced by another ingress or clsact instance.
Essentially, the sequence to trigger the UAF (one example) can be as follows:
1. A network namespace is created
2. An ingress qdisc is created. This allocates a tcx_entry, and
&tcx_entry->miniq is stored in the qdisc's miniqp->p_miniq. At the
same time, a tcf block with index 1 is created.
3. chain0 is attached to the tcf block. chain0 must be connected to
the block linked to the ingress qdisc to later reach the function
tcf_chain0_head_change_cb_del() which triggers the UAF.
4. Create and graft a clsact qdisc. This causes the ingress qdisc
created in step 2 to be removed, thus freeing the previously linked
tcx_entry:
rtnetlink_rcv_msg()
=> tc_modify_qdisc()
=> qdisc_create()
=> clsact_init() [a]
=> qdisc_graft()
=> qdisc_destroy()
=> __qdisc_destroy()
=> ingress_destroy() [b]
=> tcx_entry_free()
=> kfree_rcu() // tcx_entry freed
5. Finally, the network namespace is closed. This registers the
cleanup_net worker, and during the process of releasing the
remaining clsact qdisc, it accesses the tcx_entry that was
already freed in step 4, causing the UAF to occur:
cleanup_net()
=> ops_exit_list()
=> default_device_exit_batch()
=> unregister_netdevice_many()
=> unregister_netdevice_many_notify()
=> dev_shutdown()
=> qdisc_put()
=> clsact_destroy() [c]
=> tcf_block_put_ext()
=> tcf_chain0_head_change_cb_del()
=> tcf_chain_head_change_item()
=> clsact_chain_head_change()
=> mini_qdisc_pair_swap() // UAF
There are also other variants, the gist is to add an ingress (or clsact)
qdisc with a specific shared block, then to replace that qdisc, waiting
for the tcx_entry kfree_rcu() to be executed and subsequently accessing
the current active qdisc's miniq one way or another.
The correct fix is to turn the miniq_active boolean into a counter. As can
be observed, the counter transitions from 0->1 at step 2 above, from 1->2
at [a] (so that the miniq object remains active during the replacement),
from 2->1 at [b], and finally from 1->0 at [c], with the eventual release.
The reference counter in general stays within the range [0,2], and it does
not need to be atomic since all access to the counter is protected by the
rtnl mutex. With this in place, the UAF no longer occurs and the tcx_entry
is freed at the correct time.
Fixes: e420bed025
("bpf: Add fd-based tcx multi-prog infra with link support")
Reported-by: Pedro Pinto <xten@osec.io>
Co-developed-by: Pedro Pinto <xten@osec.io>
Signed-off-by: Pedro Pinto <xten@osec.io>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hyunwoo Kim <v4bel@theori.io>
Cc: Wongi Lee <qwerty@theori.io>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240708133130.11609-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
375 lines
9.2 KiB
C
375 lines
9.2 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/* net/sched/sch_ingress.c - Ingress and clsact qdisc
|
|
*
|
|
* Authors: Jamal Hadi Salim 1999
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/rtnetlink.h>
|
|
|
|
#include <net/netlink.h>
|
|
#include <net/pkt_sched.h>
|
|
#include <net/pkt_cls.h>
|
|
#include <net/tcx.h>
|
|
|
|
struct ingress_sched_data {
|
|
struct tcf_block *block;
|
|
struct tcf_block_ext_info block_info;
|
|
struct mini_Qdisc_pair miniqp;
|
|
};
|
|
|
|
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Class op ->find for the ingress qdisc.
 *
 * Returns TC_H_MIN(@classid) + 1; @sch is not consulted.
 */
static unsigned long ingress_find(struct Qdisc *sch, u32 classid)
{
	unsigned long handle = TC_H_MIN(classid);

	return handle + 1;
}
|
|
|
|
/*
 * Class op ->bind_tcf for the ingress qdisc.
 *
 * @parent is unused; the result is whatever ingress_find() yields for
 * @classid.
 */
static unsigned long ingress_bind_filter(struct Qdisc *sch,
					 unsigned long parent, u32 classid)
{
	unsigned long cl = ingress_find(sch, classid);

	return cl;
}
|
|
|
|
/* Class op ->unbind_tcf, shared by ingress and clsact: intentionally empty. */
static void ingress_unbind_filter(struct Qdisc *sch, unsigned long cl)
{
	/* Nothing to undo: the bind op only computes a handle. */
}
|
|
|
|
/* Class op ->walk, shared by ingress and clsact: intentionally empty. */
static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	/* No classes are reported to the walker. */
}
|
|
|
|
static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl,
|
|
struct netlink_ext_ack *extack)
|
|
{
|
|
struct ingress_sched_data *q = qdisc_priv(sch);
|
|
|
|
return q->block;
|
|
}
|
|
|
|
/*
 * Chain-head change callback installed in the block_info of both the
 * ingress and the clsact qdisc.
 *
 * @tp_head: new head of the filter chain
 * @priv:    the struct mini_Qdisc_pair registered as
 *           chain_head_change_priv by the init functions
 */
static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
{
	mini_qdisc_pair_swap(priv, tp_head);
}
|
|
|
|
/* Qdisc op ->ingress_block_set: record @block_index in the block info. */
static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index)
{
	struct ingress_sched_data *priv = qdisc_priv(sch);

	priv->block_info.block_index = block_index;
}
|
|
|
|
static u32 ingress_ingress_block_get(struct Qdisc *sch)
|
|
{
|
|
struct ingress_sched_data *q = qdisc_priv(sch);
|
|
|
|
return q->block_info.block_index;
|
|
}
|
|
|
|
static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
|
|
struct netlink_ext_ack *extack)
|
|
{
|
|
struct ingress_sched_data *q = qdisc_priv(sch);
|
|
struct net_device *dev = qdisc_dev(sch);
|
|
struct bpf_mprog_entry *entry;
|
|
bool created;
|
|
int err;
|
|
|
|
if (sch->parent != TC_H_INGRESS)
|
|
return -EOPNOTSUPP;
|
|
|
|
net_inc_ingress_queue();
|
|
|
|
entry = tcx_entry_fetch_or_create(dev, true, &created);
|
|
if (!entry)
|
|
return -ENOMEM;
|
|
tcx_miniq_inc(entry);
|
|
mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq);
|
|
if (created)
|
|
tcx_entry_update(dev, entry, true);
|
|
|
|
q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
|
|
q->block_info.chain_head_change = clsact_chain_head_change;
|
|
q->block_info.chain_head_change_priv = &q->miniqp;
|
|
|
|
err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
|
|
if (err)
|
|
return err;
|
|
|
|
mini_qdisc_pair_block_init(&q->miniqp, q->block);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void ingress_destroy(struct Qdisc *sch)
|
|
{
|
|
struct ingress_sched_data *q = qdisc_priv(sch);
|
|
struct net_device *dev = qdisc_dev(sch);
|
|
struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress);
|
|
|
|
if (sch->parent != TC_H_INGRESS)
|
|
return;
|
|
|
|
tcf_block_put_ext(q->block, sch, &q->block_info);
|
|
|
|
if (entry) {
|
|
tcx_miniq_dec(entry);
|
|
if (!tcx_entry_is_active(entry)) {
|
|
tcx_entry_update(dev, NULL, true);
|
|
tcx_entry_free(entry);
|
|
}
|
|
}
|
|
|
|
net_dec_ingress_queue();
|
|
}
|
|
|
|
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
|
|
{
|
|
struct nlattr *nest;
|
|
|
|
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
|
|
if (nest == NULL)
|
|
goto nla_put_failure;
|
|
|
|
return nla_nest_end(skb, nest);
|
|
|
|
nla_put_failure:
|
|
nla_nest_cancel(skb, nest);
|
|
return -1;
|
|
}
|
|
|
|
static const struct Qdisc_class_ops ingress_class_ops = {
|
|
.flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
|
|
.leaf = ingress_leaf,
|
|
.find = ingress_find,
|
|
.walk = ingress_walk,
|
|
.tcf_block = ingress_tcf_block,
|
|
.bind_tcf = ingress_bind_filter,
|
|
.unbind_tcf = ingress_unbind_filter,
|
|
};
|
|
|
|
static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
|
|
.cl_ops = &ingress_class_ops,
|
|
.id = "ingress",
|
|
.priv_size = sizeof(struct ingress_sched_data),
|
|
.static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS,
|
|
.init = ingress_init,
|
|
.destroy = ingress_destroy,
|
|
.dump = ingress_dump,
|
|
.ingress_block_set = ingress_ingress_block_set,
|
|
.ingress_block_get = ingress_ingress_block_get,
|
|
.owner = THIS_MODULE,
|
|
};
|
|
MODULE_ALIAS_NET_SCH("ingress");
|
|
|
|
struct clsact_sched_data {
|
|
struct tcf_block *ingress_block;
|
|
struct tcf_block *egress_block;
|
|
struct tcf_block_ext_info ingress_block_info;
|
|
struct tcf_block_ext_info egress_block_info;
|
|
struct mini_Qdisc_pair miniqp_ingress;
|
|
struct mini_Qdisc_pair miniqp_egress;
|
|
};
|
|
|
|
/*
 * Class op ->find for the clsact qdisc.
 *
 * Returns TC_H_MIN(@classid) when it equals the ingress or the egress
 * value, and 0 for anything else. @sch is not consulted.
 */
static unsigned long clsact_find(struct Qdisc *sch, u32 classid)
{
	u32 minor = TC_H_MIN(classid);

	if (minor == TC_H_MIN(TC_H_MIN_INGRESS) ||
	    minor == TC_H_MIN(TC_H_MIN_EGRESS))
		return minor;

	return 0;
}
|
|
|
|
/*
 * Class op ->bind_tcf for the clsact qdisc.
 *
 * @parent is unused; the result is whatever clsact_find() yields for
 * @classid.
 */
static unsigned long clsact_bind_filter(struct Qdisc *sch,
					unsigned long parent, u32 classid)
{
	unsigned long cl = clsact_find(sch, classid);

	return cl;
}
|
|
|
|
static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl,
|
|
struct netlink_ext_ack *extack)
|
|
{
|
|
struct clsact_sched_data *q = qdisc_priv(sch);
|
|
|
|
switch (cl) {
|
|
case TC_H_MIN(TC_H_MIN_INGRESS):
|
|
return q->ingress_block;
|
|
case TC_H_MIN(TC_H_MIN_EGRESS):
|
|
return q->egress_block;
|
|
default:
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* Qdisc op ->ingress_block_set: record @block_index for the ingress side. */
static void clsact_ingress_block_set(struct Qdisc *sch, u32 block_index)
{
	struct clsact_sched_data *priv = qdisc_priv(sch);

	priv->ingress_block_info.block_index = block_index;
}
|
|
|
|
/* Qdisc op ->egress_block_set: record @block_index for the egress side. */
static void clsact_egress_block_set(struct Qdisc *sch, u32 block_index)
{
	struct clsact_sched_data *priv = qdisc_priv(sch);

	priv->egress_block_info.block_index = block_index;
}
|
|
|
|
static u32 clsact_ingress_block_get(struct Qdisc *sch)
|
|
{
|
|
struct clsact_sched_data *q = qdisc_priv(sch);
|
|
|
|
return q->ingress_block_info.block_index;
|
|
}
|
|
|
|
static u32 clsact_egress_block_get(struct Qdisc *sch)
|
|
{
|
|
struct clsact_sched_data *q = qdisc_priv(sch);
|
|
|
|
return q->egress_block_info.block_index;
|
|
}
|
|
|
|
static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
|
|
struct netlink_ext_ack *extack)
|
|
{
|
|
struct clsact_sched_data *q = qdisc_priv(sch);
|
|
struct net_device *dev = qdisc_dev(sch);
|
|
struct bpf_mprog_entry *entry;
|
|
bool created;
|
|
int err;
|
|
|
|
if (sch->parent != TC_H_CLSACT)
|
|
return -EOPNOTSUPP;
|
|
|
|
net_inc_ingress_queue();
|
|
net_inc_egress_queue();
|
|
|
|
entry = tcx_entry_fetch_or_create(dev, true, &created);
|
|
if (!entry)
|
|
return -ENOMEM;
|
|
tcx_miniq_inc(entry);
|
|
mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
|
|
if (created)
|
|
tcx_entry_update(dev, entry, true);
|
|
|
|
q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
|
|
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
|
|
q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
|
|
|
|
err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info,
|
|
extack);
|
|
if (err)
|
|
return err;
|
|
|
|
mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block);
|
|
|
|
entry = tcx_entry_fetch_or_create(dev, false, &created);
|
|
if (!entry)
|
|
return -ENOMEM;
|
|
tcx_miniq_inc(entry);
|
|
mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq);
|
|
if (created)
|
|
tcx_entry_update(dev, entry, false);
|
|
|
|
q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
|
|
q->egress_block_info.chain_head_change = clsact_chain_head_change;
|
|
q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
|
|
|
|
return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info, extack);
|
|
}
|
|
|
|
static void clsact_destroy(struct Qdisc *sch)
|
|
{
|
|
struct clsact_sched_data *q = qdisc_priv(sch);
|
|
struct net_device *dev = qdisc_dev(sch);
|
|
struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress);
|
|
struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress);
|
|
|
|
if (sch->parent != TC_H_CLSACT)
|
|
return;
|
|
|
|
tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
|
|
tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
|
|
|
|
if (ingress_entry) {
|
|
tcx_miniq_dec(ingress_entry);
|
|
if (!tcx_entry_is_active(ingress_entry)) {
|
|
tcx_entry_update(dev, NULL, true);
|
|
tcx_entry_free(ingress_entry);
|
|
}
|
|
}
|
|
|
|
if (egress_entry) {
|
|
tcx_miniq_dec(egress_entry);
|
|
if (!tcx_entry_is_active(egress_entry)) {
|
|
tcx_entry_update(dev, NULL, false);
|
|
tcx_entry_free(egress_entry);
|
|
}
|
|
}
|
|
|
|
net_dec_ingress_queue();
|
|
net_dec_egress_queue();
|
|
}
|
|
|
|
static const struct Qdisc_class_ops clsact_class_ops = {
|
|
.flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
|
|
.leaf = ingress_leaf,
|
|
.find = clsact_find,
|
|
.walk = ingress_walk,
|
|
.tcf_block = clsact_tcf_block,
|
|
.bind_tcf = clsact_bind_filter,
|
|
.unbind_tcf = ingress_unbind_filter,
|
|
};
|
|
|
|
static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
|
|
.cl_ops = &clsact_class_ops,
|
|
.id = "clsact",
|
|
.priv_size = sizeof(struct clsact_sched_data),
|
|
.static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS,
|
|
.init = clsact_init,
|
|
.destroy = clsact_destroy,
|
|
.dump = ingress_dump,
|
|
.ingress_block_set = clsact_ingress_block_set,
|
|
.egress_block_set = clsact_egress_block_set,
|
|
.ingress_block_get = clsact_ingress_block_get,
|
|
.egress_block_get = clsact_egress_block_get,
|
|
.owner = THIS_MODULE,
|
|
};
|
|
MODULE_ALIAS_NET_SCH("clsact");
|
|
|
|
/*
 * Module init: register the ingress qdisc, then the clsact qdisc.
 *
 * If the second registration fails the first one is rolled back, so the
 * module ends up with either both qdiscs registered or neither. Returns 0
 * on success or the error from register_qdisc().
 */
static int __init ingress_module_init(void)
{
	int ret = register_qdisc(&ingress_qdisc_ops);

	if (ret)
		return ret;

	ret = register_qdisc(&clsact_qdisc_ops);
	if (ret)
		unregister_qdisc(&ingress_qdisc_ops);

	return ret;
}
|
|
|
|
/* Module exit: unregister both qdiscs registered by ingress_module_init(). */
static void __exit ingress_module_exit(void)
{
	unregister_qdisc(&ingress_qdisc_ops);
	unregister_qdisc(&clsact_qdisc_ops);
}
|
|
|
|
module_init(ingress_module_init);
|
|
module_exit(ingress_module_exit);
|
|
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs");
|