7f60951ff4
The commit in the Fixes tag has shuffled some code.
Now 'mcg_num' is incremented before the kzalloc(). So if the memory
allocation fails, this increment must be undone.
Fixes: a926a903b7
("RDMA/rxe: Do not call dev_mc_add/del() under a spinlock")
Link: https://lore.kernel.org/r/fe137cd8b1f17593243aa73d59c18ea71ab9ee36.1653225896.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
480 lines
11 KiB
C
480 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
|
|
/*
|
|
* Copyright (c) 2022 Hewlett Packard Enterprise, Inc. All rights reserved.
|
|
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
|
|
* Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
|
|
*/
|
|
|
|
/*
|
|
* rxe_mcast.c implements driver support for multicast transport.
|
|
* It is based on two data structures struct rxe_mcg ('mcg') and
|
|
* struct rxe_mca ('mca'). An mcg is allocated each time a qp is
|
|
* attached to a new mgid for the first time. These are indexed by
|
|
* a red-black tree using the mgid. This data structure is searched
|
|
* for the mcg when a multicast packet is received and when another
|
|
* qp is attached to the same mgid. It is cleaned up when the last qp
|
|
* is detached from the mcg. Each time a qp is attached to an mcg an
|
|
* mca is created. It holds a pointer to the qp and is added to a list
|
|
* of qp's that are attached to the mcg. The qp_list is used to replicate
|
|
* mcast packets in the rxe receive path.
|
|
*/
|
|
|
|
#include "rxe.h"
|
|
|
|
/**
|
|
* rxe_mcast_add - add multicast address to rxe device
|
|
* @rxe: rxe device object
|
|
* @mgid: multicast address as a gid
|
|
*
|
|
* Returns 0 on success else an error
|
|
*/
|
|
static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
|
|
{
|
|
unsigned char ll_addr[ETH_ALEN];
|
|
|
|
ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
|
|
|
|
return dev_mc_add(rxe->ndev, ll_addr);
|
|
}
|
|
|
|
/**
|
|
* rxe_mcast_del - delete multicast address from rxe device
|
|
* @rxe: rxe device object
|
|
* @mgid: multicast address as a gid
|
|
*
|
|
* Returns 0 on success else an error
|
|
*/
|
|
static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
|
|
{
|
|
unsigned char ll_addr[ETH_ALEN];
|
|
|
|
ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
|
|
|
|
return dev_mc_del(rxe->ndev, ll_addr);
|
|
}
|
|
|
|
/**
|
|
* __rxe_insert_mcg - insert an mcg into red-black tree (rxe->mcg_tree)
|
|
* @mcg: mcg object with an embedded red-black tree node
|
|
*
|
|
* Context: caller must hold a reference to mcg and rxe->mcg_lock and
|
|
* is responsible to avoid adding the same mcg twice to the tree.
|
|
*/
|
|
static void __rxe_insert_mcg(struct rxe_mcg *mcg)
|
|
{
|
|
struct rb_root *tree = &mcg->rxe->mcg_tree;
|
|
struct rb_node **link = &tree->rb_node;
|
|
struct rb_node *node = NULL;
|
|
struct rxe_mcg *tmp;
|
|
int cmp;
|
|
|
|
while (*link) {
|
|
node = *link;
|
|
tmp = rb_entry(node, struct rxe_mcg, node);
|
|
|
|
cmp = memcmp(&tmp->mgid, &mcg->mgid, sizeof(mcg->mgid));
|
|
if (cmp > 0)
|
|
link = &(*link)->rb_left;
|
|
else
|
|
link = &(*link)->rb_right;
|
|
}
|
|
|
|
rb_link_node(&mcg->node, node, link);
|
|
rb_insert_color(&mcg->node, tree);
|
|
}
|
|
|
|
/**
|
|
* __rxe_remove_mcg - remove an mcg from red-black tree holding lock
|
|
* @mcg: mcast group object with an embedded red-black tree node
|
|
*
|
|
* Context: caller must hold a reference to mcg and rxe->mcg_lock
|
|
*/
|
|
static void __rxe_remove_mcg(struct rxe_mcg *mcg)
|
|
{
|
|
rb_erase(&mcg->node, &mcg->rxe->mcg_tree);
|
|
}
|
|
|
|
/**
|
|
* __rxe_lookup_mcg - lookup mcg in rxe->mcg_tree while holding lock
|
|
* @rxe: rxe device object
|
|
* @mgid: multicast IP address
|
|
*
|
|
* Context: caller must hold rxe->mcg_lock
|
|
* Returns: mcg on success and takes a ref to mcg else NULL
|
|
*/
|
|
static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe,
|
|
union ib_gid *mgid)
|
|
{
|
|
struct rb_root *tree = &rxe->mcg_tree;
|
|
struct rxe_mcg *mcg;
|
|
struct rb_node *node;
|
|
int cmp;
|
|
|
|
node = tree->rb_node;
|
|
|
|
while (node) {
|
|
mcg = rb_entry(node, struct rxe_mcg, node);
|
|
|
|
cmp = memcmp(&mcg->mgid, mgid, sizeof(*mgid));
|
|
|
|
if (cmp > 0)
|
|
node = node->rb_left;
|
|
else if (cmp < 0)
|
|
node = node->rb_right;
|
|
else
|
|
break;
|
|
}
|
|
|
|
if (node) {
|
|
kref_get(&mcg->ref_cnt);
|
|
return mcg;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* rxe_lookup_mcg - lookup up mcg in red-back tree
|
|
* @rxe: rxe device object
|
|
* @mgid: multicast IP address
|
|
*
|
|
* Returns: mcg if found else NULL
|
|
*/
|
|
struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
|
|
{
|
|
struct rxe_mcg *mcg;
|
|
|
|
spin_lock_bh(&rxe->mcg_lock);
|
|
mcg = __rxe_lookup_mcg(rxe, mgid);
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
|
|
return mcg;
|
|
}
|
|
|
|
/**
|
|
* __rxe_init_mcg - initialize a new mcg
|
|
* @rxe: rxe device
|
|
* @mgid: multicast address as a gid
|
|
* @mcg: new mcg object
|
|
*
|
|
* Context: caller should hold rxe->mcg lock
|
|
*/
|
|
static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
|
|
struct rxe_mcg *mcg)
|
|
{
|
|
kref_init(&mcg->ref_cnt);
|
|
memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid));
|
|
INIT_LIST_HEAD(&mcg->qp_list);
|
|
mcg->rxe = rxe;
|
|
|
|
/* caller holds a ref on mcg but that will be
|
|
* dropped when mcg goes out of scope. We need to take a ref
|
|
* on the pointer that will be saved in the red-black tree
|
|
* by __rxe_insert_mcg and used to lookup mcg from mgid later.
|
|
* Inserting mcg makes it visible to outside so this should
|
|
* be done last after the object is ready.
|
|
*/
|
|
kref_get(&mcg->ref_cnt);
|
|
__rxe_insert_mcg(mcg);
|
|
}
|
|
|
|
/**
|
|
* rxe_get_mcg - lookup or allocate a mcg
|
|
* @rxe: rxe device object
|
|
* @mgid: multicast IP address as a gid
|
|
*
|
|
* Returns: mcg on success else ERR_PTR(error)
|
|
*/
|
|
static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
|
|
{
|
|
struct rxe_mcg *mcg, *tmp;
|
|
int err;
|
|
|
|
if (rxe->attr.max_mcast_grp == 0)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
/* check to see if mcg already exists */
|
|
mcg = rxe_lookup_mcg(rxe, mgid);
|
|
if (mcg)
|
|
return mcg;
|
|
|
|
/* check to see if we have reached limit */
|
|
if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) {
|
|
err = -ENOMEM;
|
|
goto err_dec;
|
|
}
|
|
|
|
/* speculative alloc of new mcg */
|
|
mcg = kzalloc(sizeof(*mcg), GFP_KERNEL);
|
|
if (!mcg) {
|
|
err = -ENOMEM;
|
|
goto err_dec;
|
|
}
|
|
|
|
spin_lock_bh(&rxe->mcg_lock);
|
|
/* re-check to see if someone else just added it */
|
|
tmp = __rxe_lookup_mcg(rxe, mgid);
|
|
if (tmp) {
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
atomic_dec(&rxe->mcg_num);
|
|
kfree(mcg);
|
|
return tmp;
|
|
}
|
|
|
|
__rxe_init_mcg(rxe, mgid, mcg);
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
|
|
/* add mcast address outside of lock */
|
|
err = rxe_mcast_add(rxe, mgid);
|
|
if (!err)
|
|
return mcg;
|
|
|
|
kfree(mcg);
|
|
err_dec:
|
|
atomic_dec(&rxe->mcg_num);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
/**
|
|
* rxe_cleanup_mcg - cleanup mcg for kref_put
|
|
* @kref: struct kref embnedded in mcg
|
|
*/
|
|
void rxe_cleanup_mcg(struct kref *kref)
|
|
{
|
|
struct rxe_mcg *mcg = container_of(kref, typeof(*mcg), ref_cnt);
|
|
|
|
kfree(mcg);
|
|
}
|
|
|
|
/**
|
|
* __rxe_destroy_mcg - destroy mcg object holding rxe->mcg_lock
|
|
* @mcg: the mcg object
|
|
*
|
|
* Context: caller is holding rxe->mcg_lock
|
|
* no qp's are attached to mcg
|
|
*/
|
|
static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
|
|
{
|
|
struct rxe_dev *rxe = mcg->rxe;
|
|
|
|
/* remove mcg from red-black tree then drop ref */
|
|
__rxe_remove_mcg(mcg);
|
|
kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
|
|
|
|
atomic_dec(&rxe->mcg_num);
|
|
}
|
|
|
|
/**
|
|
* rxe_destroy_mcg - destroy mcg object
|
|
* @mcg: the mcg object
|
|
*
|
|
* Context: no qp's are attached to mcg
|
|
*/
|
|
static void rxe_destroy_mcg(struct rxe_mcg *mcg)
|
|
{
|
|
/* delete mcast address outside of lock */
|
|
rxe_mcast_del(mcg->rxe, &mcg->mgid);
|
|
|
|
spin_lock_bh(&mcg->rxe->mcg_lock);
|
|
__rxe_destroy_mcg(mcg);
|
|
spin_unlock_bh(&mcg->rxe->mcg_lock);
|
|
}
|
|
|
|
/**
|
|
* __rxe_init_mca - initialize a new mca holding lock
|
|
* @qp: qp object
|
|
* @mcg: mcg object
|
|
* @mca: empty space for new mca
|
|
*
|
|
* Context: caller must hold references on qp and mcg, rxe->mcg_lock
|
|
* and pass memory for new mca
|
|
*
|
|
* Returns: 0 on success else an error
|
|
*/
|
|
static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg,
|
|
struct rxe_mca *mca)
|
|
{
|
|
struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
|
|
int n;
|
|
|
|
n = atomic_inc_return(&rxe->mcg_attach);
|
|
if (n > rxe->attr.max_total_mcast_qp_attach) {
|
|
atomic_dec(&rxe->mcg_attach);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
n = atomic_inc_return(&mcg->qp_num);
|
|
if (n > rxe->attr.max_mcast_qp_attach) {
|
|
atomic_dec(&mcg->qp_num);
|
|
atomic_dec(&rxe->mcg_attach);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
atomic_inc(&qp->mcg_num);
|
|
|
|
rxe_get(qp);
|
|
mca->qp = qp;
|
|
|
|
list_add_tail(&mca->qp_list, &mcg->qp_list);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* rxe_attach_mcg - attach qp to mcg if not already attached
|
|
* @qp: qp object
|
|
* @mcg: mcg object
|
|
*
|
|
* Context: caller must hold reference on qp and mcg.
|
|
* Returns: 0 on success else an error
|
|
*/
|
|
static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
|
|
{
|
|
struct rxe_dev *rxe = mcg->rxe;
|
|
struct rxe_mca *mca, *tmp;
|
|
int err;
|
|
|
|
/* check to see if the qp is already a member of the group */
|
|
spin_lock_bh(&rxe->mcg_lock);
|
|
list_for_each_entry(mca, &mcg->qp_list, qp_list) {
|
|
if (mca->qp == qp) {
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
return 0;
|
|
}
|
|
}
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
|
|
/* speculative alloc new mca without using GFP_ATOMIC */
|
|
mca = kzalloc(sizeof(*mca), GFP_KERNEL);
|
|
if (!mca)
|
|
return -ENOMEM;
|
|
|
|
spin_lock_bh(&rxe->mcg_lock);
|
|
/* re-check to see if someone else just attached qp */
|
|
list_for_each_entry(tmp, &mcg->qp_list, qp_list) {
|
|
if (tmp->qp == qp) {
|
|
kfree(mca);
|
|
err = 0;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
err = __rxe_init_mca(qp, mcg, mca);
|
|
if (err)
|
|
kfree(mca);
|
|
out:
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
return err;
|
|
}
|
|
|
|
/**
|
|
* __rxe_cleanup_mca - cleanup mca object holding lock
|
|
* @mca: mca object
|
|
* @mcg: mcg object
|
|
*
|
|
* Context: caller must hold a reference to mcg and rxe->mcg_lock
|
|
*/
|
|
static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg)
|
|
{
|
|
list_del(&mca->qp_list);
|
|
|
|
atomic_dec(&mcg->qp_num);
|
|
atomic_dec(&mcg->rxe->mcg_attach);
|
|
atomic_dec(&mca->qp->mcg_num);
|
|
rxe_put(mca->qp);
|
|
|
|
kfree(mca);
|
|
}
|
|
|
|
/**
|
|
* rxe_detach_mcg - detach qp from mcg
|
|
* @mcg: mcg object
|
|
* @qp: qp object
|
|
*
|
|
* Returns: 0 on success else an error if qp is not attached.
|
|
*/
|
|
static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
|
|
{
|
|
struct rxe_dev *rxe = mcg->rxe;
|
|
struct rxe_mca *mca, *tmp;
|
|
|
|
spin_lock_bh(&rxe->mcg_lock);
|
|
list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) {
|
|
if (mca->qp == qp) {
|
|
__rxe_cleanup_mca(mca, mcg);
|
|
|
|
/* if the number of qp's attached to the
|
|
* mcast group falls to zero go ahead and
|
|
* tear it down. This will not free the
|
|
* object since we are still holding a ref
|
|
* from the caller
|
|
*/
|
|
if (atomic_read(&mcg->qp_num) <= 0)
|
|
__rxe_destroy_mcg(mcg);
|
|
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* we didn't find the qp on the list */
|
|
spin_unlock_bh(&rxe->mcg_lock);
|
|
return -EINVAL;
|
|
}
|
|
|
|
/**
|
|
* rxe_attach_mcast - attach qp to multicast group (see IBA-11.3.1)
|
|
* @ibqp: (IB) qp object
|
|
* @mgid: multicast IP address
|
|
* @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6)
|
|
*
|
|
* Returns: 0 on success else an errno
|
|
*/
|
|
int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
|
|
{
|
|
int err;
|
|
struct rxe_dev *rxe = to_rdev(ibqp->device);
|
|
struct rxe_qp *qp = to_rqp(ibqp);
|
|
struct rxe_mcg *mcg;
|
|
|
|
/* takes a ref on mcg if successful */
|
|
mcg = rxe_get_mcg(rxe, mgid);
|
|
if (IS_ERR(mcg))
|
|
return PTR_ERR(mcg);
|
|
|
|
err = rxe_attach_mcg(mcg, qp);
|
|
|
|
/* if we failed to attach the first qp to mcg tear it down */
|
|
if (atomic_read(&mcg->qp_num) == 0)
|
|
rxe_destroy_mcg(mcg);
|
|
|
|
kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
|
|
|
|
return err;
|
|
}
|
|
|
|
/**
|
|
* rxe_detach_mcast - detach qp from multicast group (see IBA-11.3.2)
|
|
* @ibqp: address of (IB) qp object
|
|
* @mgid: multicast IP address
|
|
* @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6)
|
|
*
|
|
* Returns: 0 on success else an errno
|
|
*/
|
|
int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
|
|
{
|
|
struct rxe_dev *rxe = to_rdev(ibqp->device);
|
|
struct rxe_qp *qp = to_rqp(ibqp);
|
|
struct rxe_mcg *mcg;
|
|
int err;
|
|
|
|
mcg = rxe_lookup_mcg(rxe, mgid);
|
|
if (!mcg)
|
|
return -EINVAL;
|
|
|
|
err = rxe_detach_mcg(mcg, qp);
|
|
kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
|
|
|
|
return err;
|
|
}
|