d2ed0f206a
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
651 lines
15 KiB
C
651 lines
15 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
|
|
|
|
#include <linux/generic-radix-tree.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/srcu.h>
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include "rcu_pending.h"
|
|
#include "darray.h"
|
|
#include "util.h"
|
|
|
|
#define static_array_for_each(_a, _i) \
|
|
for (typeof(&(_a)[0]) _i = _a; \
|
|
_i < (_a) + ARRAY_SIZE(_a); \
|
|
_i++)
|
|
|
|
enum rcu_pending_special {
|
|
RCU_PENDING_KVFREE = 1,
|
|
RCU_PENDING_CALL_RCU = 2,
|
|
};
|
|
|
|
#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
|
|
#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
|
|
|
|
static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
|
|
{
|
|
return ssp
|
|
? get_state_synchronize_srcu(ssp)
|
|
: get_state_synchronize_rcu();
|
|
}
|
|
|
|
static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
|
|
{
|
|
return ssp
|
|
? start_poll_synchronize_srcu(ssp)
|
|
: start_poll_synchronize_rcu();
|
|
}
|
|
|
|
static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
|
|
{
|
|
return ssp
|
|
? poll_state_synchronize_srcu(ssp, cookie)
|
|
: poll_state_synchronize_rcu(cookie);
|
|
}
|
|
|
|
static inline void __rcu_barrier(struct srcu_struct *ssp)
|
|
{
|
|
return ssp
|
|
? srcu_barrier(ssp)
|
|
: rcu_barrier();
|
|
}
|
|
|
|
static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
|
rcu_callback_t func)
|
|
{
|
|
if (ssp)
|
|
call_srcu(ssp, rhp, func);
|
|
else
|
|
call_rcu(rhp, func);
|
|
}
|
|
|
|
struct rcu_pending_seq {
|
|
/*
|
|
* We're using a radix tree like a vector - we're just pushing elements
|
|
* onto the end; we're using a radix tree instead of an actual vector to
|
|
* avoid reallocation overhead
|
|
*/
|
|
GENRADIX(struct rcu_head *) objs;
|
|
size_t nr;
|
|
struct rcu_head **cursor;
|
|
unsigned long seq;
|
|
};
|
|
|
|
struct rcu_pending_list {
|
|
struct rcu_head *head;
|
|
struct rcu_head *tail;
|
|
unsigned long seq;
|
|
};
|
|
|
|
struct rcu_pending_pcpu {
|
|
struct rcu_pending *parent;
|
|
spinlock_t lock;
|
|
int cpu;
|
|
|
|
/*
|
|
* We can't bound the number of unprocessed gp sequence numbers, and we
|
|
* can't efficiently merge radix trees for expired grace periods, so we
|
|
* need darray/vector:
|
|
*/
|
|
DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
|
|
|
|
/* Third entry is for expired objects: */
|
|
struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
|
|
|
|
struct rcu_head cb;
|
|
bool cb_armed;
|
|
struct work_struct work;
|
|
};
|
|
|
|
static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
|
|
{
|
|
if (p->objs.nr)
|
|
return true;
|
|
|
|
static_array_for_each(p->lists, i)
|
|
if (i->head)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static void rcu_pending_list_merge(struct rcu_pending_list *l1,
|
|
struct rcu_pending_list *l2)
|
|
{
|
|
#ifdef __KERNEL__
|
|
if (!l1->head)
|
|
l1->head = l2->head;
|
|
else
|
|
l1->tail->next = l2->head;
|
|
#else
|
|
if (!l1->head)
|
|
l1->head = l2->head;
|
|
else
|
|
l1->tail->next.next = (void *) l2->head;
|
|
#endif
|
|
|
|
l1->tail = l2->tail;
|
|
l2->head = l2->tail = NULL;
|
|
}
|
|
|
|
static void rcu_pending_list_add(struct rcu_pending_list *l,
|
|
struct rcu_head *n)
|
|
{
|
|
#ifdef __KERNEL__
|
|
if (!l->head)
|
|
l->head = n;
|
|
else
|
|
l->tail->next = n;
|
|
l->tail = n;
|
|
n->next = NULL;
|
|
#else
|
|
if (!l->head)
|
|
l->head = n;
|
|
else
|
|
l->tail->next.next = (void *) n;
|
|
l->tail = n;
|
|
n->next.next = NULL;
|
|
#endif
|
|
}
|
|
|
|
static void merge_expired_lists(struct rcu_pending_pcpu *p)
|
|
{
|
|
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
|
|
|
|
for (struct rcu_pending_list *i = p->lists; i < expired; i++)
|
|
if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
|
|
rcu_pending_list_merge(expired, i);
|
|
}
|
|
|
|
#ifndef __KERNEL__
|
|
static inline void kfree_bulk(size_t nr, void ** p)
|
|
{
|
|
while (nr--)
|
|
kfree(*p);
|
|
}
|
|
|
|
#define local_irq_save(flags) \
|
|
do { \
|
|
flags = 0; \
|
|
} while (0)
|
|
#endif
|
|
|
|
static noinline void __process_finished_items(struct rcu_pending *pending,
|
|
struct rcu_pending_pcpu *p,
|
|
unsigned long flags)
|
|
{
|
|
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
|
|
struct rcu_pending_seq objs = {};
|
|
struct rcu_head *list = NULL;
|
|
|
|
if (p->objs.nr &&
|
|
__poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
|
|
objs = p->objs.data[0];
|
|
darray_remove_item(&p->objs, p->objs.data);
|
|
}
|
|
|
|
merge_expired_lists(p);
|
|
|
|
list = expired->head;
|
|
expired->head = expired->tail = NULL;
|
|
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
|
|
switch ((ulong) pending->process) {
|
|
case RCU_PENDING_KVFREE:
|
|
for (size_t i = 0; i < objs.nr; ) {
|
|
size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
|
|
|
|
kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
|
|
i += nr_this_node;
|
|
}
|
|
genradix_free(&objs.objs);
|
|
|
|
while (list) {
|
|
struct rcu_head *obj = list;
|
|
#ifdef __KERNEL__
|
|
list = obj->next;
|
|
#else
|
|
list = (void *) obj->next.next;
|
|
#endif
|
|
|
|
/*
|
|
* low bit of pointer indicates whether rcu_head needs
|
|
* to be freed - kvfree_rcu_mightsleep()
|
|
*/
|
|
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
|
|
|
|
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
|
|
bool free_head = ((unsigned long) obj->func) & 1UL;
|
|
|
|
kvfree(ptr);
|
|
if (free_head)
|
|
kfree(obj);
|
|
}
|
|
|
|
break;
|
|
|
|
case RCU_PENDING_CALL_RCU:
|
|
for (size_t i = 0; i < objs.nr; i++) {
|
|
struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
|
|
obj->func(obj);
|
|
}
|
|
genradix_free(&objs.objs);
|
|
|
|
while (list) {
|
|
struct rcu_head *obj = list;
|
|
#ifdef __KERNEL__
|
|
list = obj->next;
|
|
#else
|
|
list = (void *) obj->next.next;
|
|
#endif
|
|
obj->func(obj);
|
|
}
|
|
break;
|
|
|
|
default:
|
|
for (size_t i = 0; i < objs.nr; i++)
|
|
pending->process(pending, *genradix_ptr(&objs.objs, i));
|
|
genradix_free(&objs.objs);
|
|
|
|
while (list) {
|
|
struct rcu_head *obj = list;
|
|
#ifdef __KERNEL__
|
|
list = obj->next;
|
|
#else
|
|
list = (void *) obj->next.next;
|
|
#endif
|
|
pending->process(pending, obj);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
static bool process_finished_items(struct rcu_pending *pending,
|
|
struct rcu_pending_pcpu *p,
|
|
unsigned long flags)
|
|
{
|
|
/*
|
|
* XXX: we should grab the gp seq once and avoid multiple function
|
|
* calls, this is called from __rcu_pending_enqueue() fastpath in
|
|
* may_sleep==true mode
|
|
*/
|
|
if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
|
|
(p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
|
|
(p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
|
|
p->lists[2].head) {
|
|
__process_finished_items(pending, p, flags);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static void rcu_pending_work(struct work_struct *work)
|
|
{
|
|
struct rcu_pending_pcpu *p =
|
|
container_of(work, struct rcu_pending_pcpu, work);
|
|
struct rcu_pending *pending = p->parent;
|
|
unsigned long flags;
|
|
|
|
do {
|
|
spin_lock_irqsave(&p->lock, flags);
|
|
} while (process_finished_items(pending, p, flags));
|
|
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
}
|
|
|
|
static void rcu_pending_rcu_cb(struct rcu_head *rcu)
|
|
{
|
|
struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
|
|
|
|
schedule_work_on(p->cpu, &p->work);
|
|
|
|
unsigned long flags;
|
|
spin_lock_irqsave(&p->lock, flags);
|
|
if (__rcu_pending_has_pending(p)) {
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
|
|
} else {
|
|
p->cb_armed = false;
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
}
|
|
}
|
|
|
|
static __always_inline struct rcu_pending_seq *
|
|
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
|
|
{
|
|
darray_for_each_reverse(p->objs, objs)
|
|
if (objs->seq == seq)
|
|
return objs;
|
|
|
|
if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
|
|
return NULL;
|
|
|
|
return &darray_last(p->objs);
|
|
}
|
|
|
|
static noinline bool
|
|
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
|
|
struct rcu_head *head, void *ptr,
|
|
unsigned long *flags)
|
|
{
|
|
if (ptr) {
|
|
if (!head) {
|
|
/*
|
|
* kvfree_rcu_mightsleep(): we weren't passed an
|
|
* rcu_head, but we need one: use the low bit of the
|
|
* ponter to free to flag that the head needs to be
|
|
* freed as well:
|
|
*/
|
|
ptr = (void *)(((unsigned long) ptr)|1UL);
|
|
head = kmalloc(sizeof(*head), __GFP_NOWARN);
|
|
if (!head) {
|
|
spin_unlock_irqrestore(&p->lock, *flags);
|
|
head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
|
|
/*
|
|
* dropped lock, did GFP_KERNEL allocation,
|
|
* check for gp expiration
|
|
*/
|
|
if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
|
|
kvfree(--ptr);
|
|
kfree(head);
|
|
spin_lock_irqsave(&p->lock, *flags);
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
head->func = ptr;
|
|
}
|
|
again:
|
|
for (struct rcu_pending_list *i = p->lists;
|
|
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
|
|
if (i->seq == seq) {
|
|
rcu_pending_list_add(i, head);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
for (struct rcu_pending_list *i = p->lists;
|
|
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
|
|
if (!i->head) {
|
|
i->seq = seq;
|
|
rcu_pending_list_add(i, head);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
merge_expired_lists(p);
|
|
goto again;
|
|
}
|
|
|
|
/*
|
|
* __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
|
|
* pending->pracess) once grace period elapses.
|
|
*
|
|
* Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
|
|
* back to a linked list.
|
|
*
|
|
* - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
|
|
* process callback
|
|
*
|
|
* - If @ptr and @head are both not NULL, we're kvfree_rcu()
|
|
*
|
|
* - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
|
|
*
|
|
* - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
|
|
* expired items.
|
|
*/
|
|
static __always_inline void
|
|
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
|
|
void *ptr, bool may_sleep)
|
|
{
|
|
|
|
struct rcu_pending_pcpu *p;
|
|
struct rcu_pending_seq *objs;
|
|
struct genradix_node *new_node = NULL;
|
|
unsigned long seq, flags;
|
|
bool start_gp = false;
|
|
|
|
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
|
|
|
|
local_irq_save(flags);
|
|
p = this_cpu_ptr(pending->p);
|
|
spin_lock(&p->lock);
|
|
seq = __get_state_synchronize_rcu(pending->srcu);
|
|
restart:
|
|
if (may_sleep &&
|
|
unlikely(process_finished_items(pending, p, flags)))
|
|
goto check_expired;
|
|
|
|
/*
|
|
* In kvfree_rcu() mode, the radix tree is only for slab pointers so
|
|
* that we can do kfree_bulk() - vmalloc pointers always use the linked
|
|
* list:
|
|
*/
|
|
if (ptr && unlikely(is_vmalloc_addr(ptr)))
|
|
goto list_add;
|
|
|
|
objs = get_object_radix(p, seq);
|
|
if (unlikely(!objs))
|
|
goto list_add;
|
|
|
|
if (unlikely(!objs->cursor)) {
|
|
/*
|
|
* New radix tree nodes must be added under @p->lock because the
|
|
* tree root is in a darray that can be resized (typically,
|
|
* genradix supports concurrent unlocked allocation of new
|
|
* nodes) - hence preallocation and the retry loop:
|
|
*/
|
|
objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
|
|
objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
|
|
if (unlikely(!objs->cursor)) {
|
|
if (may_sleep) {
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
|
|
gfp_t gfp = GFP_KERNEL;
|
|
if (!head)
|
|
gfp |= __GFP_NOFAIL;
|
|
|
|
new_node = genradix_alloc_node(gfp);
|
|
if (!new_node)
|
|
may_sleep = false;
|
|
goto check_expired;
|
|
}
|
|
list_add:
|
|
start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
|
|
goto start_gp;
|
|
}
|
|
}
|
|
|
|
*objs->cursor++ = ptr ?: head;
|
|
/* zero cursor if we hit the end of a radix tree node: */
|
|
if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
|
|
objs->cursor = NULL;
|
|
start_gp = !objs->nr;
|
|
objs->nr++;
|
|
start_gp:
|
|
if (unlikely(start_gp)) {
|
|
/*
|
|
* We only have one callback (ideally, we would have one for
|
|
* every outstanding graceperiod) - so if our callback is
|
|
* already in flight, we may still have to start a grace period
|
|
* (since we used get_state() above, not start_poll())
|
|
*/
|
|
if (!p->cb_armed) {
|
|
p->cb_armed = true;
|
|
__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
|
|
} else {
|
|
__start_poll_synchronize_rcu(pending->srcu);
|
|
}
|
|
}
|
|
spin_unlock_irqrestore(&p->lock, flags);
|
|
free_node:
|
|
if (new_node)
|
|
genradix_free_node(new_node);
|
|
return;
|
|
check_expired:
|
|
if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
|
|
switch ((ulong) pending->process) {
|
|
case RCU_PENDING_KVFREE:
|
|
kvfree(ptr);
|
|
break;
|
|
case RCU_PENDING_CALL_RCU:
|
|
head->func(head);
|
|
break;
|
|
default:
|
|
pending->process(pending, head);
|
|
break;
|
|
}
|
|
goto free_node;
|
|
}
|
|
|
|
local_irq_save(flags);
|
|
p = this_cpu_ptr(pending->p);
|
|
spin_lock(&p->lock);
|
|
goto restart;
|
|
}
|
|
|
|
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
|
|
{
|
|
__rcu_pending_enqueue(pending, obj, NULL, true);
|
|
}
|
|
|
|
static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
|
|
{
|
|
struct rcu_head *ret = NULL;
|
|
|
|
spin_lock_irq(&p->lock);
|
|
darray_for_each(p->objs, objs)
|
|
if (objs->nr) {
|
|
ret = *genradix_ptr(&objs->objs, --objs->nr);
|
|
objs->cursor = NULL;
|
|
if (!objs->nr)
|
|
genradix_free(&objs->objs);
|
|
goto out;
|
|
}
|
|
|
|
static_array_for_each(p->lists, i)
|
|
if (i->head) {
|
|
ret = i->head;
|
|
#ifdef __KERNEL__
|
|
i->head = ret->next;
|
|
#else
|
|
i->head = (void *) ret->next.next;
|
|
#endif
|
|
if (!i->head)
|
|
i->tail = NULL;
|
|
goto out;
|
|
}
|
|
out:
|
|
spin_unlock_irq(&p->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
|
|
{
|
|
return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
|
|
}
|
|
|
|
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
|
|
{
|
|
struct rcu_head *ret = rcu_pending_dequeue(pending);
|
|
|
|
if (ret)
|
|
return ret;
|
|
|
|
int cpu;
|
|
for_each_possible_cpu(cpu) {
|
|
ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
|
|
if (ret)
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
|
|
{
|
|
int cpu;
|
|
for_each_possible_cpu(cpu) {
|
|
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
|
spin_lock_irq(&p->lock);
|
|
if (__rcu_pending_has_pending(p) || p->cb_armed) {
|
|
spin_unlock_irq(&p->lock);
|
|
return true;
|
|
}
|
|
spin_unlock_irq(&p->lock);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void rcu_pending_exit(struct rcu_pending *pending)
|
|
{
|
|
int cpu;
|
|
|
|
if (!pending->p)
|
|
return;
|
|
|
|
while (rcu_pending_has_pending_or_armed(pending)) {
|
|
__rcu_barrier(pending->srcu);
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
|
flush_work(&p->work);
|
|
}
|
|
}
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
|
flush_work(&p->work);
|
|
}
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
|
|
|
static_array_for_each(p->lists, i)
|
|
WARN_ON(i->head);
|
|
WARN_ON(p->objs.nr);
|
|
darray_exit(&p->objs);
|
|
}
|
|
free_percpu(pending->p);
|
|
}
|
|
|
|
/**
|
|
* rcu_pending_init: - initialize a rcu_pending
|
|
*
|
|
* @pending: Object to init
|
|
* @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
|
|
* RCU flavor
|
|
* @process: Callback function invoked on objects once their RCU barriers
|
|
* have completed; if NULL, kvfree() is used.
|
|
*/
|
|
int rcu_pending_init(struct rcu_pending *pending,
|
|
struct srcu_struct *srcu,
|
|
rcu_pending_process_fn process)
|
|
{
|
|
pending->p = alloc_percpu(struct rcu_pending_pcpu);
|
|
if (!pending->p)
|
|
return -ENOMEM;
|
|
|
|
int cpu;
|
|
for_each_possible_cpu(cpu) {
|
|
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
|
|
p->parent = pending;
|
|
p->cpu = cpu;
|
|
spin_lock_init(&p->lock);
|
|
darray_init(&p->objs);
|
|
INIT_WORK(&p->work, rcu_pending_work);
|
|
}
|
|
|
|
pending->srcu = srcu;
|
|
pending->process = process;
|
|
|
|
return 0;
|
|
}
|