4f878e8475
This changes the btrfs worker threads to batch work items into a local list. It allows us to pull work items in large chunks and significantly reduces the number of times we need to take the worker thread spinlock. Signed-off-by: Chris Mason <chris.mason@oracle.com>
640 lines
16 KiB
C
640 lines
16 KiB
C
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 021110-1307, USA.
|
|
*/
|
|
|
|
#include <linux/kthread.h>
|
|
#include <linux/list.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/freezer.h>
|
|
#include "async-thread.h"
|
|
|
|
#define WORK_QUEUED_BIT 0
|
|
#define WORK_DONE_BIT 1
|
|
#define WORK_ORDER_DONE_BIT 2
|
|
#define WORK_HIGH_PRIO_BIT 3
|
|
|
|
/*
|
|
* container for the kthread task pointer and the list of pending work
|
|
* One of these is allocated per thread.
|
|
*/
|
|
struct btrfs_worker_thread {
|
|
/* pool we belong to */
|
|
struct btrfs_workers *workers;
|
|
|
|
/* list of struct btrfs_work that are waiting for service */
|
|
struct list_head pending;
|
|
struct list_head prio_pending;
|
|
|
|
/* list of worker threads from struct btrfs_workers */
|
|
struct list_head worker_list;
|
|
|
|
/* kthread */
|
|
struct task_struct *task;
|
|
|
|
/* number of things on the pending list */
|
|
atomic_t num_pending;
|
|
|
|
/* reference counter for this struct */
|
|
atomic_t refs;
|
|
|
|
unsigned long sequence;
|
|
|
|
/* protects the pending list. */
|
|
spinlock_t lock;
|
|
|
|
/* set to non-zero when this thread is already awake and kicking */
|
|
int working;
|
|
|
|
/* are we currently idle */
|
|
int idle;
|
|
};
|
|
|
|
/*
|
|
* helper function to move a thread onto the idle list after it
|
|
* has finished some requests.
|
|
*/
|
|
static void check_idle_worker(struct btrfs_worker_thread *worker)
|
|
{
|
|
if (!worker->idle && atomic_read(&worker->num_pending) <
|
|
worker->workers->idle_thresh / 2) {
|
|
unsigned long flags;
|
|
spin_lock_irqsave(&worker->workers->lock, flags);
|
|
worker->idle = 1;
|
|
list_move(&worker->worker_list, &worker->workers->idle_list);
|
|
spin_unlock_irqrestore(&worker->workers->lock, flags);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* helper function to move a thread off the idle list after new
|
|
* pending work is added.
|
|
*/
|
|
static void check_busy_worker(struct btrfs_worker_thread *worker)
|
|
{
|
|
if (worker->idle && atomic_read(&worker->num_pending) >=
|
|
worker->workers->idle_thresh) {
|
|
unsigned long flags;
|
|
spin_lock_irqsave(&worker->workers->lock, flags);
|
|
worker->idle = 0;
|
|
list_move_tail(&worker->worker_list,
|
|
&worker->workers->worker_list);
|
|
spin_unlock_irqrestore(&worker->workers->lock, flags);
|
|
}
|
|
}
|
|
|
|
static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
|
|
{
|
|
struct btrfs_workers *workers = worker->workers;
|
|
unsigned long flags;
|
|
|
|
rmb();
|
|
if (!workers->atomic_start_pending)
|
|
return;
|
|
|
|
spin_lock_irqsave(&workers->lock, flags);
|
|
if (!workers->atomic_start_pending)
|
|
goto out;
|
|
|
|
workers->atomic_start_pending = 0;
|
|
if (workers->num_workers >= workers->max_workers)
|
|
goto out;
|
|
|
|
spin_unlock_irqrestore(&workers->lock, flags);
|
|
btrfs_start_workers(workers, 1);
|
|
return;
|
|
|
|
out:
|
|
spin_unlock_irqrestore(&workers->lock, flags);
|
|
}
|
|
|
|
static noinline int run_ordered_completions(struct btrfs_workers *workers,
|
|
struct btrfs_work *work)
|
|
{
|
|
if (!workers->ordered)
|
|
return 0;
|
|
|
|
set_bit(WORK_DONE_BIT, &work->flags);
|
|
|
|
spin_lock(&workers->order_lock);
|
|
|
|
while (1) {
|
|
if (!list_empty(&workers->prio_order_list)) {
|
|
work = list_entry(workers->prio_order_list.next,
|
|
struct btrfs_work, order_list);
|
|
} else if (!list_empty(&workers->order_list)) {
|
|
work = list_entry(workers->order_list.next,
|
|
struct btrfs_work, order_list);
|
|
} else {
|
|
break;
|
|
}
|
|
if (!test_bit(WORK_DONE_BIT, &work->flags))
|
|
break;
|
|
|
|
/* we are going to call the ordered done function, but
|
|
* we leave the work item on the list as a barrier so
|
|
* that later work items that are done don't have their
|
|
* functions called before this one returns
|
|
*/
|
|
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
|
|
break;
|
|
|
|
spin_unlock(&workers->order_lock);
|
|
|
|
work->ordered_func(work);
|
|
|
|
/* now take the lock again and call the freeing code */
|
|
spin_lock(&workers->order_lock);
|
|
list_del(&work->order_list);
|
|
work->ordered_free(work);
|
|
}
|
|
|
|
spin_unlock(&workers->order_lock);
|
|
return 0;
|
|
}
|
|
|
|
static void put_worker(struct btrfs_worker_thread *worker)
|
|
{
|
|
if (atomic_dec_and_test(&worker->refs))
|
|
kfree(worker);
|
|
}
|
|
|
|
static int try_worker_shutdown(struct btrfs_worker_thread *worker)
|
|
{
|
|
int freeit = 0;
|
|
|
|
spin_lock_irq(&worker->lock);
|
|
spin_lock_irq(&worker->workers->lock);
|
|
if (worker->workers->num_workers > 1 &&
|
|
worker->idle &&
|
|
!worker->working &&
|
|
!list_empty(&worker->worker_list) &&
|
|
list_empty(&worker->prio_pending) &&
|
|
list_empty(&worker->pending)) {
|
|
freeit = 1;
|
|
list_del_init(&worker->worker_list);
|
|
worker->workers->num_workers--;
|
|
}
|
|
spin_unlock_irq(&worker->workers->lock);
|
|
spin_unlock_irq(&worker->lock);
|
|
|
|
if (freeit)
|
|
put_worker(worker);
|
|
return freeit;
|
|
}
|
|
|
|
static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
|
|
struct list_head *prio_head,
|
|
struct list_head *head)
|
|
{
|
|
struct btrfs_work *work = NULL;
|
|
struct list_head *cur = NULL;
|
|
|
|
if(!list_empty(prio_head))
|
|
cur = prio_head->next;
|
|
|
|
smp_mb();
|
|
if (!list_empty(&worker->prio_pending))
|
|
goto refill;
|
|
|
|
if (!list_empty(head))
|
|
cur = head->next;
|
|
|
|
if (cur)
|
|
goto out;
|
|
|
|
refill:
|
|
spin_lock_irq(&worker->lock);
|
|
list_splice_tail_init(&worker->prio_pending, prio_head);
|
|
list_splice_tail_init(&worker->pending, head);
|
|
|
|
if (!list_empty(prio_head))
|
|
cur = prio_head->next;
|
|
else if (!list_empty(head))
|
|
cur = head->next;
|
|
spin_unlock_irq(&worker->lock);
|
|
|
|
if (!cur)
|
|
goto out_fail;
|
|
|
|
out:
|
|
work = list_entry(cur, struct btrfs_work, list);
|
|
|
|
out_fail:
|
|
return work;
|
|
}
|
|
|
|
/*
|
|
* main loop for servicing work items
|
|
*/
|
|
static int worker_loop(void *arg)
|
|
{
|
|
struct btrfs_worker_thread *worker = arg;
|
|
struct list_head head;
|
|
struct list_head prio_head;
|
|
struct btrfs_work *work;
|
|
|
|
INIT_LIST_HEAD(&head);
|
|
INIT_LIST_HEAD(&prio_head);
|
|
|
|
do {
|
|
again:
|
|
while (1) {
|
|
|
|
|
|
work = get_next_work(worker, &prio_head, &head);
|
|
if (!work)
|
|
break;
|
|
|
|
list_del(&work->list);
|
|
clear_bit(WORK_QUEUED_BIT, &work->flags);
|
|
|
|
work->worker = worker;
|
|
|
|
work->func(work);
|
|
|
|
atomic_dec(&worker->num_pending);
|
|
/*
|
|
* unless this is an ordered work queue,
|
|
* 'work' was probably freed by func above.
|
|
*/
|
|
run_ordered_completions(worker->workers, work);
|
|
|
|
check_pending_worker_creates(worker);
|
|
|
|
}
|
|
|
|
spin_lock_irq(&worker->lock);
|
|
check_idle_worker(worker);
|
|
|
|
if (freezing(current)) {
|
|
worker->working = 0;
|
|
spin_unlock_irq(&worker->lock);
|
|
refrigerator();
|
|
} else {
|
|
spin_unlock_irq(&worker->lock);
|
|
if (!kthread_should_stop()) {
|
|
cpu_relax();
|
|
/*
|
|
* we've dropped the lock, did someone else
|
|
* jump_in?
|
|
*/
|
|
smp_mb();
|
|
if (!list_empty(&worker->pending) ||
|
|
!list_empty(&worker->prio_pending))
|
|
continue;
|
|
|
|
/*
|
|
* this short schedule allows more work to
|
|
* come in without the queue functions
|
|
* needing to go through wake_up_process()
|
|
*
|
|
* worker->working is still 1, so nobody
|
|
* is going to try and wake us up
|
|
*/
|
|
schedule_timeout(1);
|
|
smp_mb();
|
|
if (!list_empty(&worker->pending) ||
|
|
!list_empty(&worker->prio_pending))
|
|
continue;
|
|
|
|
if (kthread_should_stop())
|
|
break;
|
|
|
|
/* still no more work?, sleep for real */
|
|
spin_lock_irq(&worker->lock);
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
if (!list_empty(&worker->pending) ||
|
|
!list_empty(&worker->prio_pending)) {
|
|
spin_unlock_irq(&worker->lock);
|
|
goto again;
|
|
}
|
|
|
|
/*
|
|
* this makes sure we get a wakeup when someone
|
|
* adds something new to the queue
|
|
*/
|
|
worker->working = 0;
|
|
spin_unlock_irq(&worker->lock);
|
|
|
|
if (!kthread_should_stop()) {
|
|
schedule_timeout(HZ * 120);
|
|
if (!worker->working &&
|
|
try_worker_shutdown(worker)) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
__set_current_state(TASK_RUNNING);
|
|
}
|
|
} while (!kthread_should_stop());
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* this will wait for all the worker threads to shutdown
|
|
*/
|
|
int btrfs_stop_workers(struct btrfs_workers *workers)
|
|
{
|
|
struct list_head *cur;
|
|
struct btrfs_worker_thread *worker;
|
|
int can_stop;
|
|
|
|
spin_lock_irq(&workers->lock);
|
|
list_splice_init(&workers->idle_list, &workers->worker_list);
|
|
while (!list_empty(&workers->worker_list)) {
|
|
cur = workers->worker_list.next;
|
|
worker = list_entry(cur, struct btrfs_worker_thread,
|
|
worker_list);
|
|
|
|
atomic_inc(&worker->refs);
|
|
workers->num_workers -= 1;
|
|
if (!list_empty(&worker->worker_list)) {
|
|
list_del_init(&worker->worker_list);
|
|
put_worker(worker);
|
|
can_stop = 1;
|
|
} else
|
|
can_stop = 0;
|
|
spin_unlock_irq(&workers->lock);
|
|
if (can_stop)
|
|
kthread_stop(worker->task);
|
|
spin_lock_irq(&workers->lock);
|
|
put_worker(worker);
|
|
}
|
|
spin_unlock_irq(&workers->lock);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* simple init on struct btrfs_workers
|
|
*/
|
|
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
|
|
{
|
|
workers->num_workers = 0;
|
|
INIT_LIST_HEAD(&workers->worker_list);
|
|
INIT_LIST_HEAD(&workers->idle_list);
|
|
INIT_LIST_HEAD(&workers->order_list);
|
|
INIT_LIST_HEAD(&workers->prio_order_list);
|
|
spin_lock_init(&workers->lock);
|
|
spin_lock_init(&workers->order_lock);
|
|
workers->max_workers = max;
|
|
workers->idle_thresh = 32;
|
|
workers->name = name;
|
|
workers->ordered = 0;
|
|
workers->atomic_start_pending = 0;
|
|
workers->atomic_worker_start = 0;
|
|
}
|
|
|
|
/*
|
|
* starts new worker threads. This does not enforce the max worker
|
|
* count in case you need to temporarily go past it.
|
|
*/
|
|
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
|
|
{
|
|
struct btrfs_worker_thread *worker;
|
|
int ret = 0;
|
|
int i;
|
|
|
|
for (i = 0; i < num_workers; i++) {
|
|
worker = kzalloc(sizeof(*worker), GFP_NOFS);
|
|
if (!worker) {
|
|
ret = -ENOMEM;
|
|
goto fail;
|
|
}
|
|
|
|
INIT_LIST_HEAD(&worker->pending);
|
|
INIT_LIST_HEAD(&worker->prio_pending);
|
|
INIT_LIST_HEAD(&worker->worker_list);
|
|
spin_lock_init(&worker->lock);
|
|
|
|
atomic_set(&worker->num_pending, 0);
|
|
atomic_set(&worker->refs, 1);
|
|
worker->workers = workers;
|
|
worker->task = kthread_run(worker_loop, worker,
|
|
"btrfs-%s-%d", workers->name,
|
|
workers->num_workers + i);
|
|
if (IS_ERR(worker->task)) {
|
|
ret = PTR_ERR(worker->task);
|
|
kfree(worker);
|
|
goto fail;
|
|
}
|
|
spin_lock_irq(&workers->lock);
|
|
list_add_tail(&worker->worker_list, &workers->idle_list);
|
|
worker->idle = 1;
|
|
workers->num_workers++;
|
|
spin_unlock_irq(&workers->lock);
|
|
}
|
|
return 0;
|
|
fail:
|
|
btrfs_stop_workers(workers);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* run through the list and find a worker thread that doesn't have a lot
|
|
* to do right now. This can return null if we aren't yet at the thread
|
|
* count limit and all of the threads are busy.
|
|
*/
|
|
static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
|
|
{
|
|
struct btrfs_worker_thread *worker;
|
|
struct list_head *next;
|
|
int enforce_min = workers->num_workers < workers->max_workers;
|
|
|
|
/*
|
|
* if we find an idle thread, don't move it to the end of the
|
|
* idle list. This improves the chance that the next submission
|
|
* will reuse the same thread, and maybe catch it while it is still
|
|
* working
|
|
*/
|
|
if (!list_empty(&workers->idle_list)) {
|
|
next = workers->idle_list.next;
|
|
worker = list_entry(next, struct btrfs_worker_thread,
|
|
worker_list);
|
|
return worker;
|
|
}
|
|
if (enforce_min || list_empty(&workers->worker_list))
|
|
return NULL;
|
|
|
|
/*
|
|
* if we pick a busy task, move the task to the end of the list.
|
|
* hopefully this will keep things somewhat evenly balanced.
|
|
* Do the move in batches based on the sequence number. This groups
|
|
* requests submitted at roughly the same time onto the same worker.
|
|
*/
|
|
next = workers->worker_list.next;
|
|
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
|
|
atomic_inc(&worker->num_pending);
|
|
worker->sequence++;
|
|
|
|
if (worker->sequence % workers->idle_thresh == 0)
|
|
list_move_tail(next, &workers->worker_list);
|
|
return worker;
|
|
}
|
|
|
|
/*
|
|
* selects a worker thread to take the next job. This will either find
|
|
* an idle worker, start a new worker up to the max count, or just return
|
|
* one of the existing busy workers.
|
|
*/
|
|
static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
|
|
{
|
|
struct btrfs_worker_thread *worker;
|
|
unsigned long flags;
|
|
struct list_head *fallback;
|
|
|
|
again:
|
|
spin_lock_irqsave(&workers->lock, flags);
|
|
worker = next_worker(workers);
|
|
|
|
if (!worker) {
|
|
if (workers->num_workers >= workers->max_workers) {
|
|
goto fallback;
|
|
} else if (workers->atomic_worker_start) {
|
|
workers->atomic_start_pending = 1;
|
|
goto fallback;
|
|
} else {
|
|
spin_unlock_irqrestore(&workers->lock, flags);
|
|
/* we're below the limit, start another worker */
|
|
btrfs_start_workers(workers, 1);
|
|
goto again;
|
|
}
|
|
}
|
|
spin_unlock_irqrestore(&workers->lock, flags);
|
|
return worker;
|
|
|
|
fallback:
|
|
fallback = NULL;
|
|
/*
|
|
* we have failed to find any workers, just
|
|
* return the first one we can find.
|
|
*/
|
|
if (!list_empty(&workers->worker_list))
|
|
fallback = workers->worker_list.next;
|
|
if (!list_empty(&workers->idle_list))
|
|
fallback = workers->idle_list.next;
|
|
BUG_ON(!fallback);
|
|
worker = list_entry(fallback,
|
|
struct btrfs_worker_thread, worker_list);
|
|
spin_unlock_irqrestore(&workers->lock, flags);
|
|
return worker;
|
|
}
|
|
|
|
/*
|
|
* btrfs_requeue_work just puts the work item back on the tail of the list
|
|
* it was taken from. It is intended for use with long running work functions
|
|
* that make some progress and want to give the cpu up for others.
|
|
*/
|
|
int btrfs_requeue_work(struct btrfs_work *work)
|
|
{
|
|
struct btrfs_worker_thread *worker = work->worker;
|
|
unsigned long flags;
|
|
int wake = 0;
|
|
|
|
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
|
|
goto out;
|
|
|
|
spin_lock_irqsave(&worker->lock, flags);
|
|
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
|
|
list_add_tail(&work->list, &worker->prio_pending);
|
|
else
|
|
list_add_tail(&work->list, &worker->pending);
|
|
atomic_inc(&worker->num_pending);
|
|
|
|
/* by definition we're busy, take ourselves off the idle
|
|
* list
|
|
*/
|
|
if (worker->idle) {
|
|
spin_lock(&worker->workers->lock);
|
|
worker->idle = 0;
|
|
list_move_tail(&worker->worker_list,
|
|
&worker->workers->worker_list);
|
|
spin_unlock(&worker->workers->lock);
|
|
}
|
|
if (!worker->working) {
|
|
wake = 1;
|
|
worker->working = 1;
|
|
}
|
|
|
|
if (wake)
|
|
wake_up_process(worker->task);
|
|
spin_unlock_irqrestore(&worker->lock, flags);
|
|
out:
|
|
|
|
return 0;
|
|
}
|
|
|
|
void btrfs_set_work_high_prio(struct btrfs_work *work)
|
|
{
|
|
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
|
|
}
|
|
|
|
/*
|
|
* places a struct btrfs_work into the pending queue of one of the kthreads
|
|
*/
|
|
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
|
|
{
|
|
struct btrfs_worker_thread *worker;
|
|
unsigned long flags;
|
|
int wake = 0;
|
|
|
|
/* don't requeue something already on a list */
|
|
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
|
|
goto out;
|
|
|
|
worker = find_worker(workers);
|
|
if (workers->ordered) {
|
|
/*
|
|
* you're not allowed to do ordered queues from an
|
|
* interrupt handler
|
|
*/
|
|
spin_lock(&workers->order_lock);
|
|
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
|
|
list_add_tail(&work->order_list,
|
|
&workers->prio_order_list);
|
|
} else {
|
|
list_add_tail(&work->order_list, &workers->order_list);
|
|
}
|
|
spin_unlock(&workers->order_lock);
|
|
} else {
|
|
INIT_LIST_HEAD(&work->order_list);
|
|
}
|
|
|
|
spin_lock_irqsave(&worker->lock, flags);
|
|
|
|
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
|
|
list_add_tail(&work->list, &worker->prio_pending);
|
|
else
|
|
list_add_tail(&work->list, &worker->pending);
|
|
atomic_inc(&worker->num_pending);
|
|
check_busy_worker(worker);
|
|
|
|
/*
|
|
* avoid calling into wake_up_process if this thread has already
|
|
* been kicked
|
|
*/
|
|
if (!worker->working)
|
|
wake = 1;
|
|
worker->working = 1;
|
|
|
|
if (wake)
|
|
wake_up_process(worker->task);
|
|
spin_unlock_irqrestore(&worker->lock, flags);
|
|
|
|
out:
|
|
return 0;
|
|
}
|