2022-08-19 12:35:34 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
|
|
|
#include "btree_locking.h"
|
|
|
|
#include "btree_types.h"
|
|
|
|
|
2023-03-02 22:03:01 -07:00
|
|
|
/* Lock class key handed to __six_lock_init() by bch2_btree_lock_init(). */
static struct lock_class_key bch2_btree_node_lock_key;
|
|
|
|
|
2023-05-20 17:57:55 -07:00
|
|
|
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
|
|
|
|
enum six_lock_init_flags flags)
|
2023-03-02 22:03:01 -07:00
|
|
|
{
|
2023-05-20 17:57:55 -07:00
|
|
|
__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
|
2023-12-21 18:34:17 -07:00
|
|
|
lockdep_set_notrack_class(&b->lock);
|
2023-03-02 22:03:01 -07:00
|
|
|
}
|
|
|
|
|
2022-08-19 12:35:34 -07:00
|
|
|
/* Btree node locking: */
|
|
|
|
|
|
|
|
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
|
|
|
|
struct btree_path *skip,
|
2022-08-22 10:21:10 -07:00
|
|
|
struct btree_bkey_cached_common *b,
|
2022-08-19 12:35:34 -07:00
|
|
|
unsigned level)
|
|
|
|
{
|
|
|
|
struct btree_path *path;
|
2022-08-21 20:08:53 -07:00
|
|
|
struct six_lock_count ret;
|
2023-12-10 21:37:45 -07:00
|
|
|
unsigned i;
|
2022-08-21 20:08:53 -07:00
|
|
|
|
|
|
|
memset(&ret, 0, sizeof(ret));
|
2022-08-19 12:35:34 -07:00
|
|
|
|
|
|
|
if (IS_ERR_OR_NULL(b))
|
|
|
|
return ret;
|
|
|
|
|
2023-12-10 21:37:45 -07:00
|
|
|
trans_for_each_path(trans, path, i)
|
2022-08-22 10:21:10 -07:00
|
|
|
if (path != skip && &path->l[level].b->c == b) {
|
2022-08-21 20:08:53 -07:00
|
|
|
int t = btree_node_locked_type(path, level);
|
|
|
|
|
|
|
|
if (t != BTREE_NODE_UNLOCKED)
|
|
|
|
ret.n[t]++;
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* unlock */
|
|
|
|
|
|
|
|
/*
 * Out-of-line wrapper: forwards @trans, @path and @b unchanged to
 * bch2_btree_node_unlock_write_inlined().
 */
void bch2_btree_node_unlock_write(struct btree_trans *trans,
				  struct btree_path *path,
				  struct btree *b)
{
	bch2_btree_node_unlock_write_inlined(trans, path, b);
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* lock */
|
|
|
|
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
/*
|
|
|
|
* @trans wants to lock @b with type @type
|
|
|
|
*/
|
|
|
|
struct trans_waiting_for_lock {
|
|
|
|
struct btree_trans *trans;
|
|
|
|
struct btree_bkey_cached_common *node_want;
|
|
|
|
enum six_lock_type lock_want;
|
|
|
|
|
|
|
|
/* for iterating over held locks :*/
|
|
|
|
u8 path_idx;
|
|
|
|
u8 level;
|
|
|
|
u64 lock_start_time;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct lock_graph {
|
|
|
|
struct trans_waiting_for_lock g[8];
|
|
|
|
unsigned nr;
|
|
|
|
};
|
|
|
|
|
2022-08-22 20:12:11 -07:00
|
|
|
static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
|
|
|
|
{
|
|
|
|
struct trans_waiting_for_lock *i;
|
|
|
|
|
2024-04-10 13:08:24 -07:00
|
|
|
prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
|
2022-08-22 20:12:11 -07:00
|
|
|
|
2024-01-04 16:59:17 -07:00
|
|
|
for (i = g->g; i < g->g + g->nr; i++) {
|
|
|
|
struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
|
|
|
|
if (!task)
|
|
|
|
continue;
|
|
|
|
|
2022-08-22 20:12:11 -07:00
|
|
|
bch2_btree_trans_to_text(out, i->trans);
|
2024-01-22 10:25:00 -07:00
|
|
|
bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
|
2024-01-04 16:59:17 -07:00
|
|
|
}
|
2022-08-22 20:12:11 -07:00
|
|
|
}
|
|
|
|
|
2022-10-01 22:41:08 -07:00
|
|
|
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
|
|
|
|
{
|
|
|
|
struct trans_waiting_for_lock *i;
|
|
|
|
|
|
|
|
for (i = g->g; i != g->g + g->nr; i++) {
|
2023-12-11 09:11:22 -07:00
|
|
|
struct task_struct *task = i->trans->locking_wait.task;
|
2022-10-01 22:41:08 -07:00
|
|
|
if (i != g->g)
|
|
|
|
prt_str(out, "<- ");
|
2023-12-11 09:11:22 -07:00
|
|
|
prt_printf(out, "%u ", task ?task->pid : 0);
|
2022-10-01 22:41:08 -07:00
|
|
|
}
|
|
|
|
prt_newline(out);
|
|
|
|
}
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
static void lock_graph_up(struct lock_graph *g)
|
|
|
|
{
|
|
|
|
closure_put(&g->g[--g->nr].trans->ref);
|
|
|
|
}
|
|
|
|
|
2023-01-20 13:35:07 -07:00
|
|
|
static noinline void lock_graph_pop_all(struct lock_graph *g)
|
|
|
|
{
|
|
|
|
while (g->nr)
|
|
|
|
lock_graph_up(g);
|
|
|
|
}
|
|
|
|
|
2023-06-18 21:07:40 -07:00
|
|
|
static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
|
2022-10-12 15:17:49 -07:00
|
|
|
{
|
|
|
|
g->g[g->nr++] = (struct trans_waiting_for_lock) {
|
|
|
|
.trans = trans,
|
|
|
|
.node_want = trans->locking,
|
|
|
|
.lock_want = trans->locking_wait.lock_want,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2023-06-18 21:07:40 -07:00
|
|
|
static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
closure_get(&trans->ref);
|
|
|
|
__lock_graph_down(g, trans);
|
|
|
|
}
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
static bool lock_graph_remove_non_waiters(struct lock_graph *g)
|
|
|
|
{
|
|
|
|
struct trans_waiting_for_lock *i;
|
|
|
|
|
|
|
|
for (i = g->g + 1; i < g->g + g->nr; i++)
|
|
|
|
if (i->trans->locking != i->node_want ||
|
|
|
|
i->trans->locking_wait.start_time != i[-1].lock_start_time) {
|
|
|
|
while (g->g + g->nr > i)
|
|
|
|
lock_graph_up(g);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-01-04 16:59:17 -07:00
|
|
|
static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
|
2023-05-26 13:59:07 -07:00
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
|
|
|
|
count_event(c, trans_restart_would_deadlock);
|
|
|
|
|
|
|
|
if (trace_trans_restart_would_deadlock_enabled()) {
|
|
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
|
|
|
|
buf.atomic++;
|
|
|
|
print_cycle(&buf, g);
|
|
|
|
|
2024-01-04 16:59:17 -07:00
|
|
|
trace_trans_restart_would_deadlock(trans, buf.buf);
|
2023-05-26 13:59:07 -07:00
|
|
|
printbuf_exit(&buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
|
|
|
|
{
|
|
|
|
if (i == g->g) {
|
2024-01-04 16:59:17 -07:00
|
|
|
trace_would_deadlock(g, i->trans);
|
2022-10-09 01:55:02 -07:00
|
|
|
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
} else {
|
|
|
|
i->trans->lock_must_abort = true;
|
|
|
|
wake_up_process(i->trans->locking_wait.task);
|
2022-10-09 01:55:02 -07:00
|
|
|
return 0;
|
|
|
|
}
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
static int btree_trans_abort_preference(struct btree_trans *trans)
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
{
|
2022-10-12 15:17:49 -07:00
|
|
|
if (trans->lock_may_not_fail)
|
|
|
|
return 0;
|
|
|
|
if (trans->locking_wait.lock_want == SIX_LOCK_write)
|
|
|
|
return 1;
|
|
|
|
if (!trans->in_traverse_all)
|
|
|
|
return 2;
|
|
|
|
return 3;
|
|
|
|
}
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
|
|
|
|
{
|
|
|
|
struct trans_waiting_for_lock *i, *abort = NULL;
|
|
|
|
unsigned best = 0, pref;
|
|
|
|
int ret;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
if (lock_graph_remove_non_waiters(g))
|
|
|
|
return 0;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
/* Only checking, for debugfs: */
|
|
|
|
if (cycle) {
|
|
|
|
print_cycle(cycle, g);
|
|
|
|
ret = -1;
|
|
|
|
goto out;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
for (i = g->g; i < g->g + g->nr; i++) {
|
|
|
|
pref = btree_trans_abort_preference(i->trans);
|
|
|
|
if (pref > best) {
|
|
|
|
abort = i;
|
|
|
|
best = pref;
|
|
|
|
}
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
if (unlikely(!best)) {
|
2022-10-09 01:29:04 -07:00
|
|
|
struct printbuf buf = PRINTBUF;
|
2024-06-02 19:52:24 -07:00
|
|
|
buf.atomic++;
|
2022-10-09 01:29:04 -07:00
|
|
|
|
2022-11-13 18:01:42 -07:00
|
|
|
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
|
2022-10-09 01:29:04 -07:00
|
|
|
|
|
|
|
for (i = g->g; i < g->g + g->nr; i++) {
|
|
|
|
struct btree_trans *trans = i->trans;
|
|
|
|
|
|
|
|
bch2_btree_trans_to_text(&buf, trans);
|
|
|
|
|
2024-04-10 13:08:24 -07:00
|
|
|
prt_printf(&buf, "backtrace:\n");
|
2022-10-09 01:29:04 -07:00
|
|
|
printbuf_indent_add(&buf, 2);
|
2024-01-22 10:25:00 -07:00
|
|
|
bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
|
2022-10-09 01:29:04 -07:00
|
|
|
printbuf_indent_sub(&buf, 2);
|
|
|
|
prt_newline(&buf);
|
|
|
|
}
|
|
|
|
|
2024-07-10 09:59:28 -07:00
|
|
|
bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
|
2022-10-09 01:29:04 -07:00
|
|
|
printbuf_exit(&buf);
|
|
|
|
BUG();
|
|
|
|
}
|
2022-10-01 22:41:08 -07:00
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
ret = abort_lock(g, abort);
|
|
|
|
out:
|
|
|
|
if (ret)
|
|
|
|
while (g->nr)
|
|
|
|
lock_graph_up(g);
|
|
|
|
return ret;
|
2022-10-01 22:41:08 -07:00
|
|
|
}
|
|
|
|
|
2022-08-22 20:12:11 -07:00
|
|
|
static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
|
|
|
|
struct printbuf *cycle)
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
{
|
|
|
|
struct btree_trans *orig_trans = g->g->trans;
|
|
|
|
struct trans_waiting_for_lock *i;
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
for (i = g->g; i < g->g + g->nr; i++)
|
2023-06-18 21:07:40 -07:00
|
|
|
if (i->trans == trans) {
|
|
|
|
closure_put(&trans->ref);
|
2022-10-12 15:17:49 -07:00
|
|
|
return break_cycle(g, cycle);
|
2023-06-18 21:07:40 -07:00
|
|
|
}
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
|
|
|
if (g->nr == ARRAY_SIZE(g->g)) {
|
2023-06-18 21:07:40 -07:00
|
|
|
closure_put(&trans->ref);
|
|
|
|
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
if (orig_trans->lock_may_not_fail)
|
|
|
|
return 0;
|
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
while (g->nr)
|
|
|
|
lock_graph_up(g);
|
2023-01-20 13:35:07 -07:00
|
|
|
|
|
|
|
if (cycle)
|
|
|
|
return 0;
|
|
|
|
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
|
2022-10-12 15:17:49 -07:00
|
|
|
return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
2023-06-18 21:07:40 -07:00
|
|
|
__lock_graph_down(g, trans);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * lock_type_conflicts - do two six-lock types conflict with each other?
 *
 * Works purely on the numeric enum values: the pair conflicts when their sum
 * is greater than 1.
 * NOTE(review): presumably read == 0 and intent == 1, so only pairs that
 * include a read lock and no write lock are compatible - confirm against the
 * six_lock_type definition.
 */
static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
{
	return !(t1 + t2 <= 1);
}
|
|
|
|
|
2022-08-22 20:12:11 -07:00
|
|
|
int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
{
|
|
|
|
struct lock_graph g;
|
|
|
|
struct trans_waiting_for_lock *top;
|
|
|
|
struct btree_bkey_cached_common *b;
|
2023-12-12 18:08:29 -07:00
|
|
|
btree_path_idx_t path_idx;
|
|
|
|
int ret = 0;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2023-05-26 13:59:07 -07:00
|
|
|
g.nr = 0;
|
|
|
|
|
2022-08-22 12:29:53 -07:00
|
|
|
if (trans->lock_must_abort) {
|
2023-01-20 13:35:07 -07:00
|
|
|
if (cycle)
|
|
|
|
return -1;
|
|
|
|
|
2024-01-04 16:59:17 -07:00
|
|
|
trace_would_deadlock(&g, trans);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
|
2022-08-22 12:29:53 -07:00
|
|
|
}
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2022-10-12 15:17:49 -07:00
|
|
|
lock_graph_down(&g, trans);
|
2023-12-12 18:08:29 -07:00
|
|
|
|
|
|
|
/* trans->paths is rcu protected vs. freeing */
|
|
|
|
rcu_read_lock();
|
|
|
|
if (cycle)
|
|
|
|
cycle->atomic++;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
next:
|
|
|
|
if (!g.nr)
|
2023-12-12 18:08:29 -07:00
|
|
|
goto out;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
|
|
|
top = &g.g[g.nr - 1];
|
|
|
|
|
2023-12-12 18:08:29 -07:00
|
|
|
struct btree_path *paths = rcu_dereference(top->trans->paths);
|
|
|
|
if (!paths)
|
|
|
|
goto up;
|
|
|
|
|
|
|
|
unsigned long *paths_allocated = trans_paths_allocated(paths);
|
|
|
|
|
|
|
|
trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
|
|
|
|
path_idx, top->path_idx) {
|
|
|
|
struct btree_path *path = paths + path_idx;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
if (!path->nodes_locked)
|
|
|
|
continue;
|
|
|
|
|
2023-05-27 16:55:54 -07:00
|
|
|
if (path_idx != top->path_idx) {
|
|
|
|
top->path_idx = path_idx;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
top->level = 0;
|
|
|
|
top->lock_start_time = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;
|
|
|
|
top->level < BTREE_MAX_DEPTH;
|
|
|
|
top->level++, top->lock_start_time = 0) {
|
|
|
|
int lock_held = btree_node_locked_type(path, top->level);
|
|
|
|
|
|
|
|
if (lock_held == BTREE_NODE_UNLOCKED)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
b = &READ_ONCE(path->l[top->level].b)->c;
|
|
|
|
|
2023-01-20 13:35:07 -07:00
|
|
|
if (IS_ERR_OR_NULL(b)) {
|
|
|
|
/*
|
|
|
|
* If we get here, it means we raced with the
|
|
|
|
* other thread updating its btree_path
|
|
|
|
* structures - which means it can't be blocked
|
|
|
|
* waiting on a lock:
|
|
|
|
*/
|
|
|
|
if (!lock_graph_remove_non_waiters(&g)) {
|
|
|
|
/*
|
|
|
|
* If lock_graph_remove_non_waiters()
|
|
|
|
* didn't do anything, it must be
|
|
|
|
* because we're being called by debugfs
|
|
|
|
* checking for lock cycles, which
|
|
|
|
* invokes us on btree_transactions that
|
|
|
|
* aren't actually waiting on anything.
|
|
|
|
* Just bail out:
|
|
|
|
*/
|
|
|
|
lock_graph_pop_all(&g);
|
|
|
|
}
|
|
|
|
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (list_empty_careful(&b->lock.wait_list))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
raw_spin_lock(&b->lock.wait_lock);
|
|
|
|
list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
|
|
|
|
BUG_ON(b != trans->locking);
|
|
|
|
|
|
|
|
if (top->lock_start_time &&
|
|
|
|
time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
top->lock_start_time = trans->locking_wait.start_time;
|
|
|
|
|
|
|
|
/* Don't check for self deadlock: */
|
|
|
|
if (trans == top->trans ||
|
|
|
|
!lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
|
|
|
|
continue;
|
|
|
|
|
2023-06-18 21:07:40 -07:00
|
|
|
closure_get(&trans->ref);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
raw_spin_unlock(&b->lock.wait_lock);
|
|
|
|
|
2023-06-18 21:07:40 -07:00
|
|
|
ret = lock_graph_descend(&g, trans, cycle);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
if (ret)
|
2023-12-12 18:08:29 -07:00
|
|
|
goto out;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
goto next;
|
|
|
|
|
|
|
|
}
|
|
|
|
raw_spin_unlock(&b->lock.wait_lock);
|
|
|
|
}
|
|
|
|
}
|
2023-12-12 18:08:29 -07:00
|
|
|
up:
|
2022-10-01 22:41:08 -07:00
|
|
|
if (g.nr > 1 && cycle)
|
|
|
|
print_chain(cycle, &g);
|
2022-10-12 15:17:49 -07:00
|
|
|
lock_graph_up(&g);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
goto next;
|
2023-12-12 18:08:29 -07:00
|
|
|
out:
|
|
|
|
if (cycle)
|
|
|
|
--cycle->atomic;
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
|
|
|
|
{
|
|
|
|
struct btree_trans *trans = p;
|
|
|
|
|
2022-08-22 20:12:11 -07:00
|
|
|
return bch2_check_for_deadlock(trans, NULL);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
}
|
|
|
|
|
2022-08-22 12:29:53 -07:00
|
|
|
int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
struct btree_bkey_cached_common *b,
|
|
|
|
bool lock_may_not_fail)
|
2022-08-19 12:35:34 -07:00
|
|
|
{
|
2022-09-03 18:09:54 -07:00
|
|
|
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
int ret;
|
2022-08-19 12:35:34 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Must drop our read locks before calling six_lock_write() -
|
|
|
|
* six_unlock() won't do wakeups until the reader count
|
|
|
|
* goes to 0, and it's safe because we have the node intent
|
|
|
|
* locked:
|
|
|
|
*/
|
2022-09-03 18:09:54 -07:00
|
|
|
six_lock_readers_add(&b->lock, -readers);
|
2023-02-04 17:39:59 -07:00
|
|
|
ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
|
|
|
|
lock_may_not_fail, _RET_IP_);
|
2022-09-03 18:09:54 -07:00
|
|
|
six_lock_readers_add(&b->lock, readers);
|
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 10:23:47 -07:00
|
|
|
|
2022-08-22 12:29:53 -07:00
|
|
|
if (ret)
|
2023-08-01 17:06:45 -07:00
|
|
|
mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2022-08-22 12:29:53 -07:00
|
|
|
return ret;
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
2023-03-06 06:58:02 -07:00
|
|
|
void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
|
|
|
|
struct btree_path *path,
|
|
|
|
struct btree_bkey_cached_common *b)
|
|
|
|
{
|
2024-04-09 21:10:18 -07:00
|
|
|
int ret = __btree_node_lock_write(trans, path, b, true);
|
2023-03-06 06:58:02 -07:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* relock */
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/*
 * Try to (re)acquire every lock @path wants, starting at path->level and
 * walking upwards until path->locks_want is reached or a level has no node.
 * The starting level is always attempted at least once.
 *
 * @upgrade:	true to use bch2_btree_node_upgrade() at each level, false to
 *		use bch2_btree_node_relock().
 * @f:		optional; when a level fails, its index and node pointer are
 *		stored here. The walk keeps going after a failure, so on return
 *		@f describes the highest level that failed. @f is left
 *		untouched if no level failed.
 *
 * If any level failed, the whole path is unlocked, marked
 * BTREE_ITER_NEED_TRAVERSE, and the node pointers from the highest failed
 * level down to level 0 are replaced with an error pointer.
 *
 * Returns true if, afterwards, path->uptodate is better than
 * BTREE_ITER_NEED_RELOCK.
 */
static inline bool btree_path_get_locks(struct btree_trans *trans,
					struct btree_path *path,
					bool upgrade,
					struct get_locks_fail *f)
{
	unsigned level = path->level;
	int highest_failed = -1;

	do {
		bool got;

		if (!btree_path_node(path, level))
			break;

		if (upgrade)
			got = bch2_btree_node_upgrade(trans, path, level);
		else
			got = bch2_btree_node_relock(trans, path, level);

		if (!got) {
			highest_failed = level;

			if (f) {
				f->l	= level;
				f->b	= path->l[level].b;
			}
		}
	} while (++level < path->locks_want);

	/*
	 * After a failure, none of the nodes at or below the failed level may
	 * be relocked later: poison them so that bch2_btree_path_traverse has
	 * to walk back up to the node we could not lock.
	 */
	if (highest_failed >= 0) {
		struct btree *poison = upgrade
			? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
			: ERR_PTR(-BCH_ERR_no_btree_node_relock);

		__bch2_btree_path_unlock(trans, path);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);

		for (int idx = highest_failed; idx >= 0; idx--)
			path->l[idx].b = poison;
	}

	/* Everything we wanted is locked again: the path is fully usable */
	if (path->uptodate == BTREE_ITER_NEED_RELOCK)
		path->uptodate = BTREE_ITER_UPTODATE;

	return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/*
 * Try to retake the lock @path wants on the node at @level without blocking.
 *
 * The relock succeeds if either six_relock_type() succeeds against the
 * sequence number saved in the path, or the node's sequence number still
 * matches and btree_node_lock_increment() succeeds for the wanted lock type.
 * On success the level is marked locked in @path.
 *
 * @trace: when true (and trans->notrace_relock_fail is not set) a
 *	   btree_path_relock_fail event is traced and counted on failure.
 *
 * Returns true if the node is now locked, false otherwise.
 */
bool __bch2_btree_node_relock(struct btree_trans *trans,
			      struct btree_path *path, unsigned level,
			      bool trace)
{
	struct btree *node = btree_path_node(path, level);
	int lock_type = __btree_lock_want(path, level);
	bool relocked = false;

	/* race_fault() is checked first and forces the failure path when it fires */
	if (!race_fault())
		relocked = six_relock_type(&node->c.lock, lock_type, path->l[level].lock_seq) ||
			(btree_node_lock_seq_matches(path, node, level) &&
			 btree_node_lock_increment(trans, &node->c, level, lock_type));

	if (relocked) {
		mark_btree_node_locked(trans, path, level, lock_type);
		return true;
	}

	if (trace && !trans->notrace_relock_fail)
		trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
	return false;
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* upgrade */
|
|
|
|
|
|
|
|
bool bch2_btree_node_upgrade(struct btree_trans *trans,
|
|
|
|
struct btree_path *path, unsigned level)
|
|
|
|
{
|
|
|
|
struct btree *b = path->l[level].b;
|
2022-08-05 10:06:44 -07:00
|
|
|
struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
|
2022-08-19 16:50:18 -07:00
|
|
|
|
|
|
|
if (!is_btree_node(path, level))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (btree_lock_want(path, level)) {
|
|
|
|
case BTREE_NODE_UNLOCKED:
|
|
|
|
BUG_ON(btree_node_locked(path, level));
|
|
|
|
return true;
|
|
|
|
case BTREE_NODE_READ_LOCKED:
|
|
|
|
BUG_ON(btree_node_intent_locked(path, level));
|
|
|
|
return bch2_btree_node_relock(trans, path, level);
|
|
|
|
case BTREE_NODE_INTENT_LOCKED:
|
|
|
|
break;
|
2022-08-22 18:05:31 -07:00
|
|
|
case BTREE_NODE_WRITE_LOCKED:
|
|
|
|
BUG();
|
2022-08-19 16:50:18 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (btree_node_intent_locked(path, level))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (race_fault())
|
|
|
|
return false;
|
|
|
|
|
2022-08-05 10:06:44 -07:00
|
|
|
if (btree_node_locked(path, level)) {
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
|
|
|
|
ret = six_lock_tryupgrade(&b->c.lock);
|
|
|
|
six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto success;
|
|
|
|
} else {
|
|
|
|
if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
|
|
|
|
goto success;
|
|
|
|
}
|
2022-08-19 16:50:18 -07:00
|
|
|
|
2022-08-05 10:06:44 -07:00
|
|
|
/*
|
|
|
|
* Do we already have an intent lock via another path? If so, just bump
|
|
|
|
* lock count:
|
|
|
|
*/
|
2022-08-19 16:50:18 -07:00
|
|
|
if (btree_node_lock_seq_matches(path, b, level) &&
|
2022-08-22 10:21:10 -07:00
|
|
|
btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
|
2022-08-19 16:50:18 -07:00
|
|
|
btree_node_unlock(trans, path, level);
|
|
|
|
goto success;
|
|
|
|
}
|
|
|
|
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
|
2022-08-19 16:50:18 -07:00
|
|
|
return false;
|
|
|
|
success:
|
2023-08-01 17:06:45 -07:00
|
|
|
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
|
2022-08-19 16:50:18 -07:00
|
|
|
return true;
|
|
|
|
}
|
2022-08-19 12:35:34 -07:00
|
|
|
|
|
|
|
/* Btree path locking: */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only for btree_cache.c - only relocks intent locks
|
|
|
|
*/
|
|
|
|
int bch2_btree_path_relock_intent(struct btree_trans *trans,
|
|
|
|
struct btree_path *path)
|
|
|
|
{
|
|
|
|
unsigned l;
|
|
|
|
|
|
|
|
for (l = path->level;
|
|
|
|
l < path->locks_want && btree_path_node(path, l);
|
|
|
|
l++) {
|
|
|
|
if (!bch2_btree_node_relock(trans, path, l)) {
|
|
|
|
__bch2_btree_path_unlock(trans, path);
|
|
|
|
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
|
2022-08-19 12:35:34 -07:00
|
|
|
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
__flatten
|
2024-01-15 18:40:06 -07:00
|
|
|
bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
|
2022-08-19 12:35:34 -07:00
|
|
|
{
|
2023-10-27 12:23:46 -07:00
|
|
|
struct get_locks_fail f;
|
|
|
|
|
2024-04-09 16:57:08 -07:00
|
|
|
bool ret = btree_path_get_locks(trans, path, false, &f);
|
|
|
|
bch2_trans_verify_locks(trans);
|
|
|
|
return ret;
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int __bch2_btree_path_relock(struct btree_trans *trans,
|
|
|
|
struct btree_path *path, unsigned long trace_ip)
|
|
|
|
{
|
2024-01-15 18:40:06 -07:00
|
|
|
if (!bch2_btree_path_relock_norestart(trans, path)) {
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
|
2022-08-19 12:35:34 -07:00
|
|
|
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
|
|
|
|
struct btree_path *path,
|
2023-10-27 12:23:46 -07:00
|
|
|
unsigned new_locks_want,
|
|
|
|
struct get_locks_fail *f)
|
2022-08-19 12:35:34 -07:00
|
|
|
{
|
|
|
|
EBUG_ON(path->locks_want >= new_locks_want);
|
|
|
|
|
|
|
|
path->locks_want = new_locks_want;
|
|
|
|
|
2024-04-09 16:57:08 -07:00
|
|
|
bool ret = btree_path_get_locks(trans, path, true, f);
|
|
|
|
bch2_trans_verify_locks(trans);
|
|
|
|
return ret;
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
|
|
|
|
struct btree_path *path,
|
2023-10-27 12:23:46 -07:00
|
|
|
unsigned new_locks_want,
|
|
|
|
struct get_locks_fail *f)
|
2022-08-19 12:35:34 -07:00
|
|
|
{
|
2024-04-09 16:57:08 -07:00
|
|
|
bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2022-08-19 12:35:34 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX: this is ugly - we'd prefer to not be mucking with other
|
|
|
|
* iterators in the btree_trans here.
|
|
|
|
*
|
|
|
|
* On failure to upgrade the iterator, setting iter->locks_want and
|
|
|
|
* calling get_locks() is sufficient to make bch2_btree_path_traverse()
|
|
|
|
* get the locks we want on transaction restart.
|
|
|
|
*
|
|
|
|
* But if this iterator was a clone, on transaction restart what we did
|
|
|
|
* to this iterator isn't going to be preserved.
|
|
|
|
*
|
|
|
|
* Possibly we could add an iterator field for the parent iterator when
|
|
|
|
* an iterator is a copy - for now, we'll just upgrade any other
|
|
|
|
* iterators with the same btree id.
|
|
|
|
*
|
|
|
|
* The code below used to be needed to ensure ancestor nodes get locked
|
|
|
|
* before interior nodes - now that's handled by
|
|
|
|
* bch2_btree_path_traverse_all().
|
|
|
|
*/
|
2023-12-10 21:37:45 -07:00
|
|
|
if (!path->cached && !trans->in_traverse_all) {
|
|
|
|
struct btree_path *linked;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
trans_for_each_path(trans, linked, i)
|
2022-08-19 12:35:34 -07:00
|
|
|
if (linked != path &&
|
|
|
|
linked->cached == path->cached &&
|
|
|
|
linked->btree_id == path->btree_id &&
|
|
|
|
linked->locks_want < new_locks_want) {
|
|
|
|
linked->locks_want = new_locks_want;
|
2023-10-27 12:23:46 -07:00
|
|
|
btree_path_get_locks(trans, linked, true, NULL);
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
2023-12-10 21:37:45 -07:00
|
|
|
}
|
2024-04-09 16:57:08 -07:00
|
|
|
out:
|
|
|
|
bch2_trans_verify_locks(trans);
|
|
|
|
return ret;
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void __bch2_btree_path_downgrade(struct btree_trans *trans,
|
|
|
|
struct btree_path *path,
|
|
|
|
unsigned new_locks_want)
|
|
|
|
{
|
2023-11-12 19:47:15 -07:00
|
|
|
unsigned l, old_locks_want = path->locks_want;
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2023-10-27 12:23:46 -07:00
|
|
|
if (trans->restarted)
|
|
|
|
return;
|
|
|
|
|
2022-08-19 12:35:34 -07:00
|
|
|
EBUG_ON(path->locks_want < new_locks_want);
|
|
|
|
|
|
|
|
path->locks_want = new_locks_want;
|
|
|
|
|
|
|
|
while (path->nodes_locked &&
|
2022-08-21 15:17:51 -07:00
|
|
|
(l = btree_path_highest_level_locked(path)) >= path->locks_want) {
|
2022-08-19 12:35:34 -07:00
|
|
|
if (l > path->level) {
|
|
|
|
btree_node_unlock(trans, path, l);
|
|
|
|
} else {
|
|
|
|
if (btree_node_intent_locked(path, l)) {
|
|
|
|
six_lock_downgrade(&path->l[l].b->c.lock);
|
2023-08-01 17:06:45 -07:00
|
|
|
mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bch2_btree_path_verify_locks(path);
|
2023-10-27 12:23:46 -07:00
|
|
|
|
2023-11-12 19:47:15 -07:00
|
|
|
trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* Btree transaction locking: */
|
|
|
|
|
2022-08-19 12:35:34 -07:00
|
|
|
void bch2_trans_downgrade(struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
struct btree_path *path;
|
2023-12-10 21:37:45 -07:00
|
|
|
unsigned i;
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2023-10-27 12:23:46 -07:00
|
|
|
if (trans->restarted)
|
|
|
|
return;
|
|
|
|
|
2023-12-10 21:37:45 -07:00
|
|
|
trans_for_each_path(trans, path, i)
|
2024-02-16 21:50:05 -07:00
|
|
|
if (path->ref)
|
|
|
|
bch2_btree_path_downgrade(trans, path);
|
2022-08-19 12:35:34 -07:00
|
|
|
}
|
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
static inline void __bch2_trans_unlock(struct btree_trans *trans)
|
2022-08-19 12:35:34 -07:00
|
|
|
{
|
|
|
|
struct btree_path *path;
|
2023-12-10 21:37:45 -07:00
|
|
|
unsigned i;
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
trans_for_each_path(trans, path, i)
|
|
|
|
__bch2_btree_path_unlock(trans, path);
|
|
|
|
}
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
/*
 * Slow path for a failed transaction relock: optionally report the failure,
 * then unlock the whole transaction and restart it.
 *
 * @path:  the path that could not be relocked.
 * @f:	   failure details (level and node) as filled in by
 *	   btree_path_get_locks().
 * @trace: when false, neither the tracepoint nor the trans_restart_relock
 *	   counter is touched.
 *
 * When the trans_restart_relock tracepoint is enabled, the message contains
 * the path position, the failed level, the sequence number saved in the path
 * and either the error encoded in the node pointer or the node's current lock
 * sequence number plus the lock counts held by this transaction and in total.
 *
 * Returns the value of btree_trans_restart() for
 * BCH_ERR_transaction_restart_relock.
 */
static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
						  struct get_locks_fail *f, bool trace)
{
	if (trace) {
		if (trace_trans_restart_relock_enabled()) {
			struct printbuf buf = PRINTBUF;

			bch2_bpos_to_text(&buf, path->pos);
			prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);

			if (!IS_ERR_OR_NULL(f->b)) {
				prt_printf(&buf, "%u", f->b->c.lock.seq);

				/* Locks on this node held by paths of this transaction */
				struct six_lock_count ours =
					bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
				prt_printf(&buf, " self locked %u.%u.%u", ours.n[0], ours.n[1], ours.n[2]);

				/* Counts reported by the lock itself */
				struct six_lock_count total = six_lock_counts(&f->b->c.lock);
				prt_printf(&buf, " total locked %u.%u.%u", total.n[0], total.n[1], total.n[2]);
			} else {
				prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
			}

			trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
			printbuf_exit(&buf);
		}

		count_event(trans->c, trans_restart_relock);
	}

	__bch2_trans_unlock(trans);
	bch2_trans_verify_locks(trans);
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
|
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
/*
 * Retake the locks of every path in @trans that has should_be_locked set.
 *
 * @trace: forwarded to bch2_trans_relock_fail() to control whether a failure
 *	   is traced and counted.
 *
 * Returns:
 *  - -trans->restarted if the transaction has already been restarted;
 *  - 0 if the transaction is already marked locked, or if every required
 *    path was relocked (the transaction is then marked locked);
 *  - otherwise the result of bch2_trans_relock_fail() for the first path
 *    that could not be relocked.
 */
static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
{
	bch2_trans_verify_locks(trans);

	if (unlikely(trans->restarted))
		return -((int) trans->restarted);
	if (unlikely(trans->locked))
		goto out;

	struct btree_path *path;
	unsigned i;

	trans_for_each_path(trans, path, i) {
		/*
		 * btree_path_get_locks() only fills this in when a relock
		 * attempt at some level fails, but its return value is derived
		 * from path->uptodate, so it can report failure without ever
		 * writing to it. Zero-initialize so that the failure path never
		 * reads an indeterminate level or node pointer.
		 */
		struct get_locks_fail f = {};

		if (path->should_be_locked &&
		    !btree_path_get_locks(trans, path, false, &f))
			return bch2_trans_relock_fail(trans, path, &f, trace);
	}

	trans_set_locked(trans);
out:
	bch2_trans_verify_locks(trans);
	return 0;
}
|
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
int bch2_trans_relock(struct btree_trans *trans)
|
2023-06-18 10:25:09 -07:00
|
|
|
{
|
2024-04-09 16:45:41 -07:00
|
|
|
return __bch2_trans_relock(trans, true);
|
|
|
|
}
|
2023-06-18 10:25:09 -07:00
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
int bch2_trans_relock_notrace(struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
return __bch2_trans_relock(trans, false);
|
2023-06-18 10:25:09 -07:00
|
|
|
}
|
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
/*
 * Unlock every path in @trans, then mark the transaction itself as unlocked.
 * As written here this does the same two steps as bch2_trans_unlock().
 */
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
	/* Drop the per-path locks first ... */
	__bch2_trans_unlock(trans);

	/* ... then record that the transaction holds nothing */
	trans_set_unlocked(trans);
}
|
2022-08-19 12:35:34 -07:00
|
|
|
|
2024-04-09 16:45:41 -07:00
|
|
|
/*
 * Unlock every path in @trans, then mark the transaction itself as unlocked.
 */
void bch2_trans_unlock(struct btree_trans *trans)
{
	/* Drop the per-path locks first ... */
	__bch2_trans_unlock(trans);

	/* ... then record that the transaction holds nothing */
	trans_set_unlocked(trans);
}
|
2022-08-19 16:50:18 -07:00
|
|
|
|
2023-10-30 09:30:52 -07:00
|
|
|
/*
 * Unlock the transaction as bch2_trans_unlock() does, and additionally call
 * bch2_trans_srcu_unlock() on it.
 */
void bch2_trans_unlock_long(struct btree_trans *trans)
{
	/* Btree node locks go first, the SRCU unlock follows */
	bch2_trans_unlock(trans);
	bch2_trans_srcu_unlock(trans);
}
|
|
|
|
|
2023-02-17 20:43:47 -07:00
|
|
|
/*
 * Take @lock from within a btree transaction. The mutex_lock() call is run
 * through drop_locks_do(); if that reports an error the mutex is released
 * again, so the caller holds @lock only when 0 is returned.
 *
 * Returns 0 on success or the error from drop_locks_do().
 */
int __bch2_trans_mutex_lock(struct btree_trans *trans,
			    struct mutex *lock)
{
	int err = drop_locks_do(trans, (mutex_lock(lock), 0));

	if (!err)
		return 0;

	/* Don't leave the mutex held on the error path */
	mutex_unlock(lock);
	return err;
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
/* Debug */
|
|
|
|
|
|
|
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
|
|
|
|
|
|
|
void bch2_btree_path_verify_locks(struct btree_path *path)
|
|
|
|
{
|
2024-04-05 18:32:06 -07:00
|
|
|
/*
|
|
|
|
* A path may be uptodate and yet have nothing locked if and only if
|
|
|
|
* there is no node at path->level, which generally means we were
|
|
|
|
* iterating over all nodes and got to the end of the btree
|
|
|
|
*/
|
2024-04-10 19:19:40 -07:00
|
|
|
BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
|
|
|
|
btree_path_node(path, path->level) &&
|
|
|
|
!path->nodes_locked);
|
|
|
|
|
|
|
|
if (!path->nodes_locked)
|
2022-08-19 16:50:18 -07:00
|
|
|
return;
|
|
|
|
|
2024-04-10 19:19:40 -07:00
|
|
|
for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
|
2022-08-22 18:05:31 -07:00
|
|
|
int want = btree_lock_want(path, l);
|
|
|
|
int have = btree_node_locked_type(path, l);
|
|
|
|
|
|
|
|
BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
|
|
|
|
|
|
|
|
BUG_ON(is_btree_node(path, l) &&
|
|
|
|
(want == BTREE_NODE_UNLOCKED ||
|
|
|
|
have != BTREE_NODE_WRITE_LOCKED) &&
|
|
|
|
want != have);
|
|
|
|
}
|
2022-08-19 16:50:18 -07:00
|
|
|
}
|
|
|
|
|
2024-04-09 16:57:08 -07:00
|
|
|
static bool bch2_trans_locked(struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
struct btree_path *path;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
trans_for_each_path(trans, path, i)
|
|
|
|
if (path->nodes_locked)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
void bch2_trans_verify_locks(struct btree_trans *trans)
|
|
|
|
{
|
2024-04-09 17:14:21 -07:00
|
|
|
if (!trans->locked) {
|
|
|
|
BUG_ON(bch2_trans_locked(trans));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-08-19 16:50:18 -07:00
|
|
|
struct btree_path *path;
|
2023-12-10 21:37:45 -07:00
|
|
|
unsigned i;
|
2022-08-19 16:50:18 -07:00
|
|
|
|
2023-12-10 21:37:45 -07:00
|
|
|
trans_for_each_path(trans, path, i)
|
2022-08-19 16:50:18 -07:00
|
|
|
bch2_btree_path_verify_locks(path);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|