locking/csd-lock: Use backoff for repeated reports of same incident
Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y
kernels are emitted at five-second intervals.  Although this has proven
to be a good time interval for the first diagnostic, if the target CPU
keeps interrupts disabled for way longer than five seconds, the ratio of
useful new information to pointless repetition decreases considerably.
Therefore, back off the time period for repeated reports of the same
incident, increasing linearly with the number of reports and
logarithmically with the number of online CPUs.

[ paulmck: Apply Dan Carpenter feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
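As a rough illustration of the backoff policy described above, here is a
minimal userspace sketch (not kernel code: ilog2_approx() and backoff_ns()
are made-up stand-ins, and only the arithmetic mirrors the patch's formula):

#include <stdio.h>

/* Userspace stand-in for the kernel's ilog2(): floor(log2(n)). */
static unsigned long ilog2_approx(unsigned long n)
{
	unsigned long log = 0;

	while (n >>= 1)
		log++;
	return log;
}

/*
 * The backed-off threshold from the patch: linear in the number of
 * messages already printed, logarithmic in the number of online CPUs.
 * timeout_ns plays the role of csd_lock_timeout_ns (5 s by default).
 */
static unsigned long long backoff_ns(unsigned long long timeout_ns,
				     unsigned long nmessages,
				     unsigned long ncpus)
{
	unsigned long cpu_factor = !nmessages ? 1 : ilog2_approx(ncpus) / 2 + 1;

	return timeout_ns * (nmessages + 1) * cpu_factor;
}

int main(void)
{
	unsigned long long five_s = 5000000000ULL;
	unsigned long msgs;

	/* On a 64-CPU system this prints thresholds of 5, 40, 60, 80 s. */
	for (msgs = 0; msgs < 4; msgs++)
		printf("report %lu: threshold %llu ns\n",
		       msgs + 1, backoff_ns(five_s, msgs, 64));
	return 0;
}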
commit d40760d681
parent ac9d45544c
kernel/smp.c
10
kernel/smp.c
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -226,7 +226,7 @@ bool csd_lock_is_stuck(void)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages)
 {
 	int cpu = -1;
 	int cpux;
@@ -249,7 +249,9 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	ts2 = sched_clock();
 	/* How long since we last checked for a stuck CSD lock.*/
 	ts_delta = ts2 - *ts1;
-	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
+	if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
+			       (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) ||
+		   csd_lock_timeout_ns == 0))
 		return false;
 
 	firsttime = !*bug_id;
@@ -266,6 +268,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
 		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
 		 cpu, csd->func, csd->info);
+	(*nmessages)++;
 	if (firsttime)
 		atomic_inc(&n_csd_lock_stuck);
 	/*
@@ -306,12 +309,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
  */
 static void __csd_lock_wait(call_single_data_t *csd)
 {
+	unsigned long nmessages = 0;
 	int bug_id = 0;
 	u64 ts0, ts1;
 
 	ts1 = ts0 = sched_clock();
 	for (;;) {
-		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
 			break;
 		cpu_relax();
 	}
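To make the numbers concrete under the formula in the second hunk above:
with the default five-second csd_lock_timeout_ns on an eight-CPU system,
ilog2(8) / 2 + 1 == 2, so the first report still appears after five
seconds (the "!*nmessages ? 1 : ..." term keeps the initial diagnostic
prompt), while subsequent reports for the same incident must wait
5 * 2 * 2 = 20, then 5 * 3 * 2 = 30, then 40 seconds, and so on,
assuming (as in the surrounding code) that *ts1 is reset whenever a
report is emitted.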