1

Fixes and minor updates for the timer migration code:

- Stop testing the group->parent pointer as it is not guaranteed to be
       stable over a chain of operations by design. This includes a warning
       which would be nice to have but it produces false positives due to
       the racy nature of the check.
 
     - Plug a race between CPUs going in and out of idle and a CPU hotplug
       operation. The latter can create and connect a new hierarchy level
       which is missed in the concurrent updates of CPUs which go into idle.
       As a result the events of such a CPU might not be processed and
       timers go stale.
 
       Cure it by splitting the hotplug operation into a prepare and online
       callback. The prepare callback is guaranteed to run on an online and
       therefore active CPU. This CPU updates the hierarchy and being online
       ensures that there is always at least one migrator active which
       handles the modified hierarchy correctly when going idle. The online
       callback which runs on the incoming CPU then just marks the CPU
       active and brings it into operation.
 
     - Improve tracing and polish the code further so it is more obvious
       what's going on.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmajmXYTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoWaGD/4iAuj0S3AQ+odB/Fkg1cvCF8YbOJGy
 PEMSDnYtW7ErShEwVQMsWnXCFhIMuqMN0KIzOChpya2ZkmdT47mNwKuwdOMEQZzg
 dKhmqGWR+sk4LMszCFbf5u6JjJeQ+nnxthgJ1IieJwC4VEfRceXYk7ng6Wvu1+lU
 JEIukUh9nRJWma7FYW8MeNZ4lJGdvawZ5UjUAkPtzeKWn0+0/oqV5t1c8E/1jBbi
 sKZW2soL716Xd/3QJUKKmtAcH7yDFwq5AY5bJURr5ztJw/yr2loVdvAPEdn/RQ6f
 fzN/J+nu2ig14g/QvhI8Ke+HbHJEZHpo6simSZRbdaqnCX3R/lYZwLDe7EGjqKIb
 0slsx2V1UxQ+qIYRplrtr/HGChjG/mXDLPIWRWjsiUAqyygy6QtUIko9AuH99Kd6
 7cBjOzajKIAA/J9SUD03VgjXcQ53bW64NMe2pOX9ED1mbfmmu/ROd0neOgksKw5o
 G5XQ+T6tNOoHMzJgv4R8PiViVdrf53A/g1wYTY1RR3XI8IWpounkyDExDvbtigGo
 N+reKoawDGpXeMAByO2E6UDFNA05NYPjlvSrzTS5ywwyF1qCowKI1Qyup9wA/God
 WJvfesmOJHtfcuUcVZf6Pm6+otJiKrT3reauFd6laEbyRuGTKvtNN/pQa6yxiZzT
 FTxZKpcYkPE76g==
 =G1Lg
 -----END PGP SIGNATURE-----

Merge tag 'timers-urgent-2024-07-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer migration updates from Thomas Gleixner:
 "Fixes and minor updates for the timer migration code:

   - Stop testing the group->parent pointer as it is not guaranteed to
     be stable over a chain of operations by design.

     This includes a warning which would be nice to have but it produces
     false positives due to the racy nature of the check.

   - Plug a race between CPUs going in and out of idle and a CPU hotplug
     operation. The latter can create and connect a new hierarchy level
     which is missed in the concurrent updates of CPUs which go into
     idle. As a result the events of such a CPU might not be processed
     and timers go stale.

     Cure it by splitting the hotplug operation into a prepare and
     online callback. The prepare callback is guaranteed to run on an
     online and therefore active CPU. This CPU updates the hierarchy and
     being online ensures that there is always at least one migrator
     active which handles the modified hierarchy correctly when going
     idle. The online callback which runs on the incoming CPU then just
     marks the CPU active and brings it into operation.

   - Improve tracing and polish the code further so it is more obvious
     what's going on"

* tag 'timers-urgent-2024-07-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers/migration: Fix grammar in comment
  timers/migration: Spare write when nothing changed
  timers/migration: Rename childmask by groupmask to make naming more obvious
  timers/migration: Read childmask and parent pointer in a single place
  timers/migration: Use a single struct for hierarchy walk data
  timers/migration: Improve tracing
  timers/migration: Move hierarchy setup into cpuhotplug prepare callback
  timers/migration: Do not rely always on group->parent
This commit is contained in:
Linus Torvalds 2024-07-27 10:19:55 -07:00
commit 5256184b61
4 changed files with 224 additions and 213 deletions

View File

@ -122,6 +122,7 @@ enum cpuhp_state {
CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE, CPUHP_ZCOMP_PREPARE,
CPUHP_TIMERS_PREPARE, CPUHP_TIMERS_PREPARE,
CPUHP_TMIGR_PREPARE,
CPUHP_MIPS_SOC_PREPARE, CPUHP_MIPS_SOC_PREPARE,
CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN,
CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,

View File

@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
__field( unsigned int, lvl ) __field( unsigned int, lvl )
__field( unsigned int, numa_node ) __field( unsigned int, numa_node )
__field( unsigned int, num_children ) __field( unsigned int, num_children )
__field( u32, childmask ) __field( u32, groupmask )
), ),
TP_fast_assign( TP_fast_assign(
@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent,
__entry->lvl = child->parent->level; __entry->lvl = child->parent->level;
__entry->numa_node = child->parent->numa_node; __entry->numa_node = child->parent->numa_node;
__entry->num_children = child->parent->num_children; __entry->num_children = child->parent->num_children;
__entry->childmask = child->childmask; __entry->groupmask = child->groupmask;
), ),
TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
__entry->child, __entry->childmask, __entry->parent, __entry->child, __entry->groupmask, __entry->parent,
__entry->lvl, __entry->numa_node, __entry->num_children) __entry->lvl, __entry->numa_node, __entry->num_children)
); );
@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__field( unsigned int, lvl ) __field( unsigned int, lvl )
__field( unsigned int, numa_node ) __field( unsigned int, numa_node )
__field( unsigned int, num_children ) __field( unsigned int, num_children )
__field( u32, childmask ) __field( u32, groupmask )
), ),
TP_fast_assign( TP_fast_assign(
@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__entry->lvl = tmc->tmgroup->level; __entry->lvl = tmc->tmgroup->level;
__entry->numa_node = tmc->tmgroup->numa_node; __entry->numa_node = tmc->tmgroup->numa_node;
__entry->num_children = tmc->tmgroup->num_children; __entry->num_children = tmc->tmgroup->num_children;
__entry->childmask = tmc->childmask; __entry->groupmask = tmc->groupmask;
), ),
TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
__entry->cpu, __entry->childmask, __entry->parent, __entry->cpu, __entry->groupmask, __entry->parent,
__entry->lvl, __entry->numa_node, __entry->num_children) __entry->lvl, __entry->numa_node, __entry->num_children)
); );

View File

@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
return bitmap_weight(&active, BIT_CNT) <= 1; return bitmap_weight(&active, BIT_CNT) <= 1;
} }
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); /**
* struct tmigr_walk - data required for walking the hierarchy
* @nextexp: Next CPU event expiry information which is handed into
* the timer migration code by the timer code
* (get_next_timer_interrupt())
* @firstexp: Contains the first event expiry information when
* hierarchy is completely idle. When the CPU itself was the
* last one going idle, this information makes sure that the
* CPU will be back in time. When using this value in the remote
* expiry case, firstexp is stored in the per CPU tmigr_cpu
* struct of CPU which expires remote timers. It is updated
* in top level group only. Be aware, there could occur a
* new top level of the hierarchy between the 'top level
* call' in tmigr_update_events() and the check for the
* parent group in walk_groups(). Then @firstexp might
* contain a value != KTIME_MAX even if it was not the
* final top level. This is not a problem, as the worst
* outcome is a CPU which might wake up a little early.
* @evt: Pointer to tmigr_event which needs to be queued (of idle
* child group)
* @childmask: groupmask of child group
* @remote: Is set, when the new timer path is executed in
* tmigr_handle_remote_cpu()
* @basej: timer base in jiffies
* @now: timer base monotonic
* @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only
* @tmc_active: this flag indicates, whether the CPU which triggers
* the hierarchy walk is !idle in the timer migration
* hierarchy. When the CPU is idle and the whole hierarchy is
* idle, only the first event of the top level has to be
* considered.
*/
/* Single data carrier handed to every up_f callback during a hierarchy walk. */
struct tmigr_walk {
/* Next CPU event expiry handed in by the timer code */
u64 nextexp;
/* First event expiry when the hierarchy is completely idle */
u64 firstexp;
/* Event of the idle child group which needs to be queued */
struct tmigr_event *evt;
/* groupmask of the child group; refreshed per level in __walk_groups() */
u8 childmask;
/* Set when the new timer path runs in tmigr_handle_remote_cpu() */
bool remote;
/* Timer base in jiffies */
unsigned long basej;
/* Timer base monotonic */
u64 now;
/* Set if remote timers need handling; tmigr_requires_handle_remote() only */
bool check;
/* True when the CPU triggering the walk is !idle in the hierarchy */
bool tmc_active;
};
static void __walk_groups(up_f up, void *data, typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
static void __walk_groups(up_f up, struct tmigr_walk *data,
struct tmigr_cpu *tmc) struct tmigr_cpu *tmc)
{ {
struct tmigr_group *child = NULL, *group = tmc->tmgroup; struct tmigr_group *child = NULL, *group = tmc->tmgroup;
@ -490,64 +535,17 @@ static void __walk_groups(up_f up, void *data,
child = group; child = group;
group = group->parent; group = group->parent;
data->childmask = child->groupmask;
} while (group); } while (group);
} }
static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
{ {
lockdep_assert_held(&tmc->lock); lockdep_assert_held(&tmc->lock);
__walk_groups(up, data, tmc); __walk_groups(up, data, tmc);
} }
/**
* struct tmigr_walk - data required for walking the hierarchy
* @nextexp: Next CPU event expiry information which is handed into
* the timer migration code by the timer code
* (get_next_timer_interrupt())
* @firstexp: Contains the first event expiry information when last
* active CPU of hierarchy is on the way to idle to make
* sure CPU will be back in time.
* @evt: Pointer to tmigr_event which needs to be queued (of idle
* child group)
* @childmask: childmask of child group
* @remote: Is set, when the new timer path is executed in
* tmigr_handle_remote_cpu()
*/
struct tmigr_walk {
u64 nextexp;
u64 firstexp;
struct tmigr_event *evt;
u8 childmask;
bool remote;
};
/**
* struct tmigr_remote_data - data required for remote expiry hierarchy walk
* @basej: timer base in jiffies
* @now: timer base monotonic
* @firstexp: returns expiry of the first timer in the idle timer
* migration hierarchy to make sure the timer is handled in
* time; it is stored in the per CPU tmigr_cpu struct of
* CPU which expires remote timers
* @childmask: childmask of child group
* @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only
* @tmc_active: this flag indicates, whether the CPU which triggers
* the hierarchy walk is !idle in the timer migration
* hierarchy. When the CPU is idle and the whole hierarchy is
* idle, only the first event of the top level has to be
* considered.
*/
struct tmigr_remote_data {
unsigned long basej;
u64 now;
u64 firstexp;
u8 childmask;
bool check;
bool tmc_active;
};
/* /*
* Returns the next event of the timerqueue @group->events * Returns the next event of the timerqueue @group->events
* *
@ -618,10 +616,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group)
static bool tmigr_active_up(struct tmigr_group *group, static bool tmigr_active_up(struct tmigr_group *group,
struct tmigr_group *child, struct tmigr_group *child,
void *ptr) struct tmigr_walk *data)
{ {
union tmigr_state curstate, newstate; union tmigr_state curstate, newstate;
struct tmigr_walk *data = ptr;
bool walk_done; bool walk_done;
u8 childmask; u8 childmask;
@ -649,8 +646,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
if ((walk_done == false) && group->parent) trace_tmigr_group_set_cpu_active(group, newstate, childmask);
data->childmask = group->childmask;
/* /*
* The group is active (again). The group event might be still queued * The group is active (again). The group event might be still queued
@ -666,8 +662,6 @@ static bool tmigr_active_up(struct tmigr_group *group,
*/ */
group->groupevt.ignore = true; group->groupevt.ignore = true;
trace_tmigr_group_set_cpu_active(group, newstate, childmask);
return walk_done; return walk_done;
} }
@ -675,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc)
{ {
struct tmigr_walk data; struct tmigr_walk data;
data.childmask = tmc->childmask; data.childmask = tmc->groupmask;
trace_tmigr_cpu_active(tmc); trace_tmigr_cpu_active(tmc);
@ -860,10 +854,8 @@ unlock:
static bool tmigr_new_timer_up(struct tmigr_group *group, static bool tmigr_new_timer_up(struct tmigr_group *group,
struct tmigr_group *child, struct tmigr_group *child,
void *ptr) struct tmigr_walk *data)
{ {
struct tmigr_walk *data = ptr;
return tmigr_update_events(group, child, data); return tmigr_update_events(group, child, data);
} }
@ -995,9 +987,8 @@ unlock:
static bool tmigr_handle_remote_up(struct tmigr_group *group, static bool tmigr_handle_remote_up(struct tmigr_group *group,
struct tmigr_group *child, struct tmigr_group *child,
void *ptr) struct tmigr_walk *data)
{ {
struct tmigr_remote_data *data = ptr;
struct tmigr_event *evt; struct tmigr_event *evt;
unsigned long jif; unsigned long jif;
u8 childmask; u8 childmask;
@ -1034,12 +1025,10 @@ again:
} }
/* /*
* Update of childmask for the next level and keep track of the expiry * Keep track of the expiry of the first event that needs to be handled
* of the first event that needs to be handled (group->next_expiry was * (group->next_expiry was updated by tmigr_next_expired_groupevt(),
* updated by tmigr_next_expired_groupevt(), next was set by * next was set by tmigr_handle_remote_cpu()).
* tmigr_handle_remote_cpu()).
*/ */
data->childmask = group->childmask;
data->firstexp = group->next_expiry; data->firstexp = group->next_expiry;
raw_spin_unlock_irq(&group->lock); raw_spin_unlock_irq(&group->lock);
@ -1055,12 +1044,12 @@ again:
void tmigr_handle_remote(void) void tmigr_handle_remote(void)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
struct tmigr_remote_data data; struct tmigr_walk data;
if (tmigr_is_not_available(tmc)) if (tmigr_is_not_available(tmc))
return; return;
data.childmask = tmc->childmask; data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX; data.firstexp = KTIME_MAX;
/* /*
@ -1068,7 +1057,7 @@ void tmigr_handle_remote(void)
* in tmigr_handle_remote_up() anyway. Keep this check to speed up the * in tmigr_handle_remote_up() anyway. Keep this check to speed up the
* return when nothing has to be done. * return when nothing has to be done.
*/ */
if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {
/* /*
* If this CPU was an idle migrator, make sure to clear its wakeup * If this CPU was an idle migrator, make sure to clear its wakeup
* value so it won't chase timers that have already expired elsewhere. * value so it won't chase timers that have already expired elsewhere.
@ -1097,9 +1086,8 @@ void tmigr_handle_remote(void)
static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
struct tmigr_group *child, struct tmigr_group *child,
void *ptr) struct tmigr_walk *data)
{ {
struct tmigr_remote_data *data = ptr;
u8 childmask; u8 childmask;
childmask = data->childmask; childmask = data->childmask;
@ -1118,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
* group before reading the next_expiry value. * group before reading the next_expiry value.
*/ */
if (group->parent && !data->tmc_active) if (group->parent && !data->tmc_active)
goto out; return false;
/* /*
* The lock is required on 32bit architectures to read the variable * The lock is required on 32bit architectures to read the variable
@ -1143,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
raw_spin_unlock(&group->lock); raw_spin_unlock(&group->lock);
} }
out:
/* Update of childmask for the next level */
data->childmask = group->childmask;
return false; return false;
} }
@ -1157,7 +1142,7 @@ out:
bool tmigr_requires_handle_remote(void) bool tmigr_requires_handle_remote(void)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
struct tmigr_remote_data data; struct tmigr_walk data;
unsigned long jif; unsigned long jif;
bool ret = false; bool ret = false;
@ -1165,7 +1150,7 @@ bool tmigr_requires_handle_remote(void)
return ret; return ret;
data.now = get_jiffies_update(&jif); data.now = get_jiffies_update(&jif);
data.childmask = tmc->childmask; data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX; data.firstexp = KTIME_MAX;
data.tmc_active = !tmc->idle; data.tmc_active = !tmc->idle;
data.check = false; data.check = false;
@ -1230,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp)
if (nextexp != tmc->cpuevt.nextevt.expires || if (nextexp != tmc->cpuevt.nextevt.expires ||
tmc->cpuevt.ignore) { tmc->cpuevt.ignore) {
ret = tmigr_new_timer(tmc, nextexp); ret = tmigr_new_timer(tmc, nextexp);
/*
* Make sure the reevaluation of timers in idle path
* will not miss an event.
*/
WRITE_ONCE(tmc->wakeup, ret);
} }
} }
/*
* Make sure the reevaluation of timers in idle path will not miss an
* event.
*/
WRITE_ONCE(tmc->wakeup, ret);
trace_tmigr_cpu_new_timer_idle(tmc, nextexp); trace_tmigr_cpu_new_timer_idle(tmc, nextexp);
raw_spin_unlock(&tmc->lock); raw_spin_unlock(&tmc->lock);
return ret; return ret;
@ -1245,10 +1229,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp)
static bool tmigr_inactive_up(struct tmigr_group *group, static bool tmigr_inactive_up(struct tmigr_group *group,
struct tmigr_group *child, struct tmigr_group *child,
void *ptr) struct tmigr_walk *data)
{ {
union tmigr_state curstate, newstate, childstate; union tmigr_state curstate, newstate, childstate;
struct tmigr_walk *data = ptr;
bool walk_done; bool walk_done;
u8 childmask; u8 childmask;
@ -1299,9 +1282,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active));
if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) {
newstate.state)) trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
break; break;
}
/* /*
* The memory barrier is paired with the cmpxchg() in * The memory barrier is paired with the cmpxchg() in
@ -1317,22 +1301,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
/* Event Handling */ /* Event Handling */
tmigr_update_events(group, child, data); tmigr_update_events(group, child, data);
if (group->parent && (walk_done == false))
data->childmask = group->childmask;
/*
* data->firstexp was set by tmigr_update_events() and contains the
* expiry of the first global event which needs to be handled. It
* differs from KTIME_MAX if:
* - group is the top level group and
* - group is idle (which means CPU was the last active CPU in the
* hierarchy) and
* - there is a pending event in the hierarchy
*/
WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent);
trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
return walk_done; return walk_done;
} }
@ -1341,7 +1309,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp)
struct tmigr_walk data = { .nextexp = nextexp, struct tmigr_walk data = { .nextexp = nextexp,
.firstexp = KTIME_MAX, .firstexp = KTIME_MAX,
.evt = &tmc->cpuevt, .evt = &tmc->cpuevt,
.childmask = tmc->childmask }; .childmask = tmc->groupmask };
/* /*
* If nextexp is KTIME_MAX, the CPU event will be ignored because the * If nextexp is KTIME_MAX, the CPU event will be ignored because the
@ -1400,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp)
* the only one in the level 0 group; and if it is the * the only one in the level 0 group; and if it is the
* only one in level 0 group, but there are more than a * only one in level 0 group, but there are more than a
* single group active on the way to top level) * single group active on the way to top level)
* * nextevt - when CPU is offline and has to handle timer on his own * * nextevt - when CPU is offline and has to handle timer on its own
* or when on the way to top in every group only a single * or when on the way to top in every group only a single
* child is active but @nextevt is before the lowest * child is active but @nextevt is before the lowest
* next_expiry encountered while walking up to top level. * next_expiry encountered while walking up to top level.
@ -1419,7 +1387,7 @@ u64 tmigr_quick_check(u64 nextevt)
if (WARN_ON_ONCE(tmc->idle)) if (WARN_ON_ONCE(tmc->idle))
return nextevt; return nextevt;
if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))
return KTIME_MAX; return KTIME_MAX;
do { do {
@ -1442,6 +1410,66 @@ u64 tmigr_quick_check(u64 nextevt)
return KTIME_MAX; return KTIME_MAX;
} }
/*
 * tmigr_trigger_active() - trigger a CPU to become active again
 * @unused:	ignored; tmigr_cpu_offline() passes NULL via work_on_cpu()
 *
 * This function is executed on a CPU which is part of cpu_online_mask, when the
 * last active CPU in the hierarchy is offlining. With this, it is ensured that
 * the other CPU is active and takes over the migrator duty.
 *
 * The body itself only sanity-checks the per-CPU timer migration data of the
 * executing CPU: it warns once if that CPU is not marked online or is marked
 * idle.
 *
 * Return: always 0
 */
static long tmigr_trigger_active(void *unused)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
/* The executing CPU is expected to be online and !idle at this point */
WARN_ON_ONCE(!tmc->online || tmc->idle);
return 0;
}
/*
 * tmigr_cpu_offline() - take a CPU out of the timer migration hierarchy
 * @cpu:	number of the CPU going offline; only used to exclude it when
 *		picking another online CPU
 *
 * Registered together with tmigr_cpu_online() for the "tmigr:online" hotplug
 * state in tmigr_init(). Operates on the per-CPU data of the executing CPU
 * (this_cpu_ptr()).
 *
 * Return: always 0
 */
static int tmigr_cpu_offline(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator;
u64 firstexp;
raw_spin_lock_irq(&tmc->lock);
/* Mark the CPU offline and drop any stored wakeup value */
tmc->online = false;
WRITE_ONCE(tmc->wakeup, KTIME_MAX);
/*
* CPU has to handle the local events on its own, when on the way to
* offline; therefore the nextevt value is set to KTIME_MAX
*/
firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
trace_tmigr_cpu_offline(tmc);
raw_spin_unlock_irq(&tmc->lock);
/*
* NOTE(review): a firstexp other than KTIME_MAX presumably means this
* CPU was the last active one and an event is still pending in the
* hierarchy - confirm against __tmigr_cpu_deactivate(). In that case
* run tmigr_trigger_active() on some other online CPU.
*/
if (firstexp != KTIME_MAX) {
migrator = cpumask_any_but(cpu_online_mask, cpu);
work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0;
}
/*
 * tmigr_cpu_online() - bring a CPU into operation in the timer migration code
 * @cpu:	number of the CPU coming online (not referenced in the body; the
 *		per-CPU data of the executing CPU is used via this_cpu_ptr())
 *
 * Registered together with tmigr_cpu_offline() for the "tmigr:online" hotplug
 * state in tmigr_init(). The function expects tmc->tmgroup to be set already
 * and does no hierarchy setup itself.
 *
 * Return: 0 on success, -EINVAL if tmc->tmgroup is not set
 */
static int tmigr_cpu_online(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
/* Check whether CPU data was successfully initialized */
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
raw_spin_lock_irq(&tmc->lock);
trace_tmigr_cpu_online(tmc);
/* Take over the idle state of the timer base; activate only if !idle */
tmc->idle = timer_base_is_idle();
if (!tmc->idle)
__tmigr_cpu_activate(tmc);
/* Set online only after the idle state and activation are settled */
tmc->online = true;
raw_spin_unlock_irq(&tmc->lock);
return 0;
}
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node) int node)
{ {
@ -1514,21 +1542,25 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
} }
static void tmigr_connect_child_parent(struct tmigr_group *child, static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent) struct tmigr_group *parent,
bool activate)
{ {
union tmigr_state childstate; struct tmigr_walk data;
raw_spin_lock_irq(&child->lock); raw_spin_lock_irq(&child->lock);
raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
child->parent = parent; child->parent = parent;
child->childmask = BIT(parent->num_children++); child->groupmask = BIT(parent->num_children++);
raw_spin_unlock(&parent->lock); raw_spin_unlock(&parent->lock);
raw_spin_unlock_irq(&child->lock); raw_spin_unlock_irq(&child->lock);
trace_tmigr_connect_child_parent(child); trace_tmigr_connect_child_parent(child);
if (!activate)
return;
/* /*
* To prevent inconsistent states, active children need to be active in * To prevent inconsistent states, active children need to be active in
* the new parent as well. Inactive children are already marked inactive * the new parent as well. Inactive children are already marked inactive
@ -1544,21 +1576,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
* child to the new parent. So tmigr_connect_child_parent() is * child to the new parent. So tmigr_connect_child_parent() is
* executed with the formerly top level group (child) and the newly * executed with the formerly top level group (child) and the newly
* created group (parent). * created group (parent).
*
* * It is ensured that the child is active, as this setup path is
* executed in hotplug prepare callback. This is executed by an
* already connected and !idle CPU. Even if all other CPUs go idle,
* the CPU executing the setup will be responsible up to current top
* level group. And the next time it goes inactive, it will release
* the new childmask and parent to subsequent walkers through this
* @child. Therefore propagate active state unconditionally.
*/ */
childstate.state = atomic_read(&child->migr_state); data.childmask = child->groupmask;
if (childstate.migrator != TMIGR_NONE) {
struct tmigr_walk data;
data.childmask = child->childmask; /*
* There is only one new level per time (which is protected by
/* * tmigr_mutex). When connecting the child and the parent and set the
* There is only one new level per time. When connecting the * child active when the parent is inactive, the parent needs to be the
* child and the parent and set the child active when the parent * uppermost level. Otherwise there went something wrong!
* is inactive, the parent needs to be the uppermost */
* level. Otherwise there went something wrong! WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
*/
WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
}
} }
static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
@ -1611,12 +1646,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
* Update tmc -> group / child -> group connection * Update tmc -> group / child -> group connection
*/ */
if (i == 0) { if (i == 0) {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
raw_spin_lock_irq(&group->lock); raw_spin_lock_irq(&group->lock);
tmc->tmgroup = group; tmc->tmgroup = group;
tmc->childmask = BIT(group->num_children++); tmc->groupmask = BIT(group->num_children++);
raw_spin_unlock_irq(&group->lock); raw_spin_unlock_irq(&group->lock);
@ -1626,7 +1661,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
continue; continue;
} else { } else {
child = stack[i - 1]; child = stack[i - 1];
tmigr_connect_child_parent(child, group); /* Will be activated at online time */
tmigr_connect_child_parent(child, group, false);
} }
/* check if uppermost level was newly created */ /* check if uppermost level was newly created */
@ -1637,12 +1673,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
lvllist = &tmigr_level_list[top]; lvllist = &tmigr_level_list[top];
if (group->num_children == 1 && list_is_singular(lvllist)) { if (group->num_children == 1 && list_is_singular(lvllist)) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
* it may spuriously activate the old top level group inside
* the new one (regardless of whether the old top level group is
* active or not) and/or release an uninitialized childmask.
*/
WARN_ON_ONCE(cpu == raw_smp_processor_id());
lvllist = &tmigr_level_list[top - 1]; lvllist = &tmigr_level_list[top - 1];
list_for_each_entry(child, lvllist, list) { list_for_each_entry(child, lvllist, list) {
if (child->parent) if (child->parent)
continue; continue;
tmigr_connect_child_parent(child, group); tmigr_connect_child_parent(child, group, true);
} }
} }
} }
@ -1664,80 +1709,31 @@ static int tmigr_add_cpu(unsigned int cpu)
return ret; return ret;
} }
static int tmigr_cpu_online(unsigned int cpu) static int tmigr_cpu_prepare(unsigned int cpu)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
int ret; int ret = 0;
/* First online attempt? Initialize CPU data */ /* Not first online attempt? */
if (!tmc->tmgroup) { if (tmc->tmgroup)
raw_spin_lock_init(&tmc->lock); return ret;
ret = tmigr_add_cpu(cpu); raw_spin_lock_init(&tmc->lock);
if (ret < 0) timerqueue_init(&tmc->cpuevt.nextevt);
return ret; tmc->cpuevt.nextevt.expires = KTIME_MAX;
tmc->cpuevt.ignore = true;
if (tmc->childmask == 0) tmc->cpuevt.cpu = cpu;
return -EINVAL; tmc->remote = false;
timerqueue_init(&tmc->cpuevt.nextevt);
tmc->cpuevt.nextevt.expires = KTIME_MAX;
tmc->cpuevt.ignore = true;
tmc->cpuevt.cpu = cpu;
tmc->remote = false;
WRITE_ONCE(tmc->wakeup, KTIME_MAX);
}
raw_spin_lock_irq(&tmc->lock);
trace_tmigr_cpu_online(tmc);
tmc->idle = timer_base_is_idle();
if (!tmc->idle)
__tmigr_cpu_activate(tmc);
tmc->online = true;
raw_spin_unlock_irq(&tmc->lock);
return 0;
}
/*
* tmigr_trigger_active() - trigger a CPU to become active again
*
* This function is executed on a CPU which is part of cpu_online_mask, when the
* last active CPU in the hierarchy is offlining. With this, it is ensured that
* the other CPU is active and takes over the migrator duty.
*/
static long tmigr_trigger_active(void *unused)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
WARN_ON_ONCE(!tmc->online || tmc->idle);
return 0;
}
static int tmigr_cpu_offline(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator;
u64 firstexp;
raw_spin_lock_irq(&tmc->lock);
tmc->online = false;
WRITE_ONCE(tmc->wakeup, KTIME_MAX); WRITE_ONCE(tmc->wakeup, KTIME_MAX);
/* ret = tmigr_add_cpu(cpu);
* CPU has to handle the local events on his own, when on the way to if (ret < 0)
* offline; Therefore nextevt value is set to KTIME_MAX return ret;
*/
firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
trace_tmigr_cpu_offline(tmc);
raw_spin_unlock_irq(&tmc->lock);
if (firstexp != KTIME_MAX) { if (tmc->groupmask == 0)
migrator = cpumask_any_but(cpu_online_mask, cpu); return -EINVAL;
work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0; return ret;
} }
static int __init tmigr_init(void) static int __init tmigr_init(void)
@ -1796,6 +1792,11 @@ static int __init tmigr_init(void)
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
tmigr_crossnode_level); tmigr_crossnode_level);
ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
tmigr_cpu_prepare, NULL);
if (ret)
goto err;
ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
tmigr_cpu_online, tmigr_cpu_offline); tmigr_cpu_online, tmigr_cpu_offline);
if (ret) if (ret)
@ -1807,4 +1808,4 @@ err:
pr_err("Timer migration setup failed\n"); pr_err("Timer migration setup failed\n");
return ret; return ret;
} }
late_initcall(tmigr_init); early_initcall(tmigr_init);

View File

@ -22,7 +22,17 @@ struct tmigr_event {
* struct tmigr_group - timer migration hierarchy group * struct tmigr_group - timer migration hierarchy group
* @lock: Lock protecting the event information and group hierarchy * @lock: Lock protecting the event information and group hierarchy
* information during setup * information during setup
* @parent: Pointer to the parent group * @parent: Pointer to the parent group. Pointer is updated when a
* new hierarchy level is added because of a CPU coming
* online the first time. Once it is set, the pointer will
* not be removed or updated. Accessing the parent pointer
* locklessly to decide whether to abort a propagation or
* not is not a problem. The worst outcome is an
* unnecessary/early CPU wake up. But do not access parent
* pointer several times in the same 'action' (like
* activation, deactivation, check for remote expiry,...)
* without holding the lock as it is not ensured that value
* will not change.
* @groupevt: Next event of the group which is only used when the * @groupevt: Next event of the group which is only used when the
* group is !active. The group event is then queued into * group is !active. The group event is then queued into
* the parent timer queue. * the parent timer queue.
@ -41,9 +51,8 @@ struct tmigr_event {
* @num_children: Counter of group children to make sure the group is only * @num_children: Counter of group children to make sure the group is only
* filled with TMIGR_CHILDREN_PER_GROUP; Required for setup * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
* only * only
* @childmask: childmask of the group in the parent group; is set * @groupmask: mask of the group in the parent group; is set during
* during setup and will never change; can be read * setup and will never change; can be read lockless
* lockless
* @list: List head that is added to the per level * @list: List head that is added to the per level
* tmigr_level_list; is required during setup when a * tmigr_level_list; is required during setup when a
* new group needs to be connected to the existing * new group needs to be connected to the existing
@ -59,7 +68,7 @@ struct tmigr_group {
unsigned int level; unsigned int level;
int numa_node; int numa_node;
unsigned int num_children; unsigned int num_children;
u8 childmask; u8 groupmask;
struct list_head list; struct list_head list;
}; };
@ -79,7 +88,7 @@ struct tmigr_group {
* hierarchy * hierarchy
* @remote: Is set when timers of the CPU are expired remotely * @remote: Is set when timers of the CPU are expired remotely
* @tmgroup: Pointer to the parent group * @tmgroup: Pointer to the parent group
* @childmask: childmask of tmigr_cpu in the parent group * @groupmask: mask of tmigr_cpu in the parent group
* @wakeup: Stores the first timer when the timer migration * @wakeup: Stores the first timer when the timer migration
* hierarchy is completely idle and remote expiry was done; * hierarchy is completely idle and remote expiry was done;
* is returned to timer code in the idle path and is only * is returned to timer code in the idle path and is only
@ -92,7 +101,7 @@ struct tmigr_cpu {
bool idle; bool idle;
bool remote; bool remote;
struct tmigr_group *tmgroup; struct tmigr_group *tmgroup;
u8 childmask; u8 groupmask;
u64 wakeup; u64 wakeup;
struct tmigr_event cpuevt; struct tmigr_event cpuevt;
}; };
@ -108,8 +117,8 @@ union tmigr_state {
u32 state; u32 state;
/** /**
* struct - split state of tmigr_group * struct - split state of tmigr_group
* @active: Contains each childmask bit of the active children * @active: Contains each mask bit of the active children
* @migrator: Contains childmask of the child which is migrator * @migrator: Contains mask of the child which is migrator
* @seq: Sequence counter needs to be increased when an update * @seq: Sequence counter needs to be increased when an update
* to the tmigr_state is done. It prevents a race when * to the tmigr_state is done. It prevents a race when
* updates in the child groups are propagated in changed * updates in the child groups are propagated in changed