1

Additional thermal control fix for 6.11-rc1

Prevent the thermal core from flooding the kernel log with useless
 messages if thermal zone temperature can never be determined (or its
 sensor has failed permanently) and make it finally give up and disable
 defective thermal zones (Rafael Wysocki).
 -----BEGIN PGP SIGNATURE-----
 
 iQJGBAABCAAwFiEE4fcc61cGeeHD/fCwgsRv/nhiVHEFAmajgZUSHHJqd0Byand5
 c29ja2kubmV0AAoJEILEb/54YlRxiacP/269B//oY03dsPV2Y7KHourWHmcZL0xf
 wVNVNecRdhZ3mGxlJiQ682Sm4HF3dsgW368OgdedKXxpCdSGoi+nOyhxk8Cw1SVh
 INgq2PILfkqkT3GeUUCxI/i9fRLEl3MQMGMbrTPHlt/YjSE1BTYD8jx5+nx9uMvO
 +AiBf+n2BmxWRdZKPXgLp9VzDYIr3eG9+6wNEvBfnXN0BpfhOdI40YCsGIQGT0uf
 ISmNWOhrgNZZQageWgywwMytkyiJqWAB+mGNGtkUqZsD9K9Q8qWh/WvWRVWd7N57
 GNCkr2KiqoJ/kI2he2gQCgL27W73EFbMtt3XIdFy4pSur6UBvhs06mcUYsw3yhp6
 mXtYmf69/w9TH6Hs8Fo6teA2L3C9yasUYBr8/Sf2CUM1NtKLDi/nPBG6uTqcOj4F
 qZf25OeTkpN5ejTtZdrdUgdjkt2bAVM+zDx3JQjmfrsWHenEpCSQ4xQi4zdh8cEp
 4SocP2t6w8g3sQK/i5dDgFBW4JUbx+WgkPD/L5NrEJwPvthHHw68CuNdNgeateaH
 6WkedCm3JMLuNr4sEzLU7TArEhpFLgRnuAa3/eOQmJdWBVfY6eqLl9V2oeKsraIA
 cGSvaahumTEUcCZ/bWeMZp7jntv4txS67GVchMpMoC9oRJw9rnilGDld6l1+HWdF
 oSMv3cGaCNwH
 =DymR
 -----END PGP SIGNATURE-----

Merge tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull thermal control fix from Rafael Wysocki:
 "Prevent the thermal core from flooding the kernel log with useless
  messages if thermal zone temperature can never be determined (or its
  sensor has failed permanently) and make it finally give up and disable
  defective thermal zones (Rafael Wysocki)"

* tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  thermal: core: Back off when polling thermal zones on errors
  thermal: trip: Split thermal_zone_device_set_mode()
This commit is contained in:
Linus Torvalds 2024-07-27 10:44:49 -07:00
commit 1fcaa5db40
2 changed files with 85 additions and 14 deletions

View File

@ -272,6 +272,44 @@ static int __init thermal_register_governors(void)
return ret;
}
/*
 * Switch @tz to @mode.
 *
 * When the zone supplies a change_mode() callback it is invoked first, and
 * tz->mode is updated only if that callback reports success (returns 0).
 * Zones without the callback simply get the new mode recorded.
 *
 * Returns 0 on success, or the nonzero value returned by the callback, in
 * which case tz->mode is left untouched.
 */
static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
					  enum thermal_device_mode mode)
{
	int ret = 0;

	if (tz->ops.change_mode)
		ret = tz->ops.change_mode(tz, mode);

	/* Record the new mode only when nothing objected to the change. */
	if (!ret)
		tz->mode = mode;

	return ret;
}
/*
 * Give up on a thermal zone whose temperature cannot be obtained.
 *
 * Logs an error, requests THERMAL_DEVICE_DISABLED for @tz, sends the
 * zone-disable notification and then walks the trip descriptors: if any
 * critical trip has a temperature above THERMAL_TEMP_INVALID, a single
 * critical-level message is logged to flag that protection has been lost.
 */
static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
{
	struct thermal_trip_desc *td;

	dev_err(&tz->device, "Unable to get temperature, disabling!\n");

	/*
	 * Only enabled thermal zones ever reach this function, so the
	 * current mode does not have to be checked before changing it.
	 */
	__thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
	thermal_notify_tz_disable(tz);

	for_each_trip_desc(tz, td) {
		/* Skip anything that is not a critical trip with a valid temperature. */
		if (td->trip.type != THERMAL_TRIP_CRITICAL ||
		    td->trip.temperature <= THERMAL_TEMP_INVALID)
			continue;

		/* One warning is enough, stop at the first match. */
		dev_crit(&tz->device,
			 "Disabled thermal zone with critical trip point\n");
		return;
	}
}
/*
* Zone update section: main control loop applied to each zone while monitoring
* in polling mode. The monitoring is done using a workqueue.
@ -292,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
cancel_delayed_work(&tz->poll_queue);
}
/*
 * Schedule another temperature check for @tz after a failed one.
 *
 * @error is the error code from the failed temperature read.
 *
 * -EAGAIN is retried after the default THERMAL_RECHECK_DELAY without touching
 * the back-off state. For any other error the check is rescheduled using the
 * zone's current recheck delay, which is then grown by half of its value (at
 * least one jiffy). Once the delay exceeds THERMAL_MAX_RECHECK_DELAY the
 * zone is disabled and the delay is reset to its default.
 */
static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
{
	unsigned long step;

	if (error == -EAGAIN) {
		thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
		return;
	}

	/*
	 * To keep the log quiet, report the failure only while the delay is
	 * still at its initial value. A second message follows if the zone
	 * ends up being disabled after repeated failures.
	 */
	if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
		dev_info(&tz->device, "Temperature check failed (%d)\n", error);

	thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);

	/* Back off: grow the delay by 50%, but always by at least one jiffy. */
	step = tz->recheck_delay_jiffies >> 1;
	if (!step)
		step = 1;

	tz->recheck_delay_jiffies += step;

	if (tz->recheck_delay_jiffies <= THERMAL_MAX_RECHECK_DELAY)
		return;

	thermal_zone_broken_disable(tz);

	/*
	 * Go back to the initial recheck delay so that the zone gets a fresh
	 * chance to recover if user space enables it again.
	 */
	tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
}
static void monitor_thermal_zone(struct thermal_zone_device *tz)
{
if (tz->mode != THERMAL_DEVICE_ENABLED)
@ -491,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
ret = __thermal_zone_get_temp(tz, &temp);
if (ret) {
if (ret != -EAGAIN)
dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
thermal_zone_recheck(tz, ret);
return;
} else if (temp <= THERMAL_TEMP_INVALID) {
/*
@ -506,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
goto monitor;
}
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
tz->last_temperature = tz->temperature;
tz->temperature = temp;
@ -540,7 +605,7 @@ monitor:
static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
enum thermal_device_mode mode)
{
int ret = 0;
int ret;
mutex_lock(&tz->lock);
@ -548,14 +613,15 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
if (mode == tz->mode) {
mutex_unlock(&tz->lock);
return ret;
return 0;
}
if (tz->ops.change_mode)
ret = tz->ops.change_mode(tz, mode);
ret = __thermal_zone_device_set_mode(tz, mode);
if (ret) {
mutex_unlock(&tz->lock);
if (!ret)
tz->mode = mode;
return ret;
}
__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
@ -566,7 +632,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
else
thermal_notify_tz_disable(tz);
return ret;
return 0;
}
int thermal_zone_device_enable(struct thermal_zone_device *tz)
@ -1445,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,
thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
/* sys I/F */
/* Add nodes that are always present via .groups */

View File

@ -67,6 +67,8 @@ struct thermal_governor {
* @polling_delay_jiffies: number of jiffies to wait between polls when
* checking whether trip points have been crossed (0 for
* interrupt driven systems)
* @recheck_delay_jiffies: delay after a failed attempt to determine the zone
* temperature before trying again
* @temperature: current temperature. This is only for core code,
* drivers should use thermal_zone_get_temp() to get the
* current temperature
@ -108,6 +110,7 @@ struct thermal_zone_device {
int num_trips;
unsigned long passive_delay_jiffies;
unsigned long polling_delay_jiffies;
unsigned long recheck_delay_jiffies;
int temperature;
int last_temperature;
int emul_temperature;
@ -137,10 +140,11 @@ struct thermal_zone_device {
#define THERMAL_TEMP_INIT INT_MIN
/*
* Default delay after a failing thermal zone temperature check before
* attempting to check it again.
* Default and maximum delay after a failed thermal zone temperature check
* before attempting to check it again (in jiffies).
*/
#define THERMAL_RECHECK_DELAY_MS 250
#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250)
#define THERMAL_MAX_RECHECK_DELAY (120 * HZ)
/* Default Thermal Governor */
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)