Skip to content

Commit f7c1b0e

Browse files
committed
thermal: core: Back off when polling thermal zones on errors
Commit a8a2617 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") introduced a polling mechanism by which the thermal core attempts to get a valid temperature value for thermal zones where the .get_temp() callback returns errors to start with (for example, due to initialization ordering woes). However, this polling is carried out periodically ad infinitum and every iteration of it causes a message to be printed to the kernel log which means a lot of log noise on systems where there are thermal zones that never get ready for some reason. It is also not really useful to continuously poll thermal zones that never respond. To address this, modify the thermal core to increase the delay between consecutive thermal zone temperature checks after every check that fails until it reaches a certain maximum value. At that point, the thermal zone in question will be disabled, but user space will be able to reenable it if it believes that the failure is transient. Also change the code to print messages regarding failed temperature checks to the kernel log only twice, once when the thermal zone's .get_temp() callback returns an error for the first time and once when disabling the given thermal zone. In addition, a dev_crit() message will be printed at that point if the given thermal zone contains a critical trip point to notify the system operator about the situation. Fixes: a8a2617 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") Link: https://lore.kernel.org/linux-acpi/CAGnHSE=RyPK++UG0-wAtVKgeJxe0uzFYgLxm+RUOKKoQquW=Ow@mail.gmail.com/ Reported-by: Tom Yan <tom.ty89@gmail.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Link: https://patch.msgid.link/2962033.e9J7NaK4W3@rjwysocki.net
1 parent e5f9889 commit f7c1b0e

File tree

2 files changed

+61
-7
lines changed

2 files changed

+61
-7
lines changed

drivers/thermal/thermal_core.c

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,28 @@ static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
288288
return 0;
289289
}
290290

291+
/*
 * thermal_zone_broken_disable - disable a zone whose temperature is unreadable
 * @tz: thermal zone to disable
 *
 * Logs an error, switches @tz to THERMAL_DEVICE_DISABLED and sends the
 * "zone disabled" notification.  If @tz has at least one critical trip
 * whose temperature is above THERMAL_TEMP_INVALID, a single dev_crit()
 * message is printed as well.
 */
static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
292+
{
293+
struct thermal_trip_desc *td;
294+
295+
dev_err(&tz->device, "Unable to get temperature, disabling!\n");
296+
/*
297+
* This function only runs for enabled thermal zones, so no need to
298+
* check for the current mode.
299+
*/
300+
__thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
301+
thermal_notify_tz_disable(tz);
302+
303+
/* Report the first valid critical trip found and stop; one message is enough. */
for_each_trip_desc(tz, td) {
304+
if (td->trip.type == THERMAL_TRIP_CRITICAL &&
305+
td->trip.temperature > THERMAL_TEMP_INVALID) {
306+
dev_crit(&tz->device,
307+
"Disabled thermal zone with critical trip point\n");
308+
return;
309+
}
310+
}
311+
}
312+
291313
/*
292314
* Zone update section: main control loop applied to each zone while monitoring
293315
* in polling mode. The monitoring is done using a workqueue.
@@ -308,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
308330
cancel_delayed_work(&tz->poll_queue);
309331
}
310332

333+
/*
 * thermal_zone_recheck - schedule another temperature check after a failure
 * @tz: thermal zone whose temperature check failed
 * @error: error code returned by the failed check
 *
 * For -EAGAIN the zone is simply polled again after THERMAL_RECHECK_DELAY.
 * For any other error the next poll is scheduled using the zone's current
 * recheck delay, and that delay is then increased for the following attempt.
 * Once the increased delay exceeds THERMAL_MAX_RECHECK_DELAY the zone is
 * disabled and its recheck delay is reset to THERMAL_RECHECK_DELAY.
 */
static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
334+
{
335+
/* -EAGAIN: retry at the base delay, without logging or growing the delay. */
if (error == -EAGAIN) {
336+
thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
337+
return;
338+
}
339+
340+
/*
341+
* Print the message once to reduce log noise. It will be followed by
342+
* another one if the temperature cannot be determined after multiple
343+
* attempts.
344+
*/
345+
if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
346+
dev_info(&tz->device, "Temperature check failed (%d)\n", error);
347+
348+
/* The next poll uses the delay as it is now; the increase below affects later retries. */
thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);
349+
350+
/* Grow the delay by half of its current value, but always by at least one jiffy. */
tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL);
351+
if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) {
352+
thermal_zone_broken_disable(tz);
353+
/*
354+
* Restore the original recheck delay value to allow the thermal
355+
* zone to try to recover when it is reenabled by user space.
356+
*/
357+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
358+
}
359+
}
360+
311361
static void monitor_thermal_zone(struct thermal_zone_device *tz)
312362
{
313363
if (tz->mode != THERMAL_DEVICE_ENABLED)
@@ -507,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
507557

508558
ret = __thermal_zone_get_temp(tz, &temp);
509559
if (ret) {
510-
if (ret != -EAGAIN)
511-
dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
512-
513-
thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
560+
thermal_zone_recheck(tz, ret);
514561
return;
515562
} else if (temp <= THERMAL_TEMP_INVALID) {
516563
/*
@@ -522,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
522569
goto monitor;
523570
}
524571

572+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
573+
525574
tz->last_temperature = tz->temperature;
526575
tz->temperature = temp;
527576

@@ -1462,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,
14621511

14631512
thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
14641513
thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
1514+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
14651515

14661516
/* sys I/F */
14671517
/* Add nodes that are always present via .groups */

drivers/thermal/thermal_core.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ struct thermal_governor {
6767
* @polling_delay_jiffies: number of jiffies to wait between polls when
6868
* checking whether trip points have been crossed (0 for
6969
* interrupt driven systems)
70+
* @recheck_delay_jiffies: delay after a failed attempt to determine the zone
71+
* temperature before trying again
7072
* @temperature: current temperature. This is only for core code,
7173
* drivers should use thermal_zone_get_temp() to get the
7274
* current temperature
@@ -108,6 +110,7 @@ struct thermal_zone_device {
108110
int num_trips;
109111
unsigned long passive_delay_jiffies;
110112
unsigned long polling_delay_jiffies;
113+
unsigned long recheck_delay_jiffies;
111114
int temperature;
112115
int last_temperature;
113116
int emul_temperature;
@@ -137,10 +140,11 @@ struct thermal_zone_device {
137140
#define THERMAL_TEMP_INIT INT_MIN
138141

139142
/*
140-
* Default delay after a failing thermal zone temperature check before
141-
* attempting to check it again.
143+
* Default and maximum delay after a failed thermal zone temperature check
144+
* before attempting to check it again (in jiffies).
142145
*/
143-
#define THERMAL_RECHECK_DELAY_MS 250
146+
#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250)
147+
#define THERMAL_MAX_RECHECK_DELAY (120 * HZ)
144148

145149
/* Default Thermal Governor */
146150
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)

0 commit comments

Comments
 (0)