Skip to content

Commit 1fcaa5d

Browse files
committed
Merge tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull thermal control fix from Rafael Wysocki:

"Prevent the thermal core from flooding the kernel log with useless messages if thermal zone temperature can never be determined (or its sensor has failed permanently) and make it finally give up and disable defective thermal zones (Rafael Wysocki)"

* tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  - thermal: core: Back off when polling thermal zones on errors
  - thermal: trip: Split thermal_zone_device_set_mode()
2 parents 7b0acd9 + f7c1b0e commit 1fcaa5d

File tree

2 files changed

+85
-14
lines changed

2 files changed

+85
-14
lines changed

drivers/thermal/thermal_core.c

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,44 @@ static int __init thermal_register_governors(void)
272272
return ret;
273273
}
274274

275+
/*
 * Switch @tz to @mode.
 *
 * If the zone provides a change_mode callback, it is invoked first; a nonzero
 * return value from it is passed back to the caller and tz->mode is left
 * unchanged. On success (or when no callback is set) tz->mode is updated and
 * 0 is returned.
 *
 * No locking is done here; thermal_zone_device_set_mode() calls this with
 * tz->lock held.
 */
static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
276+
enum thermal_device_mode mode)
277+
{
278+
if (tz->ops.change_mode) {
279+
int ret;
280+
281+
ret = tz->ops.change_mode(tz, mode);
282+
if (ret)
283+
return ret;
284+
}
285+
286+
tz->mode = mode;
287+
288+
return 0;
289+
}
290+
291+
/*
 * Disable a thermal zone whose temperature cannot be determined.
 *
 * Logs an error, switches @tz to THERMAL_DEVICE_DISABLED and calls
 * thermal_notify_tz_disable(). The return value of
 * __thermal_zone_device_set_mode() is not checked, so the notification is
 * sent even if the zone's change_mode callback rejected the change.
 *
 * If any trip of the zone is a critical one with a temperature above
 * THERMAL_TEMP_INVALID, one additional critical-level message is logged.
 */
static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
292+
{
293+
struct thermal_trip_desc *td;
294+
295+
dev_err(&tz->device, "Unable to get temperature, disabling!\n");
296+
/*
297+
* This function only runs for enabled thermal zones, so no need to
298+
* check for the current mode.
299+
*/
300+
__thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
301+
thermal_notify_tz_disable(tz);
302+
303+
for_each_trip_desc(tz, td) {
304+
if (td->trip.type == THERMAL_TRIP_CRITICAL &&
305+
td->trip.temperature > THERMAL_TEMP_INVALID) {
306+
dev_crit(&tz->device,
307+
"Disabled thermal zone with critical trip point\n");
308+
return; /* one warning is enough; stop at the first match */
309+
}
310+
}
311+
}
312+
275313
/*
276314
* Zone update section: main control loop applied to each zone while monitoring
277315
* in polling mode. The monitoring is done using a workqueue.
@@ -292,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
292330
cancel_delayed_work(&tz->poll_queue);
293331
}
294332

333+
/*
 * Schedule another temperature check for @tz after a failed one.
 *
 * @error is the error code describing the failure.
 *
 * For -EAGAIN the zone is polled again after THERMAL_RECHECK_DELAY and the
 * backoff state is left untouched.
 *
 * For any other error the zone is polled again after the current
 * tz->recheck_delay_jiffies, which is then increased by half of its value
 * (but by at least one jiffy). Once the increased delay exceeds
 * THERMAL_MAX_RECHECK_DELAY, the zone is disabled through
 * thermal_zone_broken_disable() and the delay is reset to
 * THERMAL_RECHECK_DELAY.
 */
static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
334+
{
335+
if (error == -EAGAIN) {
336+
thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
337+
return;
338+
}
339+
340+
/*
341+
* Print the message once to reduce log noise. It will be followed by
342+
* another one if the temperature cannot be determined after multiple
343+
* attempts.
344+
*/
345+
if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
346+
dev_info(&tz->device, "Temperature check failed (%d)\n", error);
347+
348+
thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);
349+
350+
tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL); /* grow by 50%, at least 1 jiffy */
351+
if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) {
352+
thermal_zone_broken_disable(tz);
353+
/*
354+
* Restore the original recheck delay value to allow the thermal
355+
* zone to try to recover when it is reenabled by user space.
356+
*/
357+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
358+
}
359+
}
360+
295361
static void monitor_thermal_zone(struct thermal_zone_device *tz)
296362
{
297363
if (tz->mode != THERMAL_DEVICE_ENABLED)
@@ -491,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
491557

492558
ret = __thermal_zone_get_temp(tz, &temp);
493559
if (ret) {
494-
if (ret != -EAGAIN)
495-
dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
496-
497-
thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
560+
thermal_zone_recheck(tz, ret);
498561
return;
499562
} else if (temp <= THERMAL_TEMP_INVALID) {
500563
/*
@@ -506,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
506569
goto monitor;
507570
}
508571

572+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
573+
509574
tz->last_temperature = tz->temperature;
510575
tz->temperature = temp;
511576

@@ -540,22 +605,23 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
540605
static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
541606
enum thermal_device_mode mode)
542607
{
543-
int ret = 0;
608+
int ret;
544609

545610
mutex_lock(&tz->lock);
546611

547612
/* do nothing if mode isn't changing */
548613
if (mode == tz->mode) {
549614
mutex_unlock(&tz->lock);
550615

551-
return ret;
616+
return 0;
552617
}
553618

554-
if (tz->ops.change_mode)
555-
ret = tz->ops.change_mode(tz, mode);
619+
ret = __thermal_zone_device_set_mode(tz, mode);
620+
if (ret) {
621+
mutex_unlock(&tz->lock);
556622

557-
if (!ret)
558-
tz->mode = mode;
623+
return ret;
624+
}
559625

560626
__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
561627

@@ -566,7 +632,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
566632
else
567633
thermal_notify_tz_disable(tz);
568634

569-
return ret;
635+
return 0;
570636
}
571637

572638
int thermal_zone_device_enable(struct thermal_zone_device *tz)
@@ -1445,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,
14451511

14461512
thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
14471513
thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
1514+
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
14481515

14491516
/* sys I/F */
14501517
/* Add nodes that are always present via .groups */

drivers/thermal/thermal_core.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ struct thermal_governor {
6767
* @polling_delay_jiffies: number of jiffies to wait between polls when
6868
* checking whether trip points have been crossed (0 for
6969
* interrupt driven systems)
70+
* @recheck_delay_jiffies: delay after a failed attempt to determine the zone
71+
* temperature before trying again
7072
* @temperature: current temperature. This is only for core code,
7173
* drivers should use thermal_zone_get_temp() to get the
7274
* current temperature
@@ -108,6 +110,7 @@ struct thermal_zone_device {
108110
int num_trips;
109111
unsigned long passive_delay_jiffies;
110112
unsigned long polling_delay_jiffies;
113+
unsigned long recheck_delay_jiffies;
111114
int temperature;
112115
int last_temperature;
113116
int emul_temperature;
@@ -137,10 +140,11 @@ struct thermal_zone_device {
137140
#define THERMAL_TEMP_INIT INT_MIN
138141

139142
/*
140-
* Default delay after a failing thermal zone temperature check before
141-
* attempting to check it again.
143+
* Default and maximum delay after a failed thermal zone temperature check
144+
* before attempting to check it again (in jiffies).
142145
*/
143-
#define THERMAL_RECHECK_DELAY_MS 250
146+
#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250)
147+
#define THERMAL_MAX_RECHECK_DELAY (120 * HZ)
144148

145149
/* Default Thermal Governor */
146150
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)

0 commit comments

Comments
 (0)