Skip to content

Commit e528be3

Browse files
committed
thermal: core: Allow thermal zones to tell the core to ignore them
The iwlwifi wireless driver registers a thermal zone that is only needed when the network interface handled by it is up and it wants that thermal zone to be effectively ignored by the core otherwise.

Before commit a8a2617 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") that could be achieved by returning an error code from the thermal zone's .get_temp() callback because the core hardly handled errors returned by it at all. However, commit a8a2617 made the core attempt to recover from the situation in which the temperature of a thermal zone cannot be determined due to errors returned by its .get_temp() and is always invalid from the core's perspective.

That was done because there are thermal zones in which .get_temp() returns errors to start with due to some difficulties related to the initialization ordering, but then it will start to produce valid temperature values at one point.

Unfortunately, the simple approach taken by commit a8a2617, which is to poll the thermal zone periodically until its .get_temp() callback starts to return valid temperature values, is at odds with the special thermal zone in iwlwifi in which .get_temp() may always return an error because its network interface may always be down. If that happens, every attempt to invoke the thermal zone's .get_temp() callback resulting in an error causes the thermal core to print a dev_warn() message to the kernel log, which is super-noisy.

To address this problem, make the core handle the case in which .get_temp() returns 0, but the temperature value returned by it is not actually valid, in a special way. Namely, make the core completely ignore the invalid temperature value coming from .get_temp() in that case, which requires folding update_temperature() into its caller and a few related changes.
On the iwlwifi side, modify iwl_mvm_tzone_get_temp() to return 0 and put THERMAL_TEMP_INVALID into the temperature return memory location instead of returning an error when the firmware is not running or it is not of the right type.

Also, to clearly separate the handling of invalid temperature values from the thermal zone initialization, introduce a special THERMAL_TEMP_INIT value specifically for the latter purpose.

Fixes: a8a2617 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid")
Closes: https://lore.kernel.org/linux-pm/20240715044527.GA1544@sol.localdomain/
Reported-by: Eric Biggers <ebiggers@kernel.org>
Reported-by: Stefan Lippers-Hollmann <s.l-h@gmx.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=201761
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Stefan Lippers-Hollmann <s.l-h@gmx.de>
Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://patch.msgid.link/4950004.31r3eYUQgx@rjwysocki.net
[ rjw: Rebased on top of the current mainline ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
1 parent 5183594 commit e528be3

File tree

4 files changed

+37
-29
lines changed

4 files changed

+37
-29
lines changed

drivers/net/wireless/intel/iwlwifi/mvm/tt.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,8 +621,14 @@ static int iwl_mvm_tzone_get_temp(struct thermal_zone_device *device,
621621
guard(mvm)(mvm);
622622

623623
if (!iwl_mvm_firmware_running(mvm) ||
624-
mvm->fwrt.cur_fw_img != IWL_UCODE_REGULAR)
625-
return -ENODATA;
624+
mvm->fwrt.cur_fw_img != IWL_UCODE_REGULAR) {
625+
/*
626+
* Tell the core that there is no valid temperature value to
627+
* return, but it need not worry about this.
628+
*/
629+
*temperature = THERMAL_TEMP_INVALID;
630+
return 0;
631+
}
626632

627633
ret = iwl_mvm_get_temp(mvm, &temp);
628634
if (ret)

drivers/thermal/thermal_core.c

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,6 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
300300
thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
301301
else if (tz->polling_delay_jiffies)
302302
thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
303-
else if (tz->temperature == THERMAL_TEMP_INVALID)
304-
thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
305303
}
306304

307305
static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz)
@@ -382,7 +380,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
382380
td->threshold = trip->temperature;
383381

384382
if (tz->last_temperature >= old_threshold &&
385-
tz->last_temperature != THERMAL_TEMP_INVALID) {
383+
tz->last_temperature != THERMAL_TEMP_INIT) {
386384
/*
387385
* Mitigation is under way, so it needs to stop if the zone
388386
* temperature falls below the low temperature of the trip.
@@ -417,27 +415,6 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
417415
}
418416
}
419417

420-
static void update_temperature(struct thermal_zone_device *tz)
421-
{
422-
int temp, ret;
423-
424-
ret = __thermal_zone_get_temp(tz, &temp);
425-
if (ret) {
426-
if (ret != -EAGAIN)
427-
dev_warn(&tz->device,
428-
"failed to read out thermal zone (%d)\n",
429-
ret);
430-
return;
431-
}
432-
433-
tz->last_temperature = tz->temperature;
434-
tz->temperature = temp;
435-
436-
trace_thermal_temperature(tz);
437-
438-
thermal_genl_sampling_temp(tz->id, temp);
439-
}
440-
441418
static void thermal_zone_device_check(struct work_struct *work)
442419
{
443420
struct thermal_zone_device *tz = container_of(work, struct
@@ -452,7 +429,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
452429

453430
INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check);
454431

455-
tz->temperature = THERMAL_TEMP_INVALID;
432+
tz->temperature = THERMAL_TEMP_INIT;
456433
tz->passive = 0;
457434
tz->prev_low_trip = -INT_MAX;
458435
tz->prev_high_trip = INT_MAX;
@@ -504,17 +481,37 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
504481
struct thermal_trip_desc *td;
505482
LIST_HEAD(way_down_list);
506483
LIST_HEAD(way_up_list);
484+
int temp, ret;
507485

508486
if (tz->suspended)
509487
return;
510488

511489
if (!thermal_zone_device_is_enabled(tz))
512490
return;
513491

514-
update_temperature(tz);
492+
ret = __thermal_zone_get_temp(tz, &temp);
493+
if (ret) {
494+
if (ret != -EAGAIN)
495+
dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
515496

516-
if (tz->temperature == THERMAL_TEMP_INVALID)
497+
thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
498+
return;
499+
} else if (temp <= THERMAL_TEMP_INVALID) {
500+
/*
501+
* Special case: No valid temperature value is available, but
502+
* the zone owner does not want the core to do anything about
503+
* it. Continue regular zone polling if needed, so that this
504+
* function can be called again, but skip everything else.
505+
*/
517506
goto monitor;
507+
}
508+
509+
tz->last_temperature = tz->temperature;
510+
tz->temperature = temp;
511+
512+
trace_thermal_temperature(tz);
513+
514+
thermal_genl_sampling_temp(tz->id, temp);
518515

519516
tz->notify_event = event;
520517

drivers/thermal/thermal_core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ struct thermal_zone_device {
133133
struct thermal_trip_desc trips[] __counted_by(num_trips);
134134
};
135135

136+
/* Initial thermal zone temperature. */
137+
#define THERMAL_TEMP_INIT INT_MIN
138+
136139
/*
137140
* Default delay after a failing thermal zone temperature check before
138141
* attempting to check it again.

drivers/thermal/thermal_helpers.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
163163
}
164164

165165
ret = __thermal_zone_get_temp(tz, temp);
166+
if (!ret && *temp <= THERMAL_TEMP_INVALID)
167+
ret = -ENODATA;
166168

167169
unlock:
168170
mutex_unlock(&tz->lock);

0 commit comments

Comments
 (0)