
Commit 27d1926

accel/ivpu: Improve recovery and reset support
- Synchronize job submission with reset/recovery using reset_lock
- Always print the recovery reason and call diagnose_failure()
- Don't allow autosuspend during recovery
- Prevent immediate autosuspend after reset/recovery
- Prevent force_recovery from issuing a TDR when the device is suspended
- Reset the VPU instead of triggering recovery after changing debugfs params

Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Wachowski, Karol <karol.wachowski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240122120945.1150728-4-jacek.lawrynowicz@linux.intel.com
1 parent 264b271 commit 27d1926
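The central piece of the change is the new reset_lock rw_semaphore in struct ivpu_pm_info: job submission takes it for reading while reset/recovery takes it for writing, so a submission can never race a half-reset device. A simplified, non-compilable sketch of that pattern as it appears in ivpu_submit_ioctl() and the reset callbacks in the hunks below (error handling and unrelated code elided):

/* Simplified sketch only -- mirrors the hunks below, not a drop-in copy. */

/* Submit path (ivpu_submit_ioctl): multiple submitters may hold the lock
 * for reading concurrently.
 */
down_read(&vdev->pm->reset_lock);
ret = ivpu_job_submit(job);
up_read(&vdev->pm->reset_lock);

/* Reset path (ivpu_pm_reset_prepare_cb / ivpu_pm_reset_done_cb): the writer
 * waits for in-flight submissions and blocks new ones until the device has
 * been reset and resumed.
 */
down_write(&vdev->pm->reset_lock);
ivpu_prepare_for_reset(vdev);
ivpu_hw_reset(vdev);
/* ... PCI function reset and resume happen between the two callbacks ... */
up_write(&vdev->pm->reset_lock);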

File tree: 8 files changed, +70 −48 lines


drivers/accel/ivpu/ivpu_debugfs.c

Lines changed: 16 additions & 4 deletions
@@ -102,7 +102,7 @@ static int reset_pending_show(struct seq_file *s, void *v)
 {
        struct ivpu_device *vdev = seq_to_ivpu(s);
 
-       seq_printf(s, "%d\n", atomic_read(&vdev->pm->in_reset));
+       seq_printf(s, "%d\n", atomic_read(&vdev->pm->reset_pending));
        return 0;
 }
 
@@ -130,7 +130,9 @@ dvfs_mode_fops_write(struct file *file, const char __user *user_buf, size_t size
 
        fw->dvfs_mode = dvfs_mode;
 
-       ivpu_pm_schedule_recovery(vdev);
+       ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
+       if (ret)
+               return ret;
 
        return size;
 }
@@ -190,7 +192,10 @@ fw_profiling_freq_fops_write(struct file *file, const char __user *user_buf,
                return ret;
 
        ivpu_hw_profiling_freq_drive(vdev, enable);
-       ivpu_pm_schedule_recovery(vdev);
+
+       ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
+       if (ret)
+               return ret;
 
        return size;
 }
@@ -301,11 +306,18 @@ static ssize_t
 ivpu_force_recovery_fn(struct file *file, const char __user *user_buf, size_t size, loff_t *pos)
 {
        struct ivpu_device *vdev = file->private_data;
+       int ret;
 
        if (!size)
                return -EINVAL;
 
-       ivpu_pm_schedule_recovery(vdev);
+       ret = ivpu_rpm_get(vdev);
+       if (ret)
+               return ret;
+
+       ivpu_pm_trigger_recovery(vdev, "debugfs");
+       flush_work(&vdev->pm->recovery_work);
+       ivpu_rpm_put(vdev);
        return size;
 }

drivers/accel/ivpu/ivpu_hw_37xx.c

Lines changed: 4 additions & 10 deletions
@@ -875,24 +875,18 @@ static void ivpu_hw_37xx_irq_disable(struct ivpu_device *vdev)
 
 static void ivpu_hw_37xx_irq_wdt_nce_handler(struct ivpu_device *vdev)
 {
-       ivpu_err_ratelimited(vdev, "WDT NCE irq\n");
-
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ");
 }
 
 static void ivpu_hw_37xx_irq_wdt_mss_handler(struct ivpu_device *vdev)
 {
-       ivpu_err_ratelimited(vdev, "WDT MSS irq\n");
-
        ivpu_hw_wdt_disable(vdev);
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ");
 }
 
 static void ivpu_hw_37xx_irq_noc_firewall_handler(struct ivpu_device *vdev)
 {
-       ivpu_err_ratelimited(vdev, "NOC Firewall irq\n");
-
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ");
 }
 
 /* Handler for IRQs from VPU core (irqV) */
@@ -970,7 +964,7 @@ static bool ivpu_hw_37xx_irqb_handler(struct ivpu_device *vdev, int irq)
        REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, status);
 
        if (schedule_recovery)
-               ivpu_pm_schedule_recovery(vdev);
+               ivpu_pm_trigger_recovery(vdev, "Buttress IRQ");
 
        return true;
 }

drivers/accel/ivpu/ivpu_hw_40xx.c

Lines changed: 4 additions & 4 deletions
@@ -1049,18 +1049,18 @@ static void ivpu_hw_40xx_irq_disable(struct ivpu_device *vdev)
 static void ivpu_hw_40xx_irq_wdt_nce_handler(struct ivpu_device *vdev)
 {
        /* TODO: For LNN hang consider engine reset instead of full recovery */
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ");
 }
 
 static void ivpu_hw_40xx_irq_wdt_mss_handler(struct ivpu_device *vdev)
 {
        ivpu_hw_wdt_disable(vdev);
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ");
 }
 
 static void ivpu_hw_40xx_irq_noc_firewall_handler(struct ivpu_device *vdev)
 {
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ");
 }
 
 /* Handler for IRQs from VPU core (irqV) */
@@ -1154,7 +1154,7 @@ static bool ivpu_hw_40xx_irqb_handler(struct ivpu_device *vdev, int irq)
        REGB_WR32(VPU_40XX_BUTTRESS_INTERRUPT_STAT, status);
 
        if (schedule_recovery)
-               ivpu_pm_schedule_recovery(vdev);
+               ivpu_pm_trigger_recovery(vdev, "Buttress IRQ");
 
        return true;
 }

drivers/accel/ivpu/ivpu_ipc.c

Lines changed: 2 additions & 4 deletions
@@ -343,10 +343,8 @@ int ivpu_ipc_send_receive_active(struct ivpu_device *vdev, struct vpu_jsm_msg *r
        hb_ret = ivpu_ipc_send_receive_internal(vdev, &hb_req, VPU_JSM_MSG_QUERY_ENGINE_HB_DONE,
                                                &hb_resp, VPU_IPC_CHAN_ASYNC_CMD,
                                                vdev->timeout.jsm);
-       if (hb_ret == -ETIMEDOUT) {
-               ivpu_hw_diagnose_failure(vdev);
-               ivpu_pm_schedule_recovery(vdev);
-       }
+       if (hb_ret == -ETIMEDOUT)
+               ivpu_pm_trigger_recovery(vdev, "IPC timeout");
 
        return ret;
 }

drivers/accel/ivpu/ivpu_job.c

Lines changed: 2 additions & 0 deletions
@@ -515,7 +515,9 @@ int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                goto err_destroy_job;
        }
 
+       down_read(&vdev->pm->reset_lock);
        ret = ivpu_job_submit(job);
+       up_read(&vdev->pm->reset_lock);
        if (ret)
                goto err_signal_fence;

drivers/accel/ivpu/ivpu_mmu.c

Lines changed: 6 additions & 8 deletions
@@ -887,7 +887,6 @@ static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev)
 
 void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
 {
-       bool schedule_recovery = false;
        u32 *event;
        u32 ssid;
 
@@ -897,14 +896,13 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
                ivpu_mmu_dump_event(vdev, event);
 
                ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, event[0]);
-               if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID)
-                       schedule_recovery = true;
-               else
-                       ivpu_mmu_user_context_mark_invalid(vdev, ssid);
-       }
+               if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) {
+                       ivpu_pm_trigger_recovery(vdev, "MMU event");
+                       return;
+               }
 
-       if (schedule_recovery)
-               ivpu_pm_schedule_recovery(vdev);
+               ivpu_mmu_user_context_mark_invalid(vdev, ssid);
+       }
 }
 
 void ivpu_mmu_evtq_dump(struct ivpu_device *vdev)

drivers/accel/ivpu/ivpu_pm.c

Lines changed: 32 additions & 16 deletions
@@ -112,6 +112,14 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
        char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
        int ret;
 
+       ivpu_err(vdev, "Recovering the VPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));
+
+       ret = pm_runtime_resume_and_get(vdev->drm.dev);
+       if (ret)
+               ivpu_err(vdev, "Failed to resume VPU: %d\n", ret);
+
+       ivpu_fw_log_dump(vdev);
+
 retry:
        ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
        if (ret == -EAGAIN && !drm_dev_is_unplugged(&vdev->drm)) {
@@ -123,11 +131,13 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
                ivpu_err(vdev, "Failed to reset VPU: %d\n", ret);
 
        kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
+       pm_runtime_mark_last_busy(vdev->drm.dev);
+       pm_runtime_put_autosuspend(vdev->drm.dev);
 }
 
-void ivpu_pm_schedule_recovery(struct ivpu_device *vdev)
+void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
 {
-       struct ivpu_pm_info *pm = vdev->pm;
+       ivpu_err(vdev, "Recovery triggered by %s\n", reason);
 
        if (ivpu_disable_recovery) {
                ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
@@ -139,23 +149,20 @@ void ivpu_pm_schedule_recovery(struct ivpu_device *vdev)
                return;
        }
 
-       /* Schedule recovery if it's not in progress */
-       if (atomic_cmpxchg(&pm->in_reset, 0, 1) == 0) {
-               ivpu_hw_irq_disable(vdev);
-               queue_work(system_long_wq, &pm->recovery_work);
+       /* Trigger recovery if it's not in progress */
+       if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
+               ivpu_hw_diagnose_failure(vdev);
+               ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
+               queue_work(system_long_wq, &vdev->pm->recovery_work);
        }
 }
 
 static void ivpu_job_timeout_work(struct work_struct *work)
 {
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;
-       unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
 
-       ivpu_err(vdev, "TDR detected, timeout %lu ms", timeout_ms);
-       ivpu_hw_diagnose_failure(vdev);
-
-       ivpu_pm_schedule_recovery(vdev);
+       ivpu_pm_trigger_recovery(vdev, "TDR");
 }
 
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
@@ -228,6 +235,9 @@ int ivpu_pm_runtime_suspend_cb(struct device *dev)
        bool hw_is_idle = true;
        int ret;
 
+       drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
+       drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));
+
        ivpu_dbg(vdev, PM, "Runtime suspend..\n");
 
        if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) {
@@ -310,11 +320,12 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
 {
        struct ivpu_device *vdev = pci_get_drvdata(pdev);
 
-       pm_runtime_get_sync(vdev->drm.dev);
-
        ivpu_dbg(vdev, PM, "Pre-reset..\n");
        atomic_inc(&vdev->pm->reset_counter);
-       atomic_set(&vdev->pm->in_reset, 1);
+       atomic_set(&vdev->pm->reset_pending, 1);
+
+       pm_runtime_get_sync(vdev->drm.dev);
+       down_write(&vdev->pm->reset_lock);
        ivpu_prepare_for_reset(vdev);
        ivpu_hw_reset(vdev);
        ivpu_pm_prepare_cold_boot(vdev);
@@ -331,9 +342,11 @@ void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
-       atomic_set(&vdev->pm->in_reset, 0);
+       up_write(&vdev->pm->reset_lock);
+       atomic_set(&vdev->pm->reset_pending, 0);
        ivpu_dbg(vdev, PM, "Post-reset done.\n");
 
+       pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_put_autosuspend(vdev->drm.dev);
 }
 
@@ -346,7 +359,10 @@ void ivpu_pm_init(struct ivpu_device *vdev)
        pm->vdev = vdev;
        pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT;
 
-       atomic_set(&pm->in_reset, 0);
+       init_rwsem(&pm->reset_lock);
+       atomic_set(&pm->reset_pending, 0);
+       atomic_set(&pm->reset_counter, 0);
+
        INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
        INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
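The recovery work above takes a runtime-PM reference before the PCI function reset, and both it and ivpu_pm_reset_done_cb() re-arm autosuspend before dropping their reference. An annotated sketch of that idiom, built only from the calls in the hunks above (illustrative, error paths elided):

ret = pm_runtime_resume_and_get(vdev->drm.dev); /* keep the device active while recovery/reset runs */
if (ret)
        ivpu_err(vdev, "Failed to resume VPU: %d\n", ret);

/* ... reset and recovery work ... */

pm_runtime_mark_last_busy(vdev->drm.dev);  /* restart the autosuspend delay ... */
pm_runtime_put_autosuspend(vdev->drm.dev); /* ... so the device is not suspended immediately after the reset */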

drivers/accel/ivpu/ivpu_pm.h

Lines changed: 4 additions & 2 deletions
@@ -6,6 +6,7 @@
 #ifndef __IVPU_PM_H__
 #define __IVPU_PM_H__
 
+#include <linux/rwsem.h>
 #include <linux/types.h>
 
 struct ivpu_device;
@@ -14,8 +15,9 @@ struct ivpu_pm_info {
        struct ivpu_device *vdev;
        struct delayed_work job_timeout_work;
        struct work_struct recovery_work;
-       atomic_t in_reset;
+       struct rw_semaphore reset_lock;
        atomic_t reset_counter;
+       atomic_t reset_pending;
        bool is_warmboot;
        u32 suspend_reschedule_counter;
 };
@@ -37,7 +39,7 @@ int __must_check ivpu_rpm_get(struct ivpu_device *vdev);
 int __must_check ivpu_rpm_get_if_active(struct ivpu_device *vdev);
 void ivpu_rpm_put(struct ivpu_device *vdev);
 
-void ivpu_pm_schedule_recovery(struct ivpu_device *vdev);
+void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason);
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev);
 void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev);
