Skip to content

Commit 9a0f53e

Browse files
committed
Merge tag 'csd-lock.2023.10.23a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu
Pull CSD lock update from Paul McKenney: "This adds a kernel boot parameter that causes the kernel to panic if one of the smp_call_function() APIs is stalled for more than the specified duration. This is useful in deployments in which a clean panic is preferable to an indefinite stall" * tag 'csd-lock.2023.10.23a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu: smp,csd: Throw an error if a CSD lock is stuck for too long
2 parents 6750f0d + 94b3f0b commit 9a0f53e

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5864,6 +5864,13 @@
58645864
This feature may be more efficiently disabled
58655865
using the csdlock_debug- kernel parameter.
58665866

5867+
smp.panic_on_ipistall= [KNL]
5868+
If a CSD-lock acquisition stalls for more than
5869+
the specified number of milliseconds, panic the
5870+
system. By default, let CSD-lock acquisitions
5871+
take as long as they take. Specifying 300,000
5872+
for this value provides a 5-minute timeout.
5873+
58675874
smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
58685875
smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
58695876
smsc-ircc2.ircc_sir= [HW] SIR base I/O port

kernel/smp.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
170170

171171
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
172172
module_param(csd_lock_timeout, ulong, 0444);
173+
static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */
174+
module_param(panic_on_ipistall, int, 0444);
173175

174176
static atomic_t csd_bug_count = ATOMIC_INIT(0);
175177

@@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
230232
}
231233

232234
ts2 = sched_clock();
235+
/* How long since we last checked for a stuck CSD lock. */
233236
ts_delta = ts2 - *ts1;
234237
if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
235238
return false;
@@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
243246
else
244247
cpux = cpu;
245248
cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
249+
/* How long since this CSD lock was stuck. */
250+
ts_delta = ts2 - ts0;
246251
pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
247-
firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
252+
firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
248253
cpu, csd->func, csd->info);
254+
/*
255+
* If the CSD lock is still stuck after panic_on_ipistall ms (e.g. 5 minutes), it is unlikely
256+
* to become unstuck. Use a signed comparison to avoid triggering
257+
* on underflows when the TSC is out of sync between sockets.
258+
*/
259+
BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
249260
if (cpu_cur_csd && csd != cpu_cur_csd) {
250261
pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
251262
*bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),

0 commit comments

Comments
 (0)