Commit c959e90

Merge tag 'locking_urgent_for_v6.5_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fix from Borislav Petkov:

 - Fix an rtmutex race condition resulting from sharing of the sort key
   between the lock waiters tree and the PI chain tree (->pi_waiters) of
   a task, by giving each tree its own sort key

* tag 'locking_urgent_for_v6.5_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rtmutex: Fix task->pi_waiters integrity

2 parents d410b62 + f7853c3 commit c959e90
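
For orientation, here is a minimal sketch of the data-structure split this fix introduces, inferred from the usage in the diff below; the authoritative definitions live in one of the changed headers not shown in this excerpt (presumably kernel/locking/rtmutex_common.h), so the field order and the remaining members are assumptions:

/* Sketch only: each waiter now carries two independent rb-tree nodes,
 * each holding its own copy of the sort keys (prio, deadline). */
struct rt_waiter_node {
        struct rb_node  entry;          /* node in one rb-tree */
        int             prio;           /* sort key: priority */
        u64             deadline;       /* sort key: deadline, for DL tasks */
};

struct rt_mutex_waiter {
        struct rt_waiter_node   tree;           /* keyed into lock->waiters;
                                                   protected by lock->wait_lock */
        struct rt_waiter_node   pi_tree;        /* keyed into owner->pi_waiters;
                                                   protected by lock->wait_lock
                                                   and owner->pi_lock */
        struct task_struct      *task;
        struct rt_mutex_base    *lock;
        /* ... remaining members unchanged by this fix ... */
};

Previously a single prio/deadline pair (plus the tree_entry/pi_tree_entry nodes) served as the sort key for both lock->waiters and task->pi_waiters, so refreshing it for a lock-tree requeue under wait_lock alone could invalidate the ordering of an owner's pi_waiters tree. With the split, waiter_update_prio() refreshes only the ->tree copy, and waiter_clone_prio() publishes it into ->pi_tree under both wait_lock and the owner's pi_lock.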

File tree: 4 files changed, +155 -76 lines

kernel/locking/rtmutex.c

Lines changed: 114 additions & 56 deletions
@@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task)
         return prio;
 }

+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
 static __always_inline void
 waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
 {
-        waiter->prio = __waiter_prio(task);
-        waiter->deadline = task->dl.deadline;
+        lockdep_assert_held(&waiter->lock->wait_lock);
+        lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+        waiter->tree.prio = __waiter_prio(task);
+        waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+        lockdep_assert_held(&waiter->lock->wait_lock);
+        lockdep_assert_held(&task->pi_lock);
+        lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+        waiter->pi_tree.prio = waiter->tree.prio;
+        waiter->pi_tree.deadline = waiter->tree.deadline;
 }

 /*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
  */
+#define task_to_waiter_node(p) \
+        &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
 #define task_to_waiter(p) \
-        &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+        &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }

-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
-                                                struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+                                               struct rt_waiter_node *right)
 {
         if (left->prio < right->prio)
                 return 1;
@@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
         return 0;
 }

-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
-                                                 struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+                                                struct rt_waiter_node *right)
 {
         if (left->prio != right->prio)
                 return 0;
@@ -385,38 +407,38 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
                                   struct rt_mutex_waiter *top_waiter)
 {
-        if (rt_mutex_waiter_less(waiter, top_waiter))
+        if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
                 return true;

 #ifdef RT_MUTEX_BUILD_SPINLOCKS
         /*
          * Note that RT tasks are excluded from same priority (lateral)
          * steals to prevent the introduction of an unbounded latency.
          */
-        if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+        if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
                 return false;

-        return rt_mutex_waiter_equal(waiter, top_waiter);
+        return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
 #else
         return false;
 #endif
 }

 #define __node_2_waiter(node) \
-        rb_entry((node), struct rt_mutex_waiter, tree_entry)
+        rb_entry((node), struct rt_mutex_waiter, tree.entry)

 static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
 {
         struct rt_mutex_waiter *aw = __node_2_waiter(a);
         struct rt_mutex_waiter *bw = __node_2_waiter(b);

-        if (rt_mutex_waiter_less(aw, bw))
+        if (rt_waiter_node_less(&aw->tree, &bw->tree))
                 return 1;

         if (!build_ww_mutex())
                 return 0;

-        if (rt_mutex_waiter_less(bw, aw))
+        if (rt_waiter_node_less(&bw->tree, &aw->tree))
                 return 0;

         /* NOTE: relies on waiter->ww_ctx being set before insertion */
@@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
 static __always_inline void
 rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
 {
-        rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
+        lockdep_assert_held(&lock->wait_lock);
+
+        rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
 }

 static __always_inline void
 rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
 {
-        if (RB_EMPTY_NODE(&waiter->tree_entry))
+        lockdep_assert_held(&lock->wait_lock);
+
+        if (RB_EMPTY_NODE(&waiter->tree.entry))
                 return;

-        rb_erase_cached(&waiter->tree_entry, &lock->waiters);
-        RB_CLEAR_NODE(&waiter->tree_entry);
+        rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+        RB_CLEAR_NODE(&waiter->tree.entry);
 }

-#define __node_2_pi_waiter(node) \
-        rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+#define __node_2_rt_node(node) \
+        rb_entry((node), struct rt_waiter_node, entry)

-static __always_inline bool
-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
 {
-        return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+        return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
 }

 static __always_inline void
 rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 {
-        rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
+        lockdep_assert_held(&task->pi_lock);
+
+        rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
 }

 static __always_inline void
 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 {
-        if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+        lockdep_assert_held(&task->pi_lock);
+
+        if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
                 return;

-        rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
-        RB_CLEAR_NODE(&waiter->pi_tree_entry);
+        rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+        RB_CLEAR_NODE(&waiter->pi_tree.entry);
 }

-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+                                                 struct task_struct *p)
 {
         struct task_struct *pi_task = NULL;

+        lockdep_assert_held(&lock->wait_lock);
+        lockdep_assert(rt_mutex_owner(lock) == p);
         lockdep_assert_held(&p->pi_lock);

         if (task_has_pi_waiters(p))
@@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
  * Chain walk basics and protection scope
  *
  * [R]  refcount on task
- * [P]  task->pi_lock held
+ * [Pn] task->pi_lock held
  * [L]  rtmutex->wait_lock held
  *
+ * Normal locking order:
+ *
+ *   rtmutex->wait_lock
+ *     task->pi_lock
+ *
  * Step Description                             Protected by
  *      function arguments:
  *      @task                                   [R]
@@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
  * again:
  *        loop_sanity_check();
  * retry:
- * [1]    lock(task->pi_lock);                  [R] acquire [P]
- * [2]    waiter = task->pi_blocked_on;         [P]
- * [3]    check_exit_conditions_1();            [P]
- * [4]    lock = waiter->lock;                  [P]
- * [5]    if (!try_lock(lock->wait_lock)) {     [P] try to acquire [L]
- *          unlock(task->pi_lock);              release [P]
+ * [1]    lock(task->pi_lock);                  [R] acquire [P1]
+ * [2]    waiter = task->pi_blocked_on;         [P1]
+ * [3]    check_exit_conditions_1();            [P1]
+ * [4]    lock = waiter->lock;                  [P1]
+ * [5]    if (!try_lock(lock->wait_lock)) {     [P1] try to acquire [L]
+ *          unlock(task->pi_lock);              release [P1]
 *          goto retry;
 *        }
- * [6]    check_exit_conditions_2();            [P] + [L]
- * [7]    requeue_lock_waiter(lock, waiter);    [P] + [L]
- * [8]    unlock(task->pi_lock);                release [P]
+ * [6]    check_exit_conditions_2();            [P1] + [L]
+ * [7]    requeue_lock_waiter(lock, waiter);    [P1] + [L]
+ * [8]    unlock(task->pi_lock);                release [P1]
 *        put_task_struct(task);                release [R]
 * [9]    check_exit_conditions_3();            [L]
 * [10]   task = owner(lock);                   [L]
 *        get_task_struct(task);                [L] acquire [R]
- *        lock(task->pi_lock);                  [L] acquire [P]
- * [11]   requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12]   check_exit_conditions_4();            [P] + [L]
- * [13]   unlock(task->pi_lock);                release [P]
+ *        lock(task->pi_lock);                  [L] acquire [P2]
+ * [11]   requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12]   check_exit_conditions_4();            [P2] + [L]
+ * [13]   unlock(task->pi_lock);                release [P2]
 *        unlock(lock->wait_lock);              release [L]
 *        goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+ *
 */
 static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                                               enum rtmutex_chainwalk chwalk,
@@ -756,21 +798,26 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
          * enabled we continue, but stop the requeueing in the chain
          * walk.
          */
-        if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+        if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
                 if (!detect_deadlock)
                         goto out_unlock_pi;
                 else
                         requeue = false;
         }

         /*
-         * [4] Get the next lock
+         * [4] Get the next lock; per holding task->pi_lock we can't unblock
+         *     and guarantee @lock's existence.
          */
         lock = waiter->lock;
         /*
          * [5] We need to trylock here as we are holding task->pi_lock,
          * which is the reverse lock order versus the other rtmutex
          * operations.
+         *
+         * Per the above, holding task->pi_lock guarantees lock exists, so
+         * inverting this lock order is infeasible from a life-time
+         * perspective.
          */
         if (!raw_spin_trylock(&lock->wait_lock)) {
                 raw_spin_unlock_irq(&task->pi_lock);
@@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
          * or
          *
          * DL CBS enforcement advancing the effective deadline.
-         *
-         * Even though pi_waiters also uses these fields, and that tree is only
-         * updated in [11], we can do this here, since we hold [L], which
-         * serializes all pi_waiters access and rb_erase() does not care about
-         * the values of the node being removed.
          */
         waiter_update_prio(waiter, task);

         rt_mutex_enqueue(lock, waiter);

-        /* [8] Release the task */
+        /*
+         * [8] Release the (blocking) task in preparation for
+         * taking the owner task in [10].
+         *
+         * Since we hold lock->waiter_lock, task cannot unblock, even if we
+         * release task->pi_lock.
+         */
         raw_spin_unlock(&task->pi_lock);
         put_task_struct(task);

@@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                 return 0;
         }

-        /* [10] Grab the next task, i.e. the owner of @lock */
+        /*
+         * [10] Grab the next task, i.e. the owner of @lock
+         *
+         * Per holding lock->wait_lock and checking for !owner above, there
+         * must be an owner and it cannot go away.
+         */
         task = get_task_struct(rt_mutex_owner(lock));
         raw_spin_lock(&task->pi_lock);

@@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                  * and adjust the priority of the owner.
                  */
                 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+                waiter_clone_prio(waiter, task);
                 rt_mutex_enqueue_pi(task, waiter);
-                rt_mutex_adjust_prio(task);
+                rt_mutex_adjust_prio(lock, task);

         } else if (prerequeue_top_waiter == waiter) {
                 /*
@@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                  */
                 rt_mutex_dequeue_pi(task, waiter);
                 waiter = rt_mutex_top_waiter(lock);
+                waiter_clone_prio(waiter, task);
                 rt_mutex_enqueue_pi(task, waiter);
-                rt_mutex_adjust_prio(task);
+                rt_mutex_adjust_prio(lock, task);
         } else {
                 /*
                  * Nothing changed. No need to do any priority
@@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
         waiter->task = task;
         waiter->lock = lock;
         waiter_update_prio(waiter, task);
+        waiter_clone_prio(waiter, task);

         /* Get the top priority waiter on the lock */
         if (rt_mutex_has_waiters(lock))
@@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
                 rt_mutex_dequeue_pi(owner, top_waiter);
                 rt_mutex_enqueue_pi(owner, waiter);

-                rt_mutex_adjust_prio(owner);
+                rt_mutex_adjust_prio(lock, owner);
                 if (owner->pi_blocked_on)
                         chain_walk = 1;
         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
 {
         struct rt_mutex_waiter *waiter;

+        lockdep_assert_held(&lock->wait_lock);
+
         raw_spin_lock(&current->pi_lock);

         waiter = rt_mutex_top_waiter(lock);
@@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
          * task unblocks.
          */
         rt_mutex_dequeue_pi(current, waiter);
-        rt_mutex_adjust_prio(current);
+        rt_mutex_adjust_prio(lock, current);

         /*
          * As we are waking up the top waiter, and the waiter stays
@@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
         if (rt_mutex_has_waiters(lock))
                 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));

-        rt_mutex_adjust_prio(owner);
+        rt_mutex_adjust_prio(lock, owner);

         /* Store the lock on which owner is blocked or NULL */
         next_lock = task_blocked_on_lock(owner);
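
As an aside, here is a compact userspace model of the two-copy scheme above (assumption: simplified, non-kernel code; names and values are illustrative only), showing that a lock-tree key update (step [7] in the chain-walk documentation) leaves the pi-tree copy untouched until the explicit clone at step [11]:

#include <stdint.h>
#include <stdio.h>

struct waiter_node {
        int      prio;
        uint64_t deadline;
};

struct waiter {
        struct waiter_node tree;        /* models waiter->tree: updated under wait_lock */
        struct waiter_node pi_tree;     /* models waiter->pi_tree: updated under wait_lock + pi_lock */
};

/* mirrors waiter_update_prio(): refresh only the lock-tree copy */
static void update_prio(struct waiter *w, int prio, uint64_t deadline)
{
        w->tree.prio = prio;
        w->tree.deadline = deadline;
}

/* mirrors waiter_clone_prio(): publish the keys to the pi-tree copy */
static void clone_prio(struct waiter *w)
{
        w->pi_tree = w->tree;
}

int main(void)
{
        struct waiter w = { .tree = { 30, 0 }, .pi_tree = { 30, 0 } };

        update_prio(&w, 10, 0);         /* the lock-tree requeue sees the new key ... */
        printf("tree.prio=%d pi_tree.prio=%d\n", w.tree.prio, w.pi_tree.prio);

        clone_prio(&w);                 /* ... the pi-tree only after the explicit clone */
        printf("tree.prio=%d pi_tree.prio=%d\n", w.tree.prio, w.pi_tree.prio);
        return 0;
}

The owner's pi_waiters tree is therefore never keyed by values that can change under wait_lock alone, which is the integrity property the patch restores.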

kernel/locking/rtmutex_api.c

Lines changed: 1 addition & 1 deletion
@@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
         raw_spin_lock_irqsave(&task->pi_lock, flags);

         waiter = task->pi_blocked_on;
-        if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+        if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                 return;
         }
