
Commit d039f38

Merge tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking fixes from Thomas Gleixner:
 "Two regression fixes for reader writer semaphores:

   - Plug a race in the lock handoff which is caused by inconsistency of
     the reader and writer path and can lead to corruption of the
     underlying counter.

   - down_read_trylock() is suboptimal when the lock is contended and
     multiple readers trylock concurrently. That's due to the initial
     value being read non-atomically which results in at least two
     compare exchange loops. Making the initial readout atomic reduces
     this significantly. With 40 readers by 11% in a benchmark which
     enforces contention on mmap_sem"

* tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rwsem: Optimize down_read_trylock() under highly contended case
  locking/rwsem: Make handoff bit handling more consistent
2 parents f8132d6 + 14c2404 commit d039f38
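As a rough illustration of the down_read_trylock() change described above: the old loop seeds tmp with the unlocked value, so on a contended lock the first compare-exchange is guaranteed to fail and only serves to fetch the real counter, while the new loop seeds tmp from an atomic read and normally needs a single compare-exchange even when other readers already hold the lock. The sketch below is a minimal userspace model using C11 atomics, not the kernel code; READER_BIAS and READ_FAILED_MASK are illustrative stand-ins for the kernel's RWSEM_* constants, and the memory ordering here is stricter than the kernel's acquire cmpxchg.

/* sketch_trylock.c -- illustrative userspace model, not kernel code */
#include <stdatomic.h>
#include <stdbool.h>

#define READER_BIAS       0x100L  /* illustrative: one reader's contribution */
#define READ_FAILED_MASK  0x007L  /* illustrative: writer/handoff/wait bits */

/* Old shape: start from "unlocked" (0), so under contention the first
 * compare-exchange always fails and only loads the real value into tmp. */
static bool trylock_read_old(atomic_long *count)
{
	long tmp = 0;
	do {
		if (atomic_compare_exchange_strong(count, &tmp,
						   tmp + READER_BIAS))
			return true;	/* read lock acquired */
	} while (!(tmp & READ_FAILED_MASK));
	return false;
}

/* New shape: read the counter atomically first, so when only readers
 * hold the lock a single compare-exchange normally suffices. */
static bool trylock_read_new(atomic_long *count)
{
	long tmp = atomic_load(count);
	while (!(tmp & READ_FAILED_MASK)) {
		if (atomic_compare_exchange_strong(count, &tmp,
						   tmp + READER_BIAS))
			return true;	/* read lock acquired */
	}
	return false;
}

int main(void)
{
	atomic_long count = 0;

	trylock_read_old(&count);	/* first reader:  count == READER_BIAS */
	trylock_read_new(&count);	/* second reader: count == 2 * READER_BIAS */
	return 0;
}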

File tree

1 file changed: +89 -93 lines changed


kernel/locking/rwsem.c

Lines changed: 89 additions & 93 deletions
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
 	struct task_struct *task;
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };

-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100

+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,

 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
+
+	oldcount = atomic_long_read(&sem->count);
 	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
 		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
 	}

-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
-
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);

@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;

 	lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);

-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}

 		new = count;

 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;

 			new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}

+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);

 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 	return sem;

 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);

 	/* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;

 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);

 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);

 		/*
@@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}

 		raw_spin_unlock_irq(&sem->wait_lock);

+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF) {
+		if (waiter.handoff_set) {
 			enum owner_state owner_state;

 			preempt_disable();
@@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 				goto trylock_again;
 		}

-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;

 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }

@@ -1249,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)

 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);

-	/*
-	 * Optimize for the case when the rwsem is not locked at all.
-	 */
-	tmp = RWSEM_UNLOCKED_VALUE;
-	do {
+	tmp = atomic_long_read(&sem->count);
+	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
 		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
-					tmp + RWSEM_READER_BIAS)) {
+						    tmp + RWSEM_READER_BIAS)) {
 			rwsem_set_reader_owned(sem);
 			return 1;
 		}
-	} while (!(tmp & RWSEM_READ_FAILED_MASK));
+	}
 	return 0;
 }

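The handoff fix above hinges on funneling every waiter-removal path through rwsem_del_waiter(), so RWSEM_FLAG_WAITERS and RWSEM_FLAG_HANDOFF are always cleared together once the wait list drains, instead of each unwind path clearing its own subset. The sketch below is a minimal userspace model of that single-exit-helper pattern, using C11 atomics and a hand-rolled circular list; the struct layout and flag values are illustrative, not the kernel's types or constants.

/* sketch_del_waiter.c -- illustrative userspace model, not kernel code */
#include <stdatomic.h>

#define FLAG_WAITERS  0x2L	/* illustrative */
#define FLAG_HANDOFF  0x4L	/* illustrative */

struct waiter {
	struct waiter *prev, *next;
};

struct sem {
	atomic_long count;
	struct waiter wait_list;	/* circular list head */
};

/* Every unwind path calls this one helper, so none of them can leave a
 * stale HANDOFF bit behind when the last waiter goes away. */
static void del_waiter(struct sem *sem, struct waiter *w)
{
	/* unlink; the real code holds sem->wait_lock here */
	w->prev->next = w->next;
	w->next->prev = w->prev;

	if (sem->wait_list.next != &sem->wait_list)
		return;			/* still have waiters */

	/* list drained: clear both flags in a single atomic op */
	atomic_fetch_and(&sem->count, ~(FLAG_WAITERS | FLAG_HANDOFF));
}

int main(void)
{
	struct sem s = { .count = FLAG_WAITERS | FLAG_HANDOFF };
	struct waiter w;

	/* single waiter on the list */
	s.wait_list.next = s.wait_list.prev = &w;
	w.next = w.prev = &s.wait_list;

	del_waiter(&s, &w);	/* list drains; both flags are cleared */
	return 0;
}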