
Commit f3a7822

Merge tag 'io_uring-5.17-2022-01-21' of git://git.kernel.dk/linux-block
Pull io_uring fixes from Jens Axboe:

 - Fix the io_uring POLLFREE handling, similarly to how it was done for
   aio (Pavel)

 - Remove (now) unused function (Jiapeng)

 - Small series fixing an issue with work cancelations. A window exists
   where work isn't locatable in the pending list, and isn't active in a
   worker yet either. (me)

* tag 'io_uring-5.17-2022-01-21' of git://git.kernel.dk/linux-block:
  io-wq: delete dead lock shuffling code
  io_uring: perform poll removal even if async work removal is successful
  io-wq: add intermediate work step between pending list and active work
  io-wq: perform both unstarted and started work cancelations in one go
  io-wq: invoke work cancelation with wqe->lock held
  io-wq: make io_worker lock a raw spinlock
  io-wq: remove useless 'work' argument to __io_worker_busy()
  io_uring: fix UAF due to missing POLLFREE handling
  io_uring: Remove unused function req_ref_put
2 parents 1f40caa + 73031f7 commit f3a7822
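
For context on the cancelation series: the window being closed sits between a worker popping a work item off the pending list and installing it as its current work. During that window, a canceler that scans the pending list and each worker's current work finds nothing. The sketch below is a minimal userspace analogue in C (pthreads; worker_pick_work(), struct worker and the other names are hypothetical, not the io-wq API) showing why the item is published as next_work before the list lock is dropped.

/*
 * Minimal userspace analogue of the io-wq "intermediate work step"
 * (pthreads; hypothetical names, not the kernel API).
 */
#include <pthread.h>
#include <stddef.h>

struct work {
	struct work *next;
};

struct worker {
	pthread_mutex_t lock;        /* protects cur_work and next_work */
	struct work *cur_work;       /* item currently being processed */
	struct work *next_work;      /* picked off the list, not yet active */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *pending;         /* pending work list */

/*
 * Pop an item and publish it as next_work *before* releasing list_lock.
 * A canceler that holds list_lock while scanning both the pending list
 * and each worker's cur_work/next_work can then never miss the item,
 * no matter where it is in the handoff.
 */
static struct work *worker_pick_work(struct worker *w)
{
	struct work *work;

	pthread_mutex_lock(&list_lock);
	work = pending;
	if (work) {
		pending = work->next;
		pthread_mutex_lock(&w->lock);
		w->next_work = work;     /* discoverable by cancelers now */
		pthread_mutex_unlock(&w->lock);
	}
	pthread_mutex_unlock(&list_lock);
	return work;
}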

2 files changed: +116 −54 lines changed

fs/io-wq.c

Lines changed: 56 additions & 35 deletions
@@ -48,7 +48,8 @@ struct io_worker {
 	struct io_wqe *wqe;
 
 	struct io_wq_work *cur_work;
-	spinlock_t lock;
+	struct io_wq_work *next_work;
+	raw_spinlock_t lock;
 
 	struct completion ref_done;
 
@@ -405,8 +406,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
  * Worker will start processing some work. Move it to the busy list, if
  * it's currently on the freelist
  */
-static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
-			     struct io_wq_work *work)
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
 	__must_hold(wqe->lock)
 {
 	if (worker->flags & IO_WORKER_F_FREE) {
@@ -529,9 +529,10 @@ static void io_assign_current_work(struct io_worker *worker,
 		cond_resched();
 	}
 
-	spin_lock(&worker->lock);
+	raw_spin_lock(&worker->lock);
 	worker->cur_work = work;
-	spin_unlock(&worker->lock);
+	worker->next_work = NULL;
+	raw_spin_unlock(&worker->lock);
 }
 
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
@@ -546,7 +547,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 	do {
 		struct io_wq_work *work;
-get_next:
+
 		/*
 		 * If we got some work, mark us as busy. If we didn't, but
 		 * the list isn't empty, it means we stalled on hashed work.
@@ -555,9 +556,20 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * clear the stalled flag.
 		 */
 		work = io_get_next_work(acct, worker);
-		if (work)
-			__io_worker_busy(wqe, worker, work);
-
+		if (work) {
+			__io_worker_busy(wqe, worker);
+
+			/*
+			 * Make sure cancelation can find this, even before
+			 * it becomes the active work. That avoids a window
+			 * where the work has been removed from our general
+			 * work list, but isn't yet discoverable as the
+			 * current work item for this worker.
+			 */
+			raw_spin_lock(&worker->lock);
+			worker->next_work = work;
+			raw_spin_unlock(&worker->lock);
+		}
 		raw_spin_unlock(&wqe->lock);
 		if (!work)
 			break;
@@ -594,11 +606,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 			spin_unlock_irq(&wq->hash->wait.lock);
 			if (wq_has_sleeper(&wq->hash->wait))
 				wake_up(&wq->hash->wait);
-			raw_spin_lock(&wqe->lock);
-			/* skip unnecessary unlock-lock wqe->lock */
-			if (!work)
-				goto get_next;
-			raw_spin_unlock(&wqe->lock);
 		}
 	} while (work);
 
@@ -815,7 +822,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 
 	refcount_set(&worker->ref, 1);
 	worker->wqe = wqe;
-	spin_lock_init(&worker->lock);
+	raw_spin_lock_init(&worker->lock);
 	init_completion(&worker->ref_done);
 
 	if (index == IO_WQ_ACCT_BOUND)
@@ -973,6 +980,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
 	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
 }
 
+static bool __io_wq_worker_cancel(struct io_worker *worker,
+				  struct io_cb_cancel_data *match,
+				  struct io_wq_work *work)
+{
+	if (work && match->fn(work, match->data)) {
+		work->flags |= IO_WQ_WORK_CANCEL;
+		set_notify_signal(worker->task);
+		return true;
+	}
+
+	return false;
+}
+
 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 {
 	struct io_cb_cancel_data *match = data;
@@ -981,13 +1001,11 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	 * Hold the lock to avoid ->cur_work going out of scope, caller
 	 * may dereference the passed in work.
 	 */
-	spin_lock(&worker->lock);
-	if (worker->cur_work &&
-	    match->fn(worker->cur_work, match->data)) {
-		set_notify_signal(worker->task);
+	raw_spin_lock(&worker->lock);
+	if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
+	    __io_wq_worker_cancel(worker, match, worker->next_work))
 		match->nr_running++;
-	}
-	spin_unlock(&worker->lock);
+	raw_spin_unlock(&worker->lock);
 
 	return match->nr_running && !match->cancel_all;
 }
@@ -1039,17 +1057,16 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 {
 	int i;
 retry:
-	raw_spin_lock(&wqe->lock);
 	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 		struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
 
 		if (io_acct_cancel_pending_work(wqe, acct, match)) {
+			raw_spin_lock(&wqe->lock);
 			if (match->cancel_all)
 				goto retry;
-			return;
+			break;
 		}
 	}
-	raw_spin_unlock(&wqe->lock);
 }
 
 static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1074,25 +1091,27 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	 * First check pending list, if we're lucky we can just remove it
 	 * from there. CANCEL_OK means that the work is returned as-new,
 	 * no completion will be posted for it.
-	 */
-	for_each_node(node) {
-		struct io_wqe *wqe = wq->wqes[node];
-
-		io_wqe_cancel_pending_work(wqe, &match);
-		if (match.nr_pending && !match.cancel_all)
-			return IO_WQ_CANCEL_OK;
-	}
-
-	/*
-	 * Now check if a free (going busy) or busy worker has the work
+	 *
+	 * Then check if a free (going busy) or busy worker has the work
 	 * currently running. If we find it there, we'll return CANCEL_RUNNING
 	 * as an indication that we attempt to signal cancellation. The
 	 * completion will run normally in this case.
+	 *
+	 * Do both of these while holding the wqe->lock, to ensure that
+	 * we'll find a work item regardless of state.
 	 */
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		raw_spin_lock(&wqe->lock);
+		io_wqe_cancel_pending_work(wqe, &match);
+		if (match.nr_pending && !match.cancel_all) {
+			raw_spin_unlock(&wqe->lock);
+			return IO_WQ_CANCEL_OK;
+		}
+
 		io_wqe_cancel_running_work(wqe, &match);
+		raw_spin_unlock(&wqe->lock);
 		if (match.nr_running && !match.cancel_all)
 			return IO_WQ_CANCEL_RUNNING;
 	}
@@ -1263,7 +1282,9 @@ static void io_wq_destroy(struct io_wq *wq)
 			.fn		= io_wq_work_match_all,
 			.cancel_all	= true,
 		};
+		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_pending_work(wqe, &match);
+		raw_spin_unlock(&wqe->lock);
 		free_cpumask_var(wqe->cpu_mask);
 		kfree(wqe);
 	}
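
As a companion to the diff above, here is a hedged userspace sketch of the canceler side (pthreads; cancel_by_id(), struct wqe and the enum values are hypothetical stand-ins, not the io-wq interface). It mirrors the new io_wq_cancel_cb() flow: hold the per-node lock across both the pending-list pass and the per-worker pass, matching against both cur_work and next_work, so a work item cannot slip between the two phases.

/* Hedged userspace sketch of "perform both cancelations in one go". */
#include <pthread.h>
#include <stddef.h>

struct work { struct work *next; int id; };

struct worker {
	struct worker *next;
	pthread_mutex_t lock;        /* protects cur_work/next_work */
	struct work *cur_work;
	struct work *next_work;
};

struct wqe {
	pthread_mutex_t lock;        /* protects pending and workers */
	struct work *pending;
	struct worker *workers;
};

enum cancel_result { CANCEL_OK, CANCEL_RUNNING, CANCEL_NOTFOUND };

static enum cancel_result cancel_by_id(struct wqe *wqe, int id)
{
	enum cancel_result res = CANCEL_NOTFOUND;
	struct work **pp;
	struct worker *w;

	pthread_mutex_lock(&wqe->lock);

	/* Pass 1: still pending? Unlink it; no completion will run. */
	for (pp = &wqe->pending; *pp; pp = &(*pp)->next) {
		if ((*pp)->id == id) {
			*pp = (*pp)->next;
			res = CANCEL_OK;
			goto out;
		}
	}

	/*
	 * Pass 2: already handed to a worker? Check both the active and
	 * the about-to-become-active item. Because wqe->lock is held the
	 * whole time, nothing can move off the pending list between the
	 * two passes.
	 */
	for (w = wqe->workers; w && res == CANCEL_NOTFOUND; w = w->next) {
		pthread_mutex_lock(&w->lock);
		if ((w->cur_work && w->cur_work->id == id) ||
		    (w->next_work && w->next_work->id == id))
			res = CANCEL_RUNNING;    /* signal the worker here */
		pthread_mutex_unlock(&w->lock);
	}
out:
	pthread_mutex_unlock(&wqe->lock);
	return res;
}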

fs/io_uring.c

Lines changed: 60 additions & 19 deletions
@@ -1192,12 +1192,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req)
 	return atomic_dec_and_test(&req->refs);
 }
 
-static inline void req_ref_put(struct io_kiocb *req)
-{
-	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
-	WARN_ON_ONCE(req_ref_put_and_test(req));
-}
-
 static inline void req_ref_get(struct io_kiocb *req)
 {
 	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
@@ -5468,23 +5462,41 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 
 static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
 {
-	struct wait_queue_head *head = poll->head;
+	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 
-	spin_lock_irq(&head->lock);
-	list_del_init(&poll->wait.entry);
-	poll->head = NULL;
-	spin_unlock_irq(&head->lock);
+	if (head) {
+		spin_lock_irq(&head->lock);
+		list_del_init(&poll->wait.entry);
+		poll->head = NULL;
+		spin_unlock_irq(&head->lock);
+	}
 }
 
 static void io_poll_remove_entries(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = io_poll_get_single(req);
 	struct io_poll_iocb *poll_double = io_poll_get_double(req);
 
-	if (poll->head)
-		io_poll_remove_entry(poll);
-	if (poll_double && poll_double->head)
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	io_poll_remove_entry(poll);
+	if (poll_double)
 		io_poll_remove_entry(poll_double);
+	rcu_read_unlock();
 }
 
 /*
@@ -5624,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 						 wait);
 	__poll_t mask = key_to_poll(key);
 
+	if (unlikely(mask & POLLFREE)) {
+		io_poll_mark_cancelled(req);
+		/* we have to kick tw in case it's not already */
+		io_poll_execute(req, 0);
+
+		/*
+		 * If the waitqueue is being freed early but someone is already
+		 * holds ownership over it, we have to tear down the request as
+		 * best we can. That means immediately removing the request from
+		 * its waitqueue and preventing all further accesses to the
+		 * waitqueue via the request.
+		 */
+		list_del_init(&poll->wait.entry);
+
+		/*
+		 * Careful: this *must* be the last step, since as soon
+		 * as req->head is NULL'ed out, the request can be
+		 * completed and freed, since aio_poll_complete_work()
+		 * will no longer need to take the waitqueue lock.
+		 */
+		smp_store_release(&poll->head, NULL);
+		return 1;
+	}
+
 	/* for instances that support it check for an event match first */
 	if (mask && !(mask & poll->events))
 		return 0;
@@ -6350,16 +6386,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
 	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
 
 	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
-	if (ret != -ENOENT)
-		return ret;
+	/*
+	 * Fall-through even for -EALREADY, as we may have poll armed
+	 * that need unarming.
+	 */
+	if (!ret)
+		return 0;
 
 	spin_lock(&ctx->completion_lock);
+	ret = io_poll_cancel(ctx, sqe_addr, false);
+	if (ret != -ENOENT)
+		goto out;
+
 	spin_lock_irq(&ctx->timeout_lock);
 	ret = io_timeout_cancel(ctx, sqe_addr);
 	spin_unlock_irq(&ctx->timeout_lock);
-	if (ret != -ENOENT)
-		goto out;
-	ret = io_poll_cancel(ctx, sqe_addr, false);
 out:
 	spin_unlock(&ctx->completion_lock);
 	return ret;
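
The POLLFREE fix pairs smp_store_release() in io_poll_wake() with smp_load_acquire() in io_poll_remove_entry(): once ->head is observed as NULL, the entry is never touched again, and RCU keeps the waitqueue memory alive while the lock is held. Below is a simplified userspace analogue of just the release/acquire half, using C11 atomics and hypothetical names (waker_pollfree(), remove_entry()); the RCU grace-period protection is only noted in comments.

/*
 * Simplified userspace analogue of the poll->head release/acquire pairing
 * (C11 atomics; hypothetical names, not the io_uring code).
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stddef.h>

struct waitqueue {
	pthread_mutex_t lock;
	/* wait entries would hang off here */
};

struct poll_entry {
	_Atomic(struct waitqueue *) head;    /* NULL once detached */
};

/*
 * Wakeup/teardown path: detach the entry first, then publish head == NULL
 * as the very last step. The release ordering makes the detach visible
 * before the NULL store, so the other side never locks a stale queue.
 */
static void waker_pollfree(struct poll_entry *p)
{
	/* ... unlinking of the wait entry would go here ... */
	atomic_store_explicit(&p->head, NULL, memory_order_release);
}

/*
 * Removal path: only take the waitqueue lock if head is still non-NULL.
 * The acquire load pairs with the release store above. In the kernel an
 * RCU read-side section additionally keeps *head alive while the lock is
 * held; plain userspace code would need its own lifetime guarantee.
 */
static void remove_entry(struct poll_entry *p)
{
	struct waitqueue *head =
		atomic_load_explicit(&p->head, memory_order_acquire);

	if (head) {
		pthread_mutex_lock(&head->lock);
		/* ... unlink the entry and clear p->head under the lock ... */
		pthread_mutex_unlock(&head->lock);
	}
}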
