Commit ddf21bd

Merge tag 'iov_iter.3-5.15-2021-09-17' of git://git.kernel.dk/linux-block
Pull io_uring iov_iter retry fixes from Jens Axboe:
 "This adds a helper to save/restore iov_iter state, and modifies io_uring
  to use it.

  After that is done, we can now kill the iter->truncated addition that we
  added for this release. The io_uring change is being overly cautious with
  the save/restore/advance, but better safe than sorry and we can always
  improve that and reduce the overhead if it proves to be of concern. The
  only case to be worried about in this regard is huge IO, where iteration
  can take a while to iterate segments.

  I spent some time writing test cases, and expanded the coverage quite a
  bit from the last posting of this. liburing carries this regression test
  case now:

    https://git.kernel.dk/cgit/liburing/tree/test/file-verify.c

  which exercises all of this. It now also supports provided buffers, and
  explicitly tests for end-of-file/device truncation as well.

  On top of that, Pavel sanitized the IOPOLL retry path to follow the exact
  same pattern as normal IO"

* tag 'iov_iter.3-5.15-2021-09-17' of git://git.kernel.dk/linux-block:
  io_uring: move iopoll reissue into regular IO path
  Revert "iov_iter: track truncated size"
  io_uring: use iov_iter state save/restore helpers
  iov_iter: add helper to save iov_iter state
2 parents 0bc7eb0 + b66ceaf commit ddf21bd
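The pattern the message describes is small: iov_iter_save_state() snapshots an iterator's count, iov_offset and nr_segs before I/O is issued, and iov_iter_restore() puts the iterator back if the lower layers consumed or truncated it on a short or failed attempt. A minimal sketch of that usage follows; submit_read() is a hypothetical stand-in for the actual ->read_iter() call, while the iov_iter_* helpers are the real ones this series adds and uses.

static ssize_t read_once(struct kiocb *kiocb, struct iov_iter *iter)
{
        struct iov_iter_state state;
        ssize_t ret;

        /* snapshot count/iov_offset/nr_segs before issuing anything */
        iov_iter_save_state(iter, &state);

        ret = submit_read(kiocb, iter);         /* hypothetical submission */
        if (ret == -EAGAIN) {
                /*
                 * Some paths consume or truncate the iter even on an error
                 * return; don't trust it, rewind to the snapshot so the
                 * request can simply be retried from the start.
                 */
                iov_iter_restore(iter, &state);
        }
        return ret;
}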

3 files changed: +128, -45 lines changed


fs/io_uring.c

Lines changed: 76 additions & 40 deletions
@@ -712,6 +712,7 @@ struct io_async_rw {
         struct iovec fast_iov[UIO_FASTIOV];
         const struct iovec *free_iovec;
         struct iov_iter iter;
+        struct iov_iter_state iter_state;
         size_t bytes_done;
         struct wait_page_queue wpq;
 };
@@ -735,7 +736,6 @@ enum {
         REQ_F_BUFFER_SELECTED_BIT,
         REQ_F_COMPLETE_INLINE_BIT,
         REQ_F_REISSUE_BIT,
-        REQ_F_DONT_REISSUE_BIT,
         REQ_F_CREDS_BIT,
         REQ_F_REFCOUNT_BIT,
         REQ_F_ARM_LTIMEOUT_BIT,
@@ -782,8 +782,6 @@ enum {
         REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
         /* caller should reissue async */
         REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
-        /* don't attempt request reissue, see io_rw_reissue() */
-        REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
         /* supports async reads */
         REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
         /* supports async writes */
@@ -2444,13 +2442,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
                 list_del(&req->inflight_entry);

-                if (READ_ONCE(req->result) == -EAGAIN &&
-                    !(req->flags & REQ_F_DONT_REISSUE)) {
-                        req->iopoll_completed = 0;
-                        io_req_task_queue_reissue(req);
-                        continue;
-                }
-
                 __io_cqring_fill_event(ctx, req->user_data, req->result,
                                        io_put_rw_kbuf(req));
                 (*nr_events)++;
@@ -2613,8 +2604,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)

         if (!rw)
                 return !io_req_prep_async(req);
-        /* may have left rw->iter inconsistent on -EIOCBQUEUED */
-        iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
+        iov_iter_restore(&rw->iter, &rw->iter_state);
         return true;
 }

@@ -2714,10 +2704,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
         if (kiocb->ki_flags & IOCB_WRITE)
                 kiocb_end_write(req);
         if (unlikely(res != req->result)) {
-                if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
-                    io_resubmit_prep(req))) {
-                        req_set_fail(req);
-                        req->flags |= REQ_F_DONT_REISSUE;
+                if (res == -EAGAIN && io_rw_should_reissue(req)) {
+                        req->flags |= REQ_F_REISSUE;
+                        return;
                 }
         }

@@ -2937,7 +2926,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
         struct io_async_rw *io = req->async_data;
-        bool check_reissue = kiocb->ki_complete == io_complete_rw;

         /* add previously done IO, if any */
         if (io && io->bytes_done > 0) {
@@ -2949,19 +2937,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,

         if (req->flags & REQ_F_CUR_POS)
                 req->file->f_pos = kiocb->ki_pos;
-        if (ret >= 0 && check_reissue)
+        if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
                 __io_complete_rw(req, ret, 0, issue_flags);
         else
                 io_rw_done(kiocb, ret);

-        if (check_reissue && (req->flags & REQ_F_REISSUE)) {
+        if (req->flags & REQ_F_REISSUE) {
                 req->flags &= ~REQ_F_REISSUE;
                 if (io_resubmit_prep(req)) {
                         io_req_task_queue_reissue(req);
                 } else {
+                        unsigned int cflags = io_put_rw_kbuf(req);
+                        struct io_ring_ctx *ctx = req->ctx;
+
                         req_set_fail(req);
-                        __io_req_complete(req, issue_flags, ret,
-                                          io_put_rw_kbuf(req));
+                        if (issue_flags & IO_URING_F_NONBLOCK) {
+                                mutex_lock(&ctx->uring_lock);
+                                __io_req_complete(req, issue_flags, ret, cflags);
+                                mutex_unlock(&ctx->uring_lock);
+                        } else {
+                                __io_req_complete(req, issue_flags, ret, cflags);
+                        }
                 }
         }
 }
@@ -3324,12 +3320,17 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
         if (!force && !io_op_defs[req->opcode].needs_async_setup)
                 return 0;
         if (!req->async_data) {
+                struct io_async_rw *iorw;
+
                 if (io_alloc_async_data(req)) {
                         kfree(iovec);
                         return -ENOMEM;
                 }

                 io_req_map_rw(req, iovec, fast_iov, iter);
+                iorw = req->async_data;
+                /* we've copied and mapped the iter, ensure state is saved */
+                iov_iter_save_state(&iorw->iter, &iorw->iter_state);
         }
         return 0;
 }
@@ -3348,6 +3349,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
         iorw->free_iovec = iov;
         if (iov)
                 req->flags |= REQ_F_NEED_CLEANUP;
+        iov_iter_save_state(&iorw->iter, &iorw->iter_state);
         return 0;
 }

@@ -3451,19 +3453,28 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
         struct kiocb *kiocb = &req->rw.kiocb;
         struct iov_iter __iter, *iter = &__iter;
         struct io_async_rw *rw = req->async_data;
-        ssize_t io_size, ret, ret2;
         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+        struct iov_iter_state __state, *state;
+        ssize_t ret, ret2;

         if (rw) {
                 iter = &rw->iter;
+                state = &rw->iter_state;
+                /*
+                 * We come here from an earlier attempt, restore our state to
+                 * match in case it doesn't. It's cheap enough that we don't
+                 * need to make this conditional.
+                 */
+                iov_iter_restore(iter, state);
                 iovec = NULL;
         } else {
                 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
                 if (ret < 0)
                         return ret;
+                state = &__state;
+                iov_iter_save_state(iter, state);
         }
-        io_size = iov_iter_count(iter);
-        req->result = io_size;
+        req->result = iov_iter_count(iter);

         /* Ensure we clear previously set non-block flag */
         if (!force_nonblock)
@@ -3477,7 +3488,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 return ret ?: -EAGAIN;
         }

-        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
+        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
         if (unlikely(ret)) {
                 kfree(iovec);
                 return ret;
@@ -3493,30 +3504,49 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 /* no retry on NONBLOCK nor RWF_NOWAIT */
                 if (req->flags & REQ_F_NOWAIT)
                         goto done;
-                /* some cases will consume bytes even on error returns */
-                iov_iter_reexpand(iter, iter->count + iter->truncated);
-                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                 ret = 0;
         } else if (ret == -EIOCBQUEUED) {
                 goto out_free;
-        } else if (ret <= 0 || ret == io_size || !force_nonblock ||
+        } else if (ret <= 0 || ret == req->result || !force_nonblock ||
                    (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
                 /* read all, failed, already did sync or don't want to retry */
                 goto done;
         }

+        /*
+         * Don't depend on the iter state matching what was consumed, or being
+         * untouched in case of error. Restore it and we'll advance it
+         * manually if we need to.
+         */
+        iov_iter_restore(iter, state);
+
         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
         if (ret2)
                 return ret2;

         iovec = NULL;
         rw = req->async_data;
-        /* now use our persistent iterator, if we aren't already */
-        iter = &rw->iter;
+        /*
+         * Now use our persistent iterator and state, if we aren't already.
+         * We've restored and mapped the iter to match.
+         */
+        if (iter != &rw->iter) {
+                iter = &rw->iter;
+                state = &rw->iter_state;
+        }

         do {
-                io_size -= ret;
+                /*
+                 * We end up here because of a partial read, either from
+                 * above or inside this loop. Advance the iter by the bytes
+                 * that were consumed.
+                 */
+                iov_iter_advance(iter, ret);
+                if (!iov_iter_count(iter))
+                        break;
                 rw->bytes_done += ret;
+                iov_iter_save_state(iter, state);
+
                 /* if we can retry, do so with the callbacks armed */
                 if (!io_rw_should_retry(req)) {
                         kiocb->ki_flags &= ~IOCB_WAITQ;
@@ -3534,7 +3564,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                         return 0;
                 /* we got some bytes, but not all. retry. */
                 kiocb->ki_flags &= ~IOCB_WAITQ;
-        } while (ret > 0 && ret < io_size);
+                iov_iter_restore(iter, state);
+        } while (ret > 0);
 done:
         kiocb_done(kiocb, ret, issue_flags);
 out_free:
@@ -3557,19 +3588,24 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
         struct kiocb *kiocb = &req->rw.kiocb;
         struct iov_iter __iter, *iter = &__iter;
         struct io_async_rw *rw = req->async_data;
-        ssize_t ret, ret2, io_size;
         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+        struct iov_iter_state __state, *state;
+        ssize_t ret, ret2;

         if (rw) {
                 iter = &rw->iter;
+                state = &rw->iter_state;
+                iov_iter_restore(iter, state);
                 iovec = NULL;
         } else {
                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
                 if (ret < 0)
                         return ret;
+                state = &__state;
+                iov_iter_save_state(iter, state);
         }
-        io_size = iov_iter_count(iter);
-        req->result = io_size;
+        req->result = iov_iter_count(iter);
+        ret2 = 0;

         /* Ensure we clear previously set non-block flag */
         if (!force_nonblock)
@@ -3586,7 +3622,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
             (req->flags & REQ_F_ISREG))
                 goto copy_iov;

-        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
+        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
         if (unlikely(ret))
                 goto out_free;

@@ -3633,9 +3669,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
                 kiocb_done(kiocb, ret2, issue_flags);
         } else {
 copy_iov:
-                /* some cases will consume bytes even on error returns */
-                iov_iter_reexpand(iter, iter->count + iter->truncated);
-                iov_iter_revert(iter, io_size - iov_iter_count(iter));
+                iov_iter_restore(iter, state);
+                if (ret2 > 0)
+                        iov_iter_advance(iter, ret2);
                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                 return ret ?: -EAGAIN;
         }
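The io_read() retry loop above turns the one-shot snapshot into a rolling one: after a partial read it restores the saved state, advances by exactly the bytes that completed, and saves again before the next attempt. Because the snapshot also records the pre-truncation count, restoring it covers what the now-reverted iter->truncated field used to track. A condensed, hedged restatement of that loop, where blocking_read() is a hypothetical stand-in for the retried ->read_iter() call and the io_kiocb bookkeeping is omitted:

static ssize_t read_remainder(struct kiocb *kiocb, struct iov_iter *iter,
                              struct iov_iter_state *state, ssize_t ret)
{
        ssize_t done = 0;

        do {
                /* rewind to the last snapshot, then account for the bytes
                 * that actually completed in this attempt */
                iov_iter_restore(iter, state);
                iov_iter_advance(iter, ret);
                done += ret;
                if (!iov_iter_count(iter))
                        break;
                /* remember the new position before trying the rest */
                iov_iter_save_state(iter, state);
                ret = blocking_read(kiocb, iter);       /* hypothetical */
        } while (ret > 0);

        return done;
}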

include/linux/uio.h

Lines changed: 16 additions & 5 deletions
@@ -27,6 +27,12 @@ enum iter_type {
         ITER_DISCARD,
 };

+struct iov_iter_state {
+        size_t iov_offset;
+        size_t count;
+        unsigned long nr_segs;
+};
+
 struct iov_iter {
         u8 iter_type;
         bool data_source;
@@ -47,14 +53,21 @@ struct iov_iter {
                 };
                 loff_t xarray_start;
         };
-        size_t truncated;
 };

 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
         return i->iter_type;
 }

+static inline void iov_iter_save_state(struct iov_iter *iter,
+                                       struct iov_iter_state *state)
+{
+        state->iov_offset = iter->iov_offset;
+        state->count = iter->count;
+        state->nr_segs = iter->nr_segs;
+}
+
 static inline bool iter_is_iovec(const struct iov_iter *i)
 {
         return iov_iter_type(i) == ITER_IOVEC;
@@ -233,6 +246,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
                         size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
+void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

@@ -255,10 +269,8 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
          * conversion in assignement is by definition greater than all
          * values of size_t, including old i->count.
          */
-        if (i->count > count) {
-                i->truncated += i->count - count;
+        if (i->count > count)
                 i->count = count;
-        }
 }

 /*
@@ -267,7 +279,6 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
  */
 static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
-        i->truncated -= count - i->count;
         i->count = count;
 }

lib/iov_iter.c

Lines changed: 36 additions & 0 deletions
@@ -1972,3 +1972,39 @@ int import_single_range(int rw, void __user *buf, size_t len,
         return 0;
 }
 EXPORT_SYMBOL(import_single_range);
+
+/**
+ * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
+ *     iov_iter_save_state() was called.
+ *
+ * @i: &struct iov_iter to restore
+ * @state: state to restore from
+ *
+ * Used after iov_iter_save_state() to bring restore @i, if operations may
+ * have advanced it.
+ *
+ * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
+ */
+void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
+{
+        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
+            !iov_iter_is_kvec(i))
+                return;
+        i->iov_offset = state->iov_offset;
+        i->count = state->count;
+        /*
+         * For the *vec iters, nr_segs + iov is constant - if we increment
+         * the vec, then we also decrement the nr_segs count. Hence we don't
+         * need to track both of these, just one is enough and we can deduct
+         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
+         * size, so we can just increment the iov pointer as they are unionzed.
+         * ITER_BVEC _may_ be the same size on some archs, but on others it is
+         * not. Be safe and handle it separately.
+         */
+        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
+        if (iov_iter_is_bvec(i))
+                i->bvec -= state->nr_segs - i->nr_segs;
+        else
+                i->iov -= state->nr_segs - i->nr_segs;
+        i->nr_segs = state->nr_segs;
+}
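The pointer arithmetic at the end of iov_iter_restore() leans on the invariant spelled out in the comment: for the *vec iterators, advancing past a segment bumps the iov/bvec pointer and drops nr_segs by the same amount, so the saved nr_segs alone is enough to walk the pointer back. A small illustration, assuming kernel context; iov_iter_kvec(), iov_iter_advance() and the helpers above are the standard API, only the demo function itself is hypothetical:

/* illustration only, not part of the commit */
static void iov_iter_state_demo(void)
{
        char buf1[8], buf2[8];
        struct kvec vec[2] = {
                { .iov_base = buf1, .iov_len = sizeof(buf1) },
                { .iov_base = buf2, .iov_len = sizeof(buf2) },
        };
        struct iov_iter iter;
        struct iov_iter_state state;

        iov_iter_kvec(&iter, READ, vec, 2, sizeof(buf1) + sizeof(buf2));
        iov_iter_save_state(&iter, &state);     /* points at vec, nr_segs == 2 */

        iov_iter_advance(&iter, 10);            /* all of vec[0] plus 2 bytes */
        /* now the iter points at vec + 1, nr_segs == 1, iov_offset == 2 */

        iov_iter_restore(&iter, &state);
        /* iov -= (2 - 1) walks back to vec; count, offset, nr_segs restored */
}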
