
Commit b1011b2

Merge tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux
Pull io_uring fixes from Jens Axboe:

 - Correctly cap iov_iter->nr_segs for imports of registered buffers,
   both kbuf and normal ones. Three cleanups to make it saner first,
   then two fixes for each of the buffer types. This fixes a
   performance regression where partial buffer usage doesn't trim the
   tail number of segments, leading the block layer to iterate the IOs
   to check if it needs splitting.

 - Two patches tweaking the newly introduced zero-copy rx API, mostly
   to keep the API consistent once we add multiple interface queues
   per ring support in the 6.16 release.

 - zc rx unmapping fix for a dead device

* tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux:
  io_uring/zcrx: fix late dma unmap for a dead dev
  io_uring/rsrc: ensure segments counts are correct on kbuf buffers
  io_uring/rsrc: send exact nr_segs for fixed buffer
  io_uring/rsrc: refactor io_import_fixed
  io_uring/rsrc: separate kbuf offset adjustments
  io_uring/rsrc: don't skip offset calculation
  io_uring/zcrx: add pp to ifq conversion helper
  io_uring/zcrx: return ifq id to the user
2 parents: fc96b23 + f12ecf5

File tree

  include/uapi/linux/io_uring.h
  io_uring/rsrc.c
  io_uring/zcrx.c
  io_uring/zcrx.h

4 files changed: +79 -55 lines

include/uapi/linux/io_uring.h

Lines changed: 3 additions & 1 deletion

@@ -1010,7 +1010,9 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	region_ptr; /* struct io_uring_region_desc * */
 
 	struct io_uring_zcrx_offsets offsets;
-	__u64	__resv[4];
+	__u32	zcrx_id;
+	__u32	__resv2;
+	__u64	__resv[3];
 };
 
 #ifdef __cplusplus
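
For context, the new field turns part of the reserved space into an output: registration must still pass zcrx_id and __resv2 as zero (the stricter check in zcrx.c below enforces this), and on success the kernel reports the interface queue id back through zcrx_id. A rough userspace sketch of that round trip, assuming the IORING_REGISTER_ZCRX_IFQ opcode from the 6.15 uapi header; the helper name and error handling are illustrative only:

/*
 * Illustrative userspace sketch (not from this commit): register a
 * zero-copy rx interface queue and read back the new zcrx_id field.
 * Assumes a ring fd that was already set up and a 6.15 uapi header.
 */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_zcrx(int ring_fd, struct io_uring_zcrx_ifq_reg *reg)
{
	/* zcrx_id and all reserved fields must be zero on entry */
	reg->zcrx_id = 0;
	reg->__resv2 = 0;
	memset(reg->__resv, 0, sizeof(reg->__resv));

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_IFQ, reg, 1) < 0)
		return -1;

	/* on success the kernel has filled in the ifq id for this ring */
	printf("registered zcrx ifq, id=%u\n", reg->zcrx_id);
	return 0;
}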

io_uring/rsrc.c

Lines changed: 47 additions & 45 deletions

@@ -1032,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
 	return 0;
 }
 
+static int io_import_kbuf(int ddir, struct iov_iter *iter,
+			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
+{
+	size_t count = len + offset;
+
+	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
+	iov_iter_advance(iter, offset);
+
+	if (count < imu->len) {
+		const struct bio_vec *bvec = iter->bvec;
+
+		while (len > bvec->bv_len) {
+			len -= bvec->bv_len;
+			bvec++;
+		}
+		iter->nr_segs = 1 + bvec - iter->bvec;
+	}
+	return 0;
+}
+
 static int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
 {
+	const struct bio_vec *bvec;
+	size_t folio_mask;
+	unsigned nr_segs;
 	size_t offset;
 	int ret;
 
@@ -1047,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (!(imu->dir & (1 << ddir)))
 		return -EFAULT;
 
-	/*
-	 * Might not be a start of buffer, set size appropriately
-	 * and advance us to the beginning.
-	 */
 	offset = buf_addr - imu->ubuf;
-	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
-	if (offset) {
-		/*
-		 * Don't use iov_iter_advance() here, as it's really slow for
-		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here for user
-		 * registered nodes, because we know that:
-		 *
-		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are the same in size, except potentially the
-		 *    first and last bvec
-		 *
-		 * So just find our index, and adjust the iterator afterwards.
-		 * If the offset is within the first bvec (or the whole first
-		 * bvec, just use iov_iter_advance(). This makes it easier
-		 * since we can just skip the first segment, which may not
-		 * be folio_size aligned.
-		 */
-		const struct bio_vec *bvec = imu->bvec;
-
-		/*
-		 * Kernel buffer bvecs, on the other hand, don't necessarily
-		 * have the size property of user registered ones, so we have
-		 * to use the slow iter advance.
-		 */
-		if (offset < bvec->bv_len) {
-			iter->count -= offset;
-			iter->iov_offset = offset;
-		} else if (imu->is_kbuf) {
-			iov_iter_advance(iter, offset);
-		} else {
-			unsigned long seg_skip;
+	if (imu->is_kbuf)
+		return io_import_kbuf(ddir, iter, imu, len, offset);
 
-			/* skip first vec */
-			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> imu->folio_shift);
+	/*
+	 * Don't use iov_iter_advance() here, as it's really slow for
+	 * using the latter parts of a big fixed buffer - it iterates
+	 * over each segment manually. We can cheat a bit here for user
+	 * registered nodes, because we know that:
+	 *
+	 * 1) it's a BVEC iter, we set it up
+	 * 2) all bvecs are the same in size, except potentially the
+	 *    first and last bvec
+	 */
+	folio_mask = (1UL << imu->folio_shift) - 1;
+	bvec = imu->bvec;
+	if (offset >= bvec->bv_len) {
+		unsigned long seg_skip;
 
-			iter->bvec += seg_skip;
-			iter->nr_segs -= seg_skip;
-			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
-		}
+		/* skip first vec */
+		offset -= bvec->bv_len;
+		seg_skip = 1 + (offset >> imu->folio_shift);
+		bvec += seg_skip;
+		offset &= folio_mask;
 	}
-
+	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
+	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
+	iter->iov_offset = offset;
 	return 0;
 }
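
Taken together, the rsrc.c changes make the iterator describe only the segments an IO actually touches instead of all of imu->nr_bvecs, which is the tail trimming the merge message refers to. Below is a minimal userspace sketch of the new segment-count arithmetic for user-registered buffers; the function name and parameters are illustrative, and it assumes the property the kernel comment relies on, namely that all middle bvecs are exactly one folio in size:

/*
 * Illustration only, not kernel code: given a byte offset and length
 * into a registered buffer, compute how many bvec segments the import
 * spans, mirroring the nr_segs expression in io_import_fixed().
 */
#include <stdio.h>

static unsigned nr_segs_for(size_t offset, size_t len, size_t first_bv_len,
			    size_t bv_offset, unsigned folio_shift)
{
	size_t folio_mask = (1UL << folio_shift) - 1;

	if (offset >= first_bv_len) {
		/* the skipped whole folios just move the bvec base; */
		/* only the remainder within a folio stays as offset */
		offset -= first_bv_len;
		offset &= folio_mask;
	}
	return (offset + len + bv_offset + folio_mask) >> folio_shift;
}

int main(void)
{
	/* an 8KiB read at offset 12KiB into a buffer of 4KiB folios */
	printf("%u segments\n", nr_segs_for(12288, 8192, 4096, 0, 12));
	/* prints 2: only the tail segments actually touched count */
	return 0;
}

With the old code the same import would have advertised every bvec in the registered buffer, and the block layer would have had to walk them all just to rule out a split.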

io_uring/zcrx.c

Lines changed: 28 additions & 9 deletions

@@ -26,6 +26,11 @@
 #include "zcrx.h"
 #include "rsrc.h"
 
+static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
+{
+	return pp->mp_priv;
+}
+
 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
 
 static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
@@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
 
 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
+	guard(mutex)(&ifq->dma_lock);
+
 	if (area->is_mapped)
 		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
+	area->is_mapped = false;
 }
 
 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
 	int i;
 
+	guard(mutex)(&ifq->dma_lock);
+	if (area->is_mapped)
+		return 0;
+
 	for (i = 0; i < area->nia.num_niovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];
 		dma_addr_t dma;
@@ -275,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
 	spin_lock_init(&ifq->rq_lock);
+	mutex_init(&ifq->dma_lock);
 	return ifq;
 }
 
@@ -324,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 		put_device(ifq->dev);
 
 	io_free_rbuf_ring(ifq);
+	mutex_destroy(&ifq->dma_lock);
 	kfree(ifq);
 }
 
@@ -354,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
 		return -EFAULT;
-	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
@@ -394,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		goto err;
 	get_device(ifq->dev);
 
-	ret = io_zcrx_map_area(ifq, ifq->area);
-	if (ret)
-		goto err;
-
 	mp_param.mp_ops = &io_uring_pp_zc_ops;
 	mp_param.mp_priv = ifq;
 	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -585,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
 
 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 
 	/* pp should already be ensuring that */
 	if (unlikely(pp->alloc.count))
@@ -617,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
 
 static int io_pp_zc_init(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+	int ret;
 
 	if (WARN_ON_ONCE(!ifq))
 		return -EINVAL;
@@ -630,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
 		return -EOPNOTSUPP;
 
+	ret = io_zcrx_map_area(ifq, ifq->area);
+	if (ret)
+		return ret;
+
 	percpu_ref_get(&ifq->ctx->refs);
 	return 0;
 }
 
 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 	struct io_zcrx_area *area = ifq->area;
 
 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -665,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
 	struct io_zcrx_ifq *ifq = mp_priv;
 
 	io_zcrx_drop_netdev(ifq);
+	if (ifq->area)
+		io_zcrx_unmap_area(ifq, ifq->area);
+
 	p->mp_ops = NULL;
 	p->mp_priv = NULL;
 }
@@ -791,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 
 	niov = netmem_to_net_iov(frag->netmem);
 	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    niov->pp->mp_priv != ifq)
+	    io_pp_to_ifq(niov->pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
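
The dead-device fix hinges on two things visible above: mapping moves from registration time into io_pp_zc_init(), and both map and unmap become idempotent under the new ifq->dma_lock, so io_pp_uninstall() can unmap while the netdev goes away without racing a later re-map. The kernel uses guard(mutex) from linux/cleanup.h, which drops the lock automatically at end of scope; here is a rough userspace analogue built on the GCC/Clang cleanup attribute and a pthread mutex (illustrative only, not the kernel's implementation):

/*
 * Userspace illustration of scope-based locking plus idempotent
 * map/unmap, mirroring the shape of the zcrx fix.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t dma_lock = PTHREAD_MUTEX_INITIALIZER;
static bool is_mapped;

static void unlock_cleanup(pthread_mutex_t **m) { pthread_mutex_unlock(*m); }
/* lock now, unlock automatically when the enclosing scope exits */
#define guard_mutex(m) \
	pthread_mutex_t *_g __attribute__((cleanup(unlock_cleanup))) = (m); \
	pthread_mutex_lock(_g)

static int map_area(void)
{
	guard_mutex(&dma_lock);
	if (is_mapped)		/* already mapped: early return is safe */
		return 0;
	/* ... dma-map every niov here ... */
	is_mapped = true;
	return 0;
}	/* mutex released here by the cleanup handler */

static void unmap_area(void)
{
	guard_mutex(&dma_lock);
	if (is_mapped) {
		/* ... dma-unmap every niov here ... */
	}
	is_mapped = false;	/* mirrors the fix: calling twice is harmless */
}

Because both paths take the same lock and check is_mapped, the uninstall path and the page pool init path can run in either order, which is exactly what a device disappearing underneath the ring requires.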

io_uring/zcrx.h

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	spinlock_t			lock;
+	struct mutex			dma_lock;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
