Skip to content

Commit d2c84bd

Browse files
committed
Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Make running of task_work internal loops more fair, and unify how the
   different methods deal with them (me)

 - Support for per-ring NAPI. The two minor networking patches are in a
   shared branch with netdev (Stefan)

 - Add support for truncate (Tony)

 - Export SQPOLL utilization stats (Xiaobing)

 - Multishot fixes (Pavel)

 - Fix for a race in manipulating the request flags via poll (Pavel)

 - Cleanup the multishot checking by making it generic, moving it out of
   opcode handlers (Pavel)

 - Various tweaks and cleanups (me, Kunwu, Alexander)

* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
  io_uring: Fix sqpoll utilization check racing with dying sqpoll
  io_uring/net: dedup io_recv_finish req completion
  io_uring: refactor DEFER_TASKRUN multishot checks
  io_uring: fix mshot io-wq checks
  io_uring/net: add io_req_msg_cleanup() helper
  io_uring/net: simplify msghd->msg_inq checking
  io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
  io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
  io_uring/net: correctly handle multishot recvmsg retry setup
  io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
  io_uring: fix io_queue_proc modifying req->flags
  io_uring: fix mshot read defer taskrun cqe posting
  io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
  io_uring/net: correct the type of variable
  io_uring/sqpoll: statistics of the true utilization of sq threads
  io_uring/net: move recv/recvmsg flags out of retry loop
  io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  io_uring/net: improve the usercopy for sendmsg/recvmsg
  io_uring/net: move receive multishot out of the generic msghdr path
  io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  ...
2 parents 0f1a876 + 606559d commit d2c84bd

30 files changed

+1253
-504
lines changed

fs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
183183
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
184184
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
185185

186+
long do_ftruncate(struct file *file, loff_t length, int small);
186187
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
187188
int chmod_common(const struct path *path, umode_t mode);
188189
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,

fs/open.c

Lines changed: 28 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
154154
}
155155
#endif
156156

157-
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
157+
long do_ftruncate(struct file *file, loff_t length, int small)
158158
{
159159
struct inode *inode;
160160
struct dentry *dentry;
161-
struct fd f;
162161
int error;
163162

164-
error = -EINVAL;
165-
if (length < 0)
166-
goto out;
167-
error = -EBADF;
168-
f = fdget(fd);
169-
if (!f.file)
170-
goto out;
171-
172163
/* explicitly opened as large or we are on 64-bit box */
173-
if (f.file->f_flags & O_LARGEFILE)
164+
if (file->f_flags & O_LARGEFILE)
174165
small = 0;
175166

176-
dentry = f.file->f_path.dentry;
167+
dentry = file->f_path.dentry;
177168
inode = dentry->d_inode;
178-
error = -EINVAL;
179-
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
180-
goto out_putf;
169+
if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
170+
return -EINVAL;
181171

182-
error = -EINVAL;
183172
/* Cannot ftruncate over 2^31 bytes without large file support */
184173
if (small && length > MAX_NON_LFS)
185-
goto out_putf;
174+
return -EINVAL;
186175

187-
error = -EPERM;
188176
/* Check IS_APPEND on real upper inode */
189-
if (IS_APPEND(file_inode(f.file)))
190-
goto out_putf;
177+
if (IS_APPEND(file_inode(file)))
178+
return -EPERM;
191179
sb_start_write(inode->i_sb);
192-
error = security_file_truncate(f.file);
180+
error = security_file_truncate(file);
193181
if (!error)
194-
error = do_truncate(file_mnt_idmap(f.file), dentry, length,
195-
ATTR_MTIME | ATTR_CTIME, f.file);
182+
error = do_truncate(file_mnt_idmap(file), dentry, length,
183+
ATTR_MTIME | ATTR_CTIME, file);
196184
sb_end_write(inode->i_sb);
197-
out_putf:
185+
186+
return error;
187+
}
188+
189+
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
190+
{
191+
struct fd f;
192+
int error;
193+
194+
if (length < 0)
195+
return -EINVAL;
196+
f = fdget(fd);
197+
if (!f.file)
198+
return -EBADF;
199+
200+
error = do_ftruncate(f.file, length, small);
201+
198202
fdput(f);
199-
out:
200203
return error;
201204
}
202205

include/linux/io_uring_types.h

Lines changed: 84 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
#define IO_URING_TYPES_H
33

44
#include <linux/blkdev.h>
5+
#include <linux/hashtable.h>
56
#include <linux/task_work.h>
67
#include <linux/bitmap.h>
78
#include <linux/llist.h>
@@ -240,12 +241,14 @@ struct io_ring_ctx {
240241
unsigned int poll_activated: 1;
241242
unsigned int drain_disabled: 1;
242243
unsigned int compat: 1;
244+
unsigned int iowq_limits_set : 1;
243245

244246
struct task_struct *submitter_task;
245247
struct io_rings *rings;
246248
struct percpu_ref refs;
247249

248250
enum task_work_notify_mode notify_method;
251+
unsigned sq_thread_idle;
249252
} ____cacheline_aligned_in_smp;
250253

251254
/* submission data */
@@ -274,10 +277,20 @@ struct io_ring_ctx {
274277
*/
275278
struct io_rsrc_node *rsrc_node;
276279
atomic_t cancel_seq;
280+
281+
/*
282+
* ->iopoll_list is protected by the ctx->uring_lock for
283+
* io_uring instances that don't use IORING_SETUP_SQPOLL.
284+
* For SQPOLL, only the single threaded io_sq_thread() will
285+
* manipulate the list, hence no extra locking is needed there.
286+
*/
287+
bool poll_multi_queue;
288+
struct io_wq_work_list iopoll_list;
289+
277290
struct io_file_table file_table;
291+
struct io_mapped_ubuf **user_bufs;
278292
unsigned nr_user_files;
279293
unsigned nr_user_bufs;
280-
struct io_mapped_ubuf **user_bufs;
281294

282295
struct io_submit_state submit_state;
283296

@@ -288,15 +301,6 @@ struct io_ring_ctx {
288301
struct io_alloc_cache apoll_cache;
289302
struct io_alloc_cache netmsg_cache;
290303

291-
/*
292-
* ->iopoll_list is protected by the ctx->uring_lock for
293-
* io_uring instances that don't use IORING_SETUP_SQPOLL.
294-
* For SQPOLL, only the single threaded io_sq_thread() will
295-
* manipulate the list, hence no extra locking is needed there.
296-
*/
297-
struct io_wq_work_list iopoll_list;
298-
bool poll_multi_queue;
299-
300304
/*
301305
* Any cancelable uring_cmd is added to this list in
302306
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -343,8 +347,8 @@ struct io_ring_ctx {
343347
spinlock_t completion_lock;
344348

345349
/* IRQ completion list, under ->completion_lock */
346-
struct io_wq_work_list locked_free_list;
347350
unsigned int locked_free_nr;
351+
struct io_wq_work_list locked_free_list;
348352

349353
struct list_head io_buffers_comp;
350354
struct list_head cq_overflow_list;
@@ -366,9 +370,6 @@ struct io_ring_ctx {
366370
unsigned int file_alloc_start;
367371
unsigned int file_alloc_end;
368372

369-
struct xarray personalities;
370-
u32 pers_next;
371-
372373
struct list_head io_buffers_cache;
373374

374375
/* deferred free list, protected by ->uring_lock */
@@ -389,6 +390,9 @@ struct io_ring_ctx {
389390
struct wait_queue_head rsrc_quiesce_wq;
390391
unsigned rsrc_quiesce;
391392

393+
u32 pers_next;
394+
struct xarray personalities;
395+
392396
/* hashed buffered write serialization */
393397
struct io_wq_hash *hash_map;
394398

@@ -405,11 +409,22 @@ struct io_ring_ctx {
405409

406410
/* io-wq management, e.g. thread count */
407411
u32 iowq_limits[2];
408-
bool iowq_limits_set;
409412

410413
struct callback_head poll_wq_task_work;
411414
struct list_head defer_list;
412-
unsigned sq_thread_idle;
415+
416+
#ifdef CONFIG_NET_RX_BUSY_POLL
417+
struct list_head napi_list; /* track busy poll napi_id */
418+
spinlock_t napi_lock; /* napi_list lock */
419+
420+
/* napi busy poll default timeout */
421+
unsigned int napi_busy_poll_to;
422+
bool napi_prefer_busy_poll;
423+
bool napi_enabled;
424+
425+
DECLARE_HASHTABLE(napi_ht, 4);
426+
#endif
427+
413428
/* protected by ->completion_lock */
414429
unsigned evfd_last_cq_tail;
415430

@@ -455,83 +470,95 @@ enum {
455470
REQ_F_SKIP_LINK_CQES_BIT,
456471
REQ_F_SINGLE_POLL_BIT,
457472
REQ_F_DOUBLE_POLL_BIT,
458-
REQ_F_PARTIAL_IO_BIT,
459473
REQ_F_APOLL_MULTISHOT_BIT,
460474
REQ_F_CLEAR_POLLIN_BIT,
461475
REQ_F_HASH_LOCKED_BIT,
462476
/* keep async read/write and isreg together and in order */
463477
REQ_F_SUPPORT_NOWAIT_BIT,
464478
REQ_F_ISREG_BIT,
465479
REQ_F_POLL_NO_LAZY_BIT,
480+
REQ_F_CANCEL_SEQ_BIT,
481+
REQ_F_CAN_POLL_BIT,
482+
REQ_F_BL_EMPTY_BIT,
483+
REQ_F_BL_NO_RECYCLE_BIT,
466484

467485
/* not a real bit, just to check we're not overflowing the space */
468486
__REQ_F_LAST_BIT,
469487
};
470488

489+
typedef u64 __bitwise io_req_flags_t;
490+
#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno)))
491+
471492
enum {
472493
/* ctx owns file */
473-
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
494+
REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
474495
/* drain existing IO first */
475-
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
496+
REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
476497
/* linked sqes */
477-
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
498+
REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT),
478499
/* doesn't sever on completion < 0 */
479-
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
500+
REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
480501
/* IOSQE_ASYNC */
481-
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
502+
REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
482503
/* IOSQE_BUFFER_SELECT */
483-
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
504+
REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
484505
/* IOSQE_CQE_SKIP_SUCCESS */
485-
REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
506+
REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
486507

487508
/* fail rest of links */
488-
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
509+
REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT),
489510
/* on inflight list, should be cancelled and waited on exit reliably */
490-
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
511+
REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
491512
/* read/write uses file position */
492-
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
513+
REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
493514
/* must not punt to workers */
494-
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
515+
REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
495516
/* has or had linked timeout */
496-
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
517+
REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
497518
/* needs cleanup */
498-
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
519+
REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
499520
/* already went through poll handler */
500-
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
521+
REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
501522
/* buffer already selected */
502-
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
523+
REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
503524
/* buffer selected from ring, needs commit */
504-
REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
525+
REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
505526
/* caller should reissue async */
506-
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
527+
REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT),
507528
/* supports async reads/writes */
508-
REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
529+
REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
509530
/* regular file */
510-
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
531+
REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT),
511532
/* has creds assigned */
512-
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
533+
REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT),
513534
/* skip refcounting if not set */
514-
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
535+
REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
515536
/* there is a linked timeout that has to be armed */
516-
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
537+
REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
517538
/* ->async_data allocated */
518-
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
539+
REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
519540
/* don't post CQEs while failing linked requests */
520-
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
541+
REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
521542
/* single poll may be active */
522-
REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
543+
REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
523544
/* double poll may active */
524-
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
525-
/* request has already done partial IO */
526-
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
545+
REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
527546
/* fast poll multishot mode */
528-
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
547+
REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
529548
/* recvmsg special flag, clear EPOLLIN */
530-
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
549+
REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
531550
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
532-
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
551+
REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
533552
/* don't use lazy poll wake for this request */
534-
REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT),
553+
REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
554+
/* cancel sequence is set and valid */
555+
REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
556+
/* file is pollable */
557+
REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
558+
/* buffer list was empty after selection of buffer */
559+
REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
560+
/* don't recycle provided buffers for this request */
561+
REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
535562
};
536563

537564
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -592,15 +619,17 @@ struct io_kiocb {
592619
* and after selection it points to the buffer ID itself.
593620
*/
594621
u16 buf_index;
595-
unsigned int flags;
622+
623+
unsigned nr_tw;
624+
625+
/* REQ_F_* flags */
626+
io_req_flags_t flags;
596627

597628
struct io_cqe cqe;
598629

599630
struct io_ring_ctx *ctx;
600631
struct task_struct *task;
601632

602-
struct io_rsrc_node *rsrc_node;
603-
604633
union {
605634
/* store used ubuf, so we can prevent reloading */
606635
struct io_mapped_ubuf *imu;
@@ -621,10 +650,12 @@ struct io_kiocb {
621650
/* cache ->apoll->events */
622651
__poll_t apoll_events;
623652
};
653+
654+
struct io_rsrc_node *rsrc_node;
655+
624656
atomic_t refs;
625657
atomic_t poll_refs;
626658
struct io_task_work io_task_work;
627-
unsigned nr_tw;
628659
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
629660
struct hlist_node hash_node;
630661
/* internal polling, see IORING_FEAT_FAST_POLL */

include/net/busy_poll.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id,
4848
bool (*loop_end)(void *, unsigned long),
4949
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
5050

51+
void napi_busy_loop_rcu(unsigned int napi_id,
52+
bool (*loop_end)(void *, unsigned long),
53+
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
54+
5155
#else /* CONFIG_NET_RX_BUSY_POLL */
5256
static inline unsigned long net_busy_loop_on(void)
5357
{

0 commit comments

Comments (0)