
Commit c1b7fcf

Merge tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
 "Fairly quiet round in terms of features, mostly just improvements all
  over the map for existing code. In detail:

   - Initial support for socket operations through io_uring. Latter
     half of this will likely land with the 6.7 kernel, then allowing
     things like get/setsockopt (Breno)

   - Cleanup of the cancel code, and then adding support for canceling
     requests with the opcode as the key (me)

   - Improvements for the io-wq locking (me)

   - Fix affinity setting for SQPOLL based io-wq (me)

   - Remove the io_uring userspace code. These were added initially as
     copies from liburing, but all of them have since bitrotted and are
     way out of date at this point. Rather than attempt to keep them in
     sync, just get rid of them. People will have liburing available
     anyway for these examples. (Pavel)

   - Series improving the CQ/SQ ring caching (Pavel)

   - Misc fixes and cleanups (Pavel, Yue, me)"

* tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux: (47 commits)
  io_uring: move iopoll ctx fields around
  io_uring: move multishot cqe cache in ctx
  io_uring: separate task_work/waiting cache line
  io_uring: banish non-hot data to end of io_ring_ctx
  io_uring: move non aligned field to the end
  io_uring: add option to remove SQ indirection
  io_uring: compact SQ/CQ heads/tails
  io_uring: force inline io_fill_cqe_req
  io_uring: merge iopoll and normal completion paths
  io_uring: reorder cqring_flush and wakeups
  io_uring: optimise extra io_get_cqe null check
  io_uring: refactor __io_get_cqe()
  io_uring: simplify big_cqe handling
  io_uring: cqe init hardening
  io_uring: improve cqe !tracing hot path
  io_uring/rsrc: Annotate struct io_mapped_ubuf with __counted_by
  io_uring/sqpoll: fix io-wq affinity when IORING_SETUP_SQPOLL is used
  io_uring: simplify io_run_task_work_sig return
  io_uring/rsrc: keep one global dummy_ubuf
  io_uring: never overflow io_aux_cqe
  ...
2 parents adfd671 + 644c4a7 commit c1b7fcf

31 files changed: +432 −1767 lines
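The socket-operations item is the part that lands in this pull: the uapi gains SOCKET_URING_OP_SIOCINQ/SIOCOUTQ for IORING_OP_URING_CMD on sockets (see the include/uapi/linux/io_uring.h diff below). A minimal userspace sketch of how such a command could be issued, assuming liburing and a 6.6+ kernel; the raw sqe field usage follows the uapi header rather than any dedicated liburing helper, and error handling is trimmed:

/* Sketch: query bytes pending in a socket's receive queue via
 * IORING_OP_URING_CMD + SOCKET_URING_OP_SIOCINQ (like ioctl(FIONREAD)). */
#include <liburing.h>
#include <string.h>

int queued_bytes(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;			/* submission queue full */

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;	/* command selector for sockets */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				/* queued byte count, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}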

MAINTAINERS

Lines changed: 0 additions & 1 deletion

@@ -10966,7 +10966,6 @@ F:	include/linux/io_uring_types.h
 F:	include/trace/events/io_uring.h
 F:	include/uapi/linux/io_uring.h
 F:	io_uring/
-F:	tools/io_uring/
 
 IPMI SUBSYSTEM
 M:	Corey Minyard <minyard@acm.org>

include/linux/io_uring.h

Lines changed: 6 additions & 0 deletions

@@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
@@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode)
 {
 	return "";
 }
+static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
+				    unsigned int issue_flags)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #endif
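The #else half follows a standard kernel pattern: when CONFIG_IO_URING is disabled, a static inline stub returns -EOPNOTSUPP, so callers can invoke io_uring_cmd_sock() unconditionally with no #ifdef of their own. A freestanding sketch of the same pattern — the names here are illustrative, not from the tree:

#include <errno.h>
#include <stdio.h>

/* CONFIG_IO_URING would come from Kconfig; it is left undefined here,
 * so the stub branch below is the one that actually compiles. */
#ifdef CONFIG_IO_URING
int feature_op(int arg);		/* real implementation lives elsewhere */
#else
static inline int feature_op(int arg)
{
	(void)arg;
	return -EOPNOTSUPP;	/* callers get a clean error, no #ifdef needed */
}
#endif

int main(void)
{
	/* prints -95 (-EOPNOTSUPP) when the feature is compiled out */
	printf("%d\n", feature_op(0));
	return 0;
}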

include/linux/io_uring_types.h

Lines changed: 65 additions & 64 deletions

@@ -69,8 +69,8 @@ struct io_uring_task {
 };
 
 struct io_uring {
-	u32 head ____cacheline_aligned_in_smp;
-	u32 tail ____cacheline_aligned_in_smp;
+	u32 head;
+	u32 tail;
 };
 
 /*
@@ -176,7 +176,6 @@ struct io_submit_state {
 	unsigned short		submit_nr;
 	unsigned int		cqes_count;
 	struct blk_plug		plug;
-	struct io_uring_cqe	cqes[16];
 };
 
 struct io_ev_fd {
@@ -205,25 +204,17 @@ struct io_ring_ctx {
 		unsigned int		has_evfd: 1;
 		/* all CQEs should be posted only by the submitter task */
 		unsigned int		task_complete: 1;
+		unsigned int		lockless_cq: 1;
 		unsigned int		syscall_iopoll: 1;
 		unsigned int		poll_activated: 1;
 		unsigned int		drain_disabled: 1;
 		unsigned int		compat: 1;
 
-		enum task_work_notify_mode	notify_method;
+		struct task_struct	*submitter_task;
+		struct io_rings		*rings;
+		struct percpu_ref	refs;
 
-		/*
-		 * If IORING_SETUP_NO_MMAP is used, then the below holds
-		 * the gup'ed pages for the two rings, and the sqes.
-		 */
-		unsigned short		n_ring_pages;
-		unsigned short		n_sqe_pages;
-		struct page		**ring_pages;
-		struct page		**sqe_pages;
-
-		struct io_rings		*rings;
-		struct task_struct	*submitter_task;
-		struct percpu_ref	refs;
+		enum task_work_notify_mode	notify_method;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
@@ -261,31 +252,20 @@ struct io_ring_ctx {
 
 		struct io_buffer_list	*io_bl;
 		struct xarray		io_bl_xa;
-		struct list_head	io_buffers_cache;
 
 		struct io_hash_table	cancel_table_locked;
-		struct list_head	cq_overflow_list;
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
-	} ____cacheline_aligned_in_smp;
-
-	/* IRQ completion list, under ->completion_lock */
-	struct io_wq_work_list	locked_free_list;
-	unsigned int		locked_free_nr;
-
-	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
-	struct io_sq_data	*sq_data;	/* if using sq thread polling */
-
-	struct wait_queue_head	sqo_sq_wait;
-	struct list_head	sqd_list;
 
-	unsigned long		check_cq;
-
-	unsigned int		file_alloc_start;
-	unsigned int		file_alloc_end;
-
-	struct xarray		personalities;
-	u32			pers_next;
+		/*
+		 * ->iopoll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		struct io_wq_work_list	iopoll_list;
+		bool			poll_multi_queue;
+	} ____cacheline_aligned_in_smp;
 
 	struct {
 		/*
@@ -298,39 +278,55 @@ struct io_ring_ctx {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
-		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
 	} ____cacheline_aligned_in_smp;
 
+	/*
+	 * task_work and async notification delivery cacheline. Expected to
+	 * regularly bounce b/w CPUs.
+	 */
 	struct {
-		spinlock_t		completion_lock;
-
-		bool			poll_multi_queue;
-		atomic_t		cq_wait_nr;
-
-		/*
-		 * ->iopoll_list is protected by the ctx->uring_lock for
-		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
-		 * For SQPOLL, only the single threaded io_sq_thread() will
-		 * manipulate the list, hence no extra locking is needed there.
-		 */
-		struct io_wq_work_list	iopoll_list;
-		struct io_hash_table	cancel_table;
-
 		struct llist_head	work_llist;
-
-		struct list_head	io_buffers_comp;
+		unsigned long		check_cq;
+		atomic_t		cq_wait_nr;
+		atomic_t		cq_timeouts;
+		struct wait_queue_head	cq_wait;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
 	struct {
 		spinlock_t		timeout_lock;
-		atomic_t		cq_timeouts;
 		struct list_head	timeout_list;
 		struct list_head	ltimeout_list;
 		unsigned		cq_last_tm_flush;
 	} ____cacheline_aligned_in_smp;
 
+	struct io_uring_cqe	completion_cqes[16];
+
+	spinlock_t		completion_lock;
+
+	/* IRQ completion list, under ->completion_lock */
+	struct io_wq_work_list	locked_free_list;
+	unsigned int		locked_free_nr;
+
+	struct list_head	io_buffers_comp;
+	struct list_head	cq_overflow_list;
+	struct io_hash_table	cancel_table;
+
+	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
+	struct io_sq_data	*sq_data;	/* if using sq thread polling */
+
+	struct wait_queue_head	sqo_sq_wait;
+	struct list_head	sqd_list;
+
+	unsigned int		file_alloc_start;
+	unsigned int		file_alloc_end;
+
+	struct xarray		personalities;
+	u32			pers_next;
+
+	struct list_head	io_buffers_cache;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head	poll_wq;
 	struct io_restriction	restrictions;
@@ -374,6 +370,15 @@ struct io_ring_ctx {
 	unsigned		sq_thread_idle;
 	/* protected by ->completion_lock */
 	unsigned		evfd_last_cq_tail;
+
+	/*
+	 * If IORING_SETUP_NO_MMAP is used, then the below holds
+	 * the gup'ed pages for the two rings, and the sqes.
+	 */
+	unsigned short		n_ring_pages;
+	unsigned short		n_sqe_pages;
+	struct page		**ring_pages;
+	struct page		**sqe_pages;
 };
 
 struct io_tw_state {
@@ -409,7 +414,6 @@ enum {
 	REQ_F_SINGLE_POLL_BIT,
 	REQ_F_DOUBLE_POLL_BIT,
 	REQ_F_PARTIAL_IO_BIT,
-	REQ_F_CQE32_INIT_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
 	REQ_F_HASH_LOCKED_BIT,
@@ -479,8 +483,6 @@ enum {
 	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
 	/* fast poll multishot mode */
 	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
-	/* ->extra1 and ->extra2 are initialised */
-	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
 	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
 	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
@@ -579,13 +581,7 @@ struct io_kiocb {
 	struct io_task_work		io_task_work;
 	unsigned			nr_tw;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	union {
-		struct hlist_node	hash_node;
-		struct {
-			u64		extra1;
-			u64		extra2;
-		};
-	};
+	struct hlist_node		hash_node;
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll		*apoll;
 	/* opcode allocated if it needs to store data for async defer */
@@ -595,6 +591,11 @@ struct io_kiocb {
 	/* custom credentials, valid IFF REQ_F_CREDS is set */
 	const struct cred		*creds;
 	struct io_wq_work		work;
+
+	struct {
+		u64			extra1;
+		u64			extra2;
+	} big_cqe;
 };
 
 struct io_overflow_cqe {
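The running theme of this file's churn is cacheline placement: rather than tagging individual fields like head/tail with ____cacheline_aligned_in_smp, whole groups of fields that are accessed together get one aligned block each, and cold setup-time data is banished to the end of io_ring_ctx. A userspace sketch of the same idea using C11 alignas; the 64-byte cacheline size is an assumption here, whereas the kernel macro resolves to the architecture's real value:

#include <stdalign.h>

#define CACHELINE 64	/* assumed; ____cacheline_aligned_in_smp uses the arch value */

struct ring_ctx_sketch {
	/* hot, touched on every submission: one cacheline for the whole
	 * group, not one per field -- the grouping this commit switches to */
	alignas(CACHELINE) struct {
		unsigned head;
		unsigned tail;
		unsigned cached_tail;
	} sq;

	/* bounced between CPUs by wakeups: isolated on its own cacheline so
	 * writes here don't invalidate the submission fields above */
	alignas(CACHELINE) struct {
		_Atomic unsigned wait_nr;
	} wake;

	/* cold setup-time data needs no special alignment at all */
	unsigned features;
};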

include/uapi/linux/io_uring.h

Lines changed: 20 additions & 1 deletion

@@ -185,6 +185,11 @@ enum {
  */
 #define IORING_SETUP_REGISTERED_FD_ONLY	(1U << 15)
 
+/*
+ * Removes indirection through the SQ index array.
+ */
+#define IORING_SETUP_NO_SQARRAY		(1U << 16)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -299,11 +304,15 @@ enum io_uring_op {
  *				request 'user_data'
  * IORING_ASYNC_CANCEL_ANY	Match any request
  * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
+ * IORING_ASYNC_CANCEL_USERDATA	Match on user_data, default for no other key
+ * IORING_ASYNC_CANCEL_OP	Match request based on opcode
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
 #define IORING_ASYNC_CANCEL_ANY	(1U << 2)
 #define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)
+#define IORING_ASYNC_CANCEL_USERDATA	(1U << 4)
+#define IORING_ASYNC_CANCEL_OP	(1U << 5)
 
 /*
  * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@@ -697,7 +706,9 @@ struct io_uring_sync_cancel_reg {
 	__s32				fd;
 	__u32				flags;
 	struct __kernel_timespec	timeout;
-	__u64				pad[4];
+	__u8				opcode;
+	__u8				pad[7];
+	__u64				pad2[3];
 };
 
 /*
@@ -717,6 +728,14 @@ struct io_uring_recvmsg_out {
 	__u32	flags;
 };
 
+/*
+ * Argument for IORING_OP_URING_CMD when file is a socket
+ */
+enum {
+	SOCKET_URING_OP_SIOCINQ		= 0,
+	SOCKET_URING_OP_SIOCOUTQ,
+};
+
 #ifdef __cplusplus
 }
 #endif
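The new opcode field in io_uring_sync_cancel_reg pairs with IORING_ASYNC_CANCEL_OP for the cancel-by-opcode feature from the pull message. A hedged sketch using liburing's io_uring_register_sync_cancel(), assuming a liburing/uapi recent enough to expose the opcode field; the -1/-1 timeout meaning "no timeout" is my reading of the sync-cancel interface:

/* Sketch: synchronously cancel all pending requests of one opcode. */
#include <liburing.h>
#include <string.h>

int cancel_all_reads(struct io_uring *ring)
{
	struct io_uring_sync_cancel_reg reg;

	memset(&reg, 0, sizeof(reg));
	reg.flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
	reg.opcode = IORING_OP_READ;	/* the key: match requests by opcode */
	reg.timeout.tv_sec = -1;	/* -1/-1: wait for cancelation, no timeout */
	reg.timeout.tv_nsec = -1;

	/* 0 on success, -ENOENT if nothing matched, or another -errno */
	return io_uring_register_sync_cancel(ring, &reg);
}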
