@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_mem_free(void *ptr)
+void io_mem_free(void *ptr)
 {
 	if (!ptr)
 		return;
@@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 {
 	struct page **page_array;
 	unsigned int nr_pages;
+	void *page_addr;
 	int ret, i;
 
 	*npages = 0;
@@ -2718,27 +2720,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 		io_pages_free(&page_array, ret > 0 ? ret : 0);
 		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
 	}
-	/*
-	 * Should be a single page. If the ring is small enough that we can
-	 * use a normal page, that is fine. If we need multiple pages, then
-	 * userspace should use a huge page. That's the only way to guarantee
-	 * that we get contigious memory, outside of just being lucky or
-	 * (currently) having low memory fragmentation.
-	 */
-	if (page_array[0] != page_array[ret - 1])
-		goto err;
 
-	/*
-	 * Can't support mapping user allocated ring memory on 32-bit archs
-	 * where it could potentially reside in highmem. Just fail those with
-	 * -EINVAL, just like we did on kernels that didn't support this
-	 * feature.
-	 */
+	page_addr = page_address(page_array[0]);
 	for (i = 0; i < nr_pages; i++) {
-		if (PageHighMem(page_array[i])) {
-			ret = -EINVAL;
+		ret = -EINVAL;
+
+		/*
+		 * Can't support mapping user allocated ring memory on 32-bit
+		 * archs where it could potentially reside in highmem. Just
+		 * fail those with -EINVAL, just like we did on kernels that
+		 * didn't support this feature.
+		 */
+		if (PageHighMem(page_array[i]))
 			goto err;
-		}
+
+		/*
+		 * No support for discontig pages for now, should either be a
+		 * single normal page, or a huge page. Later on we can add
+		 * support for remapping discontig pages, for now we will
+		 * just fail them with EINVAL.
+		 */
+		if (page_address(page_array[i]) != page_addr)
+			goto err;
+		page_addr += PAGE_SIZE;
 	}
 
 	*pages = page_array;
@@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	}
 }
 
-static void *io_mem_alloc(size_t size)
+void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
 	void *ret;
@@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
+	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
 
-	/* Don't allow mmap if the ring was setup without it */
-	if (ctx->flags & IORING_SETUP_NO_MMAP)
-		return ERR_PTR(-EINVAL);
-
 	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->rings;
 		break;
 	case IORING_OFF_SQES:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->sq_sqes;
 		break;
 	case IORING_OFF_PBUF_RING: {
 		unsigned int bgid;
 
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		mutex_lock(&ctx->uring_lock);
+		rcu_read_lock();
 		ptr = io_pbuf_get_address(ctx, bgid);
-		mutex_unlock(&ctx->uring_lock);
+		rcu_read_unlock();
 		if (!ptr)
 			return ERR_PTR(-EINVAL);
 		break;
@@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		size_t, argsz)
 {
 	struct io_ring_ctx *ctx;
-	struct fd f;
+	struct file *file;
 	long ret;
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
 			return -EINVAL;
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
-		f.file = tctx->registered_rings[fd];
-		f.flags = 0;
-		if (unlikely(!f.file))
+		file = tctx->registered_rings[fd];
+		if (unlikely(!file))
 			return -EBADF;
 	} else {
-		f = fdget(fd);
-		if (unlikely(!f.file))
+		file = fget(fd);
+		if (unlikely(!file))
 			return -EBADF;
 		ret = -EOPNOTSUPP;
-		if (unlikely(!io_is_uring_fops(f.file)))
+		if (unlikely(!io_is_uring_fops(file)))
 			goto out;
 	}
 
-	ctx = f.file->private_data;
+	ctx = file->private_data;
 	ret = -EBADFD;
 	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
 		goto out;
@@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 	}
 out:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fput(file);
 	return ret;
 }
 
@@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
-	struct fd f;
+	struct file *file;
 	bool use_registered_ring;
 
 	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
@@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
 			return -EINVAL;
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
-		f.file = tctx->registered_rings[fd];
-		f.flags = 0;
-		if (unlikely(!f.file))
+		file = tctx->registered_rings[fd];
+		if (unlikely(!file))
 			return -EBADF;
 	} else {
-		f = fdget(fd);
-		if (unlikely(!f.file))
+		file = fget(fd);
+		if (unlikely(!file))
 			return -EBADF;
 		ret = -EOPNOTSUPP;
-		if (!io_is_uring_fops(f.file))
+		if (!io_is_uring_fops(file))
 			goto out_fput;
 	}
 
-	ctx = f.file->private_data;
+	ctx = file->private_data;
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
-	fdput(f);
+	if (!use_registered_ring)
+		fput(file);
 	return ret;
 }
 
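For context on the page checks added in __io_uaddr_map() above: with IORING_SETUP_NO_MMAP the application, not the kernel, allocates the ring memory, and the pinned pages must be physically contiguous and outside highmem. In practice that means a single normal page or a huge page, as the new comment notes. The userspace sketch below illustrates one way to satisfy that. It is a minimal sketch, not part of this commit: the setup_no_mmap_ring() helper and the 2 MB huge-page size are assumptions, while IORING_SETUP_NO_MMAP and the sq_off.user_addr/cq_off.user_addr fields are existing UAPI.

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HUGE_SZ		(2UL * 1024 * 1024)	/* assumed 2 MB huge page (x86-64) */

/*
 * Hypothetical helper: create a ring whose SQ/CQ rings and SQE array
 * live in caller-allocated huge pages rather than kernel-allocated
 * memory. The kernel pins these pages and, per the hunk above, fails
 * with -EINVAL if any page is in highmem or not contiguous with the
 * previous one.
 */
static int setup_no_mmap_ring(unsigned int entries, struct io_uring_params *p)
{
	void *rings, *sqes;

	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_NO_MMAP;

	/* One huge page each for the rings and the SQE array. */
	rings = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
		     MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	sqes = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (rings == MAP_FAILED || sqes == MAP_FAILED)
		return -1;

	/* Tell the kernel where the user-allocated ring memory lives. */
	p->cq_off.user_addr = (uintptr_t)rings;
	p->sq_off.user_addr = (uintptr_t)sqes;
	return (int)syscall(__NR_io_uring_setup, entries, p);
}

Backing each region with a single MAP_HUGETLB mapping means pin_user_pages_fast() returns subpages of one huge page, so the page-by-page page_address() comparison in the hunk above walks a contiguous run and succeeds.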