Skip to content

Commit b9f5dd5

Browse files
Trond MyklebustAnna Schumaker
authored andcommitted
nfs/localio: use dedicated workqueues for filesystem read and write
For localio access, don't call filesystem read() and write() routines directly. This solves two problems: 1) localio writes need to use a normal (non-memreclaim) unbound workqueue. This avoids imposing new requirements on how underlying filesystems process frontend IO, which would cause a large amount of work to update all filesystems. Without this change, when XFS starts getting low on space, XFS flushes work on a non-memreclaim work queue, which causes a priority inversion problem: 00573 workqueue: WQ_MEM_RECLAIM writeback:wb_workfn is flushing !WQ_MEM_RECLAIM xfs-sync/vdc:xfs_flush_inodes_worker 00573 WARNING: CPU: 6 PID: 8525 at kernel/workqueue.c:3706 check_flush_dependency+0x2a4/0x328 00573 Modules linked in: 00573 CPU: 6 PID: 8525 Comm: kworker/u71:5 Not tainted 6.10.0-rc3-ktest-00032-g2b0a133403ab #18502 00573 Hardware name: linux,dummy-virt (DT) 00573 Workqueue: writeback wb_workfn (flush-0:33) 00573 pstate: 400010c5 (nZcv daIF -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00573 pc : check_flush_dependency+0x2a4/0x328 00573 lr : check_flush_dependency+0x2a4/0x328 00573 sp : ffff0000c5f06bb0 00573 x29: ffff0000c5f06bb0 x28: ffff0000c998a908 x27: 1fffe00019331521 00573 x26: ffff0000d0620900 x25: ffff0000c5f06ca0 x24: ffff8000828848c0 00573 x23: 1fffe00018be0d8e x22: ffff0000c1210000 x21: ffff0000c75fde00 00573 x20: ffff800080bfd258 x19: ffff0000cad63400 x18: ffff0000cd3a4810 00573 x17: 0000000000000000 x16: 0000000000000000 x15: ffff800080508d98 00573 x14: 0000000000000000 x13: 204d49414c434552 x12: 1fffe0001b6eeab2 00573 x11: ffff60001b6eeab2 x10: dfff800000000000 x9 : ffff60001b6eeab3 00573 x8 : 0000000000000001 x7 : 00009fffe491154e x6 : ffff0000db775593 00573 x5 : ffff0000db775590 x4 : ffff0000db775590 x3 : 0000000000000000 00573 x2 : 0000000000000027 x1 : ffff600018be0d62 x0 : dfff800000000000 00573 Call trace: 00573 check_flush_dependency+0x2a4/0x328 00573 __flush_work+0x184/0x5c8 00573 flush_work+0x18/0x28 00573 xfs_flush_inodes+0x68/0x88 00573 xfs_file_buffered_write+0x128/0x6f0 00573 xfs_file_write_iter+0x358/0x448 00573 nfs_local_doio+0x854/0x1568 00573 nfs_initiate_pgio+0x214/0x418 00573 nfs_generic_pg_pgios+0x304/0x480 00573 nfs_pageio_doio+0xe8/0x240 00573 nfs_pageio_complete+0x160/0x480 00573 nfs_writepages+0x300/0x4f0 00573 do_writepages+0x12c/0x4a0 00573 __writeback_single_inode+0xd4/0xa68 00573 writeback_sb_inodes+0x470/0xcb0 00573 __writeback_inodes_wb+0xb0/0x1d0 00573 wb_writeback+0x594/0x808 00573 wb_workfn+0x5e8/0x9e0 00573 process_scheduled_works+0x53c/0xd90 00573 worker_thread+0x370/0x8c8 00573 kthread+0x258/0x2e8 00573 ret_from_fork+0x10/0x20 2) Some filesystem writeback routines can end up taking up a lot of stack space (particularly XFS). Instead of risking running over due to the extra overhead from the NFS stack, we should just call these routines from a workqueue job. Since we need to do this to address 1) above we're able to avoid possibly blowing the stack "for free". Use of dedicated workqueues improves performance over using the system_unbound_wq. Also, the creds used to open the file are used to override_creds() in both nfs_local_call_read() and nfs_local_call_write() -- otherwise the workqueue could have elevated capabilities (which the caller may not). Lastly, care is taken to set PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO in nfs_do_local_write() to avoid writeback deadlocks. The PF_LOCAL_THROTTLE flag prevents deadlocks in balance_dirty_pages() by causing writes to only be throttled against other writes to the same bdi (it keeps the throttling local). Normally all writes to bdi(s) are throttled equally (after throughput factors are allowed for). The PF_MEMALLOC_NOIO flag prevents the lower filesystem IO from causing memory reclaim to re-enter filesystems or IO devices and so prevents deadlocks from occuring where IO that cleans pages is waiting on IO to complete. Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> Co-developed-by: Mike Snitzer <snitzer@kernel.org> Signed-off-by: Mike Snitzer <snitzer@kernel.org> Co-developed-by: NeilBrown <neilb@suse.de> Signed-off-by: NeilBrown <neilb@suse.de> # eliminated wait_for_completion Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
1 parent d488b9d commit b9f5dd5

File tree

3 files changed

+91
-38
lines changed

3 files changed

+91
-38
lines changed

fs/nfs/inode.c

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2461,35 +2461,54 @@ static void nfs_destroy_inodecache(void)
24612461
kmem_cache_destroy(nfs_inode_cachep);
24622462
}
24632463

2464+
struct workqueue_struct *nfslocaliod_workqueue;
24642465
struct workqueue_struct *nfsiod_workqueue;
24652466
EXPORT_SYMBOL_GPL(nfsiod_workqueue);
24662467

24672468
/*
2468-
* start up the nfsiod workqueue
2469+
* Destroy the nfsiod workqueues
24692470
*/
2470-
static int nfsiod_start(void)
2471+
static void nfsiod_stop(void)
24712472
{
24722473
struct workqueue_struct *wq;
2473-
dprintk("RPC: creating workqueue nfsiod\n");
2474-
wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
2475-
if (wq == NULL)
2476-
return -ENOMEM;
2477-
nfsiod_workqueue = wq;
2478-
return 0;
2474+
2475+
wq = nfsiod_workqueue;
2476+
if (wq != NULL) {
2477+
nfsiod_workqueue = NULL;
2478+
destroy_workqueue(wq);
2479+
}
2480+
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
2481+
wq = nfslocaliod_workqueue;
2482+
if (wq != NULL) {
2483+
nfslocaliod_workqueue = NULL;
2484+
destroy_workqueue(wq);
2485+
}
2486+
#endif /* CONFIG_NFS_LOCALIO */
24792487
}
24802488

24812489
/*
2482-
* Destroy the nfsiod workqueue
2490+
* Start the nfsiod workqueues
24832491
*/
2484-
static void nfsiod_stop(void)
2492+
static int nfsiod_start(void)
24852493
{
2486-
struct workqueue_struct *wq;
2487-
2488-
wq = nfsiod_workqueue;
2489-
if (wq == NULL)
2490-
return;
2491-
nfsiod_workqueue = NULL;
2492-
destroy_workqueue(wq);
2494+
dprintk("RPC: creating workqueue nfsiod\n");
2495+
nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
2496+
if (nfsiod_workqueue == NULL)
2497+
return -ENOMEM;
2498+
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
2499+
/*
2500+
* localio writes need to use a normal (non-memreclaim) workqueue.
2501+
* When we start getting low on space, XFS goes and calls flush_work() on
2502+
* a non-memreclaim work queue, which causes a priority inversion problem.
2503+
*/
2504+
dprintk("RPC: creating workqueue nfslocaliod\n");
2505+
nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
2506+
if (unlikely(nfslocaliod_workqueue == NULL)) {
2507+
nfsiod_stop();
2508+
return -ENOMEM;
2509+
}
2510+
#endif /* CONFIG_NFS_LOCALIO */
2511+
return 0;
24932512
}
24942513

24952514
unsigned int nfs_net_id;

fs/nfs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ int nfs_check_flags(int);
440440

441441
/* inode.c */
442442
extern struct workqueue_struct *nfsiod_workqueue;
443+
extern struct workqueue_struct *nfslocaliod_workqueue;
443444
extern struct inode *nfs_alloc_inode(struct super_block *sb);
444445
extern void nfs_free_inode(struct inode *);
445446
extern int nfs_write_inode(struct inode *, struct writeback_control *);

fs/nfs/localio.c

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -238,32 +238,47 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
238238
status > 0 ? status : 0, hdr->res.eof);
239239
}
240240

241+
static void nfs_local_call_read(struct work_struct *work)
242+
{
243+
struct nfs_local_kiocb *iocb =
244+
container_of(work, struct nfs_local_kiocb, work);
245+
struct file *filp = iocb->kiocb.ki_filp;
246+
const struct cred *save_cred;
247+
struct iov_iter iter;
248+
ssize_t status;
249+
250+
save_cred = override_creds(filp->f_cred);
251+
252+
nfs_local_iter_init(&iter, iocb, READ);
253+
254+
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
255+
WARN_ON_ONCE(status == -EIOCBQUEUED);
256+
257+
nfs_local_read_done(iocb, status);
258+
nfs_local_pgio_release(iocb);
259+
260+
revert_creds(save_cred);
261+
}
262+
241263
static int
242264
nfs_do_local_read(struct nfs_pgio_header *hdr,
243265
struct nfsd_file *localio,
244266
const struct rpc_call_ops *call_ops)
245267
{
246-
struct file *filp = nfs_to->nfsd_file_file(localio);
247268
struct nfs_local_kiocb *iocb;
248-
struct iov_iter iter;
249-
ssize_t status;
250269

251270
dprintk("%s: vfs_read count=%u pos=%llu\n",
252271
__func__, hdr->args.count, hdr->args.offset);
253272

254273
iocb = nfs_local_iocb_alloc(hdr, localio, GFP_KERNEL);
255274
if (iocb == NULL)
256275
return -ENOMEM;
257-
nfs_local_iter_init(&iter, iocb, READ);
258276

259277
nfs_local_pgio_init(hdr, call_ops);
260278
hdr->res.eof = false;
261279

262-
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
263-
WARN_ON_ONCE(status == -EIOCBQUEUED);
264-
265-
nfs_local_read_done(iocb, status);
266-
nfs_local_pgio_release(iocb);
280+
INIT_WORK(&iocb->work, nfs_local_call_read);
281+
queue_work(nfslocaliod_workqueue, &iocb->work);
267282

268283
return 0;
269284
}
@@ -391,15 +406,40 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
391406
nfs_local_pgio_done(hdr, status);
392407
}
393408

409+
static void nfs_local_call_write(struct work_struct *work)
410+
{
411+
struct nfs_local_kiocb *iocb =
412+
container_of(work, struct nfs_local_kiocb, work);
413+
struct file *filp = iocb->kiocb.ki_filp;
414+
unsigned long old_flags = current->flags;
415+
const struct cred *save_cred;
416+
struct iov_iter iter;
417+
ssize_t status;
418+
419+
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
420+
save_cred = override_creds(filp->f_cred);
421+
422+
nfs_local_iter_init(&iter, iocb, WRITE);
423+
424+
file_start_write(filp);
425+
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
426+
file_end_write(filp);
427+
WARN_ON_ONCE(status == -EIOCBQUEUED);
428+
429+
nfs_local_write_done(iocb, status);
430+
nfs_local_vfs_getattr(iocb);
431+
nfs_local_pgio_release(iocb);
432+
433+
revert_creds(save_cred);
434+
current->flags = old_flags;
435+
}
436+
394437
static int
395438
nfs_do_local_write(struct nfs_pgio_header *hdr,
396439
struct nfsd_file *localio,
397440
const struct rpc_call_ops *call_ops)
398441
{
399-
struct file *filp = nfs_to->nfsd_file_file(localio);
400442
struct nfs_local_kiocb *iocb;
401-
struct iov_iter iter;
402-
ssize_t status;
403443

404444
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
405445
__func__, hdr->args.count, hdr->args.offset,
@@ -408,7 +448,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
408448
iocb = nfs_local_iocb_alloc(hdr, localio, GFP_NOIO);
409449
if (iocb == NULL)
410450
return -ENOMEM;
411-
nfs_local_iter_init(&iter, iocb, WRITE);
412451

413452
switch (hdr->args.stable) {
414453
default:
@@ -423,14 +462,8 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
423462

424463
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
425464

426-
file_start_write(filp);
427-
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
428-
file_end_write(filp);
429-
WARN_ON_ONCE(status == -EIOCBQUEUED);
430-
431-
nfs_local_write_done(iocb, status);
432-
nfs_local_vfs_getattr(iocb);
433-
nfs_local_pgio_release(iocb);
465+
INIT_WORK(&iocb->work, nfs_local_call_write);
466+
queue_work(nfslocaliod_workqueue, &iocb->work);
434467

435468
return 0;
436469
}

0 commit comments

Comments
 (0)