Skip to content

Commit cb12fd8

Browse files
committed
pidfd: add pidfs
This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo list for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows:

* statx() on pidfds becomes useful for the first time.
* pidfds can be compared simply via statx() and then comparing inode numbers.
* pidfds have unique inode numbers for the system lifetime.
* struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close.
* file->private_data is freed up for per-file options for pidfds.
* Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. This is in contrast to now, where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile(), which creates new inodes, we'd still be associating the same struct pid with multiple different inodes.

The tiny pseudo filesystem is not visible anywhere in userspace, exactly like e.g., pipefs and sockfs. There's no lookup, there are no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed.

We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number, which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes, which reuse the same inode for thousands of dentries. For pidfds we're talking far fewer than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands.
I know that systemd uses separate pidfds for the same struct pid for various complex process tracking issues. So I think with this change things actually become much simpler, especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted.

The code is entirely optional and fairly small. If it's not selected we fall back to anonymous inodes.

Heavily inspired by nsfs, which uses a similar stashing mechanism just for namespaces.

Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 50f4f2d commit cb12fd8

File tree

9 files changed

+188
-17
lines changed

9 files changed

+188
-17
lines changed

fs/Kconfig

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,13 @@ source "fs/proc/Kconfig"
174174
source "fs/kernfs/Kconfig"
175175
source "fs/sysfs/Kconfig"
176176

177+
config FS_PID
178+
bool "Pseudo filesystem for process file descriptors"
179+
depends on 64BIT
180+
default y
181+
help
182+
Pidfs implements advanced features for process file descriptors.
183+
177184
config TMPFS
178185
bool "Tmpfs virtual memory file system support (former shm fs)"
179186
depends on SHMEM

fs/pidfs.c

Lines changed: 153 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
// SPDX-License-Identifier: GPL-2.0
2+
#include <linux/anon_inodes.h>
23
#include <linux/file.h>
34
#include <linux/fs.h>
45
#include <linux/magic.h>
56
#include <linux/mount.h>
67
#include <linux/pid.h>
8+
#include <linux/pidfs.h>
79
#include <linux/pid_namespace.h>
810
#include <linux/poll.h>
911
#include <linux/proc_fs.h>
@@ -14,10 +16,12 @@
1416

1517
static int pidfd_release(struct inode *inode, struct file *file)
1618
{
19+
#ifndef CONFIG_FS_PID
1720
struct pid *pid = file->private_data;
1821

1922
file->private_data = NULL;
2023
put_pid(pid);
24+
#endif
2125
return 0;
2226
}
2327

@@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
5963
*/
6064
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
6165
{
62-
struct pid *pid = f->private_data;
66+
struct pid *pid = pidfd_pid(f);
6367
struct pid_namespace *ns;
6468
pid_t nr = -1;
6569

@@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
9397
*/
9498
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
9599
{
96-
struct pid *pid = file->private_data;
100+
struct pid *pid = pidfd_pid(file);
97101
bool thread = file->f_flags & PIDFD_THREAD;
98102
struct task_struct *task;
99103
__poll_t poll_flags = 0;
@@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
113117
return poll_flags;
114118
}
115119

116-
const struct file_operations pidfd_fops = {
120+
static const struct file_operations pidfs_file_operations = {
117121
.release = pidfd_release,
118122
.poll = pidfd_poll,
119123
#ifdef CONFIG_PROC_FS
120124
.show_fdinfo = pidfd_show_fdinfo,
121125
#endif
122126
};
127+
128+
struct pid *pidfd_pid(const struct file *file)
129+
{
130+
if (file->f_op != &pidfs_file_operations)
131+
return ERR_PTR(-EBADF);
132+
#ifdef CONFIG_FS_PID
133+
return file_inode(file)->i_private;
134+
#else
135+
return file->private_data;
136+
#endif
137+
}
138+
139+
#ifdef CONFIG_FS_PID
140+
static struct vfsmount *pidfs_mnt __ro_after_init;
141+
static struct super_block *pidfs_sb __ro_after_init;
142+
143+
/*
144+
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
145+
* implemented. Let's reject it completely until we have a clean
146+
* permission concept for pidfds.
147+
*/
148+
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
149+
struct iattr *attr)
150+
{
151+
return -EOPNOTSUPP;
152+
}
153+
154+
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
155+
struct kstat *stat, u32 request_mask,
156+
unsigned int query_flags)
157+
{
158+
struct inode *inode = d_inode(path->dentry);
159+
160+
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
161+
return 0;
162+
}
163+
164+
static const struct inode_operations pidfs_inode_operations = {
165+
.getattr = pidfs_getattr,
166+
.setattr = pidfs_setattr,
167+
};
168+
169+
static void pidfs_evict_inode(struct inode *inode)
170+
{
171+
struct pid *pid = inode->i_private;
172+
173+
clear_inode(inode);
174+
put_pid(pid);
175+
}
176+
177+
static const struct super_operations pidfs_sops = {
178+
.drop_inode = generic_delete_inode,
179+
.evict_inode = pidfs_evict_inode,
180+
.statfs = simple_statfs,
181+
};
182+
183+
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
184+
{
185+
return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
186+
d_inode(dentry)->i_ino);
187+
}
188+
189+
static const struct dentry_operations pidfs_dentry_operations = {
190+
.d_delete = always_delete_dentry,
191+
.d_dname = pidfs_dname,
192+
};
193+
194+
static int pidfs_init_fs_context(struct fs_context *fc)
195+
{
196+
struct pseudo_fs_context *ctx;
197+
198+
ctx = init_pseudo(fc, PID_FS_MAGIC);
199+
if (!ctx)
200+
return -ENOMEM;
201+
202+
ctx->ops = &pidfs_sops;
203+
ctx->dops = &pidfs_dentry_operations;
204+
return 0;
205+
}
206+
207+
static struct file_system_type pidfs_type = {
208+
.name = "pidfs",
209+
.init_fs_context = pidfs_init_fs_context,
210+
.kill_sb = kill_anon_super,
211+
};
212+
213+
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
214+
{
215+
216+
struct inode *inode;
217+
struct file *pidfd_file;
218+
219+
inode = iget_locked(pidfs_sb, pid->ino);
220+
if (!inode)
221+
return ERR_PTR(-ENOMEM);
222+
223+
if (inode->i_state & I_NEW) {
224+
/*
225+
* Inode numbering for pidfs starts at RESERVED_PIDS + 1.
226+
* This avoids collisions with the root inode which is 1
227+
* for pseudo filesystems.
228+
*/
229+
inode->i_ino = pid->ino;
230+
inode->i_mode = S_IFREG | S_IRUGO;
231+
inode->i_op = &pidfs_inode_operations;
232+
inode->i_fop = &pidfs_file_operations;
233+
inode->i_flags |= S_IMMUTABLE;
234+
inode->i_private = get_pid(pid);
235+
simple_inode_init_ts(inode);
236+
unlock_new_inode(inode);
237+
}
238+
239+
pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags,
240+
&pidfs_file_operations);
241+
if (IS_ERR(pidfd_file))
242+
iput(inode);
243+
244+
return pidfd_file;
245+
}
246+
247+
void __init pidfs_init(void)
248+
{
249+
pidfs_mnt = kern_mount(&pidfs_type);
250+
if (IS_ERR(pidfs_mnt))
251+
panic("Failed to mount pidfs pseudo filesystem");
252+
253+
pidfs_sb = pidfs_mnt->mnt_sb;
254+
}
255+
256+
#else /* !CONFIG_FS_PID */
257+
258+
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
259+
{
260+
struct file *pidfd_file;
261+
262+
pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
263+
flags | O_RDWR);
264+
if (IS_ERR(pidfd_file))
265+
return pidfd_file;
266+
267+
get_pid(pid);
268+
return pidfd_file;
269+
}
270+
271+
void __init pidfs_init(void) { }
272+
#endif

include/linux/pid.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ struct pid
5555
refcount_t count;
5656
unsigned int level;
5757
spinlock_t lock;
58+
#ifdef CONFIG_FS_PID
59+
unsigned long ino;
60+
#endif
5861
/* lists of tasks that use this pid */
5962
struct hlist_head tasks[PIDTYPE_MAX];
6063
struct hlist_head inodes;
@@ -66,8 +69,6 @@ struct pid
6669

6770
extern struct pid init_struct_pid;
6871

69-
extern const struct file_operations pidfd_fops;
70-
7172
struct file;
7273

7374
struct pid *pidfd_pid(const struct file *file);

include/linux/pidfs.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
#ifndef _LINUX_PID_FS_H
3+
#define _LINUX_PID_FS_H
4+
5+
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
6+
void __init pidfs_init(void);
7+
8+
#endif /* _LINUX_PID_FS_H */

include/uapi/linux/magic.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,5 +101,6 @@
101101
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
102102
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
103103
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
104+
#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
104105

105106
#endif /* __LINUX_MAGIC_H__ */

init/main.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
#include <linux/init_syscalls.h>
100100
#include <linux/stackdepot.h>
101101
#include <linux/randomize_kstack.h>
102+
#include <linux/pidfs.h>
102103
#include <net/net_namespace.h>
103104

104105
#include <asm/io.h>
@@ -1059,6 +1060,7 @@ void start_kernel(void)
10591060
seq_file_init();
10601061
proc_root_init();
10611062
nsfs_init();
1063+
pidfs_init();
10621064
cpuset_init();
10631065
cgroup_init();
10641066
taskstats_init_early();

kernel/fork.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@
102102
#include <linux/iommu.h>
103103
#include <linux/rseq.h>
104104
#include <uapi/linux/pidfd.h>
105+
#include <linux/pidfs.h>
105106

106107
#include <asm/pgalloc.h>
107108
#include <linux/uaccess.h>
@@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p)
19851986
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
19861987
}
19871988

1988-
struct pid *pidfd_pid(const struct file *file)
1989-
{
1990-
if (file->f_op == &pidfd_fops)
1991-
return file->private_data;
1992-
1993-
return ERR_PTR(-EBADF);
1994-
}
1995-
19961989
/**
19971990
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
19981991
* @pid: the struct pid for which to create a pidfd
@@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
20302023
if (pidfd < 0)
20312024
return pidfd;
20322025

2033-
pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2034-
flags | O_RDWR);
2026+
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
20352027
if (IS_ERR(pidfd_file)) {
20362028
put_unused_fd(pidfd);
20372029
return PTR_ERR(pidfd_file);
20382030
}
2039-
get_pid(pid); /* held by pidfd_file now */
20402031
/*
20412032
* anon_inode_getfile() ignores everything outside of the
20422033
* O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.

kernel/nsproxy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
573573
if (proc_ns_file(f.file))
574574
err = validate_ns(&nsset, ns);
575575
else
576-
err = validate_nsset(&nsset, f.file->private_data);
576+
err = validate_nsset(&nsset, pidfd_pid(f.file));
577577
if (!err) {
578578
commit_nsset(&nsset);
579579
perf_event_namespaces(current);

kernel/pid.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <linux/sched/signal.h>
4343
#include <linux/sched/task.h>
4444
#include <linux/idr.h>
45+
#include <linux/pidfs.h>
4546
#include <net/sock.h>
4647
#include <uapi/linux/pidfd.h>
4748

@@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT;
6566

6667
int pid_max_min = RESERVED_PIDS + 1;
6768
int pid_max_max = PID_MAX_LIMIT;
69+
#ifdef CONFIG_FS_PID
70+
/*
71+
* Pseudo filesystems start inode numbering after one. We use Reserved
72+
* PIDs as a natural offset.
73+
*/
74+
static u64 pidfs_ino = RESERVED_PIDS;
75+
#endif
6876

6977
/*
7078
* PID-map pages start out as NULL, they get allocated upon
@@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
272280
spin_lock_irq(&pidmap_lock);
273281
if (!(ns->pid_allocated & PIDNS_ADDING))
274282
goto out_unlock;
283+
#ifdef CONFIG_FS_PID
284+
pid->ino = ++pidfs_ino;
285+
#endif
275286
for ( ; upid >= pid->numbers; --upid) {
276287
/* Make the PID visible to find_pid_ns. */
277288
idr_replace(&upid->ns->idr, pid, upid->nr);

0 commit comments

Comments
 (0)