Skip to content

Commit 9d9539d

Browse files
brauner authored and torvalds committed
pidfs: remove config option
As Linus suggested this enables pidfs unconditionally. A key property to retain is the ability to compare pidfds by inode number (cf. [1]). That's extremely helpful just as comparing namespace file descriptors by inode number is. They are used in a variety of scenarios where they need to be compared, e.g., when receiving a pidfd via SO_PEERPIDFD from a socket to trivially authenticate the sender and various other use-cases. For 64bit systems this is pretty trivial to do. For 32bit it's slightly more annoying as we discussed but we simply add a dumb ida based allocator that gets used on 32bit. This gives the same guarantees about inode numbers as on 64bit without any overflow risk. Practically, we'll never run into overflow issues because we're constrained by the number of processes that can exist on 32bit and by the number of open files that can exist on a 32bit system. On 64bit none of this matters and things are very simple. If 32bit also needs the uniqueness guarantee they can simply parse the contents of /proc/<pid>/fd/<nr>. The uniqueness guarantees have a variety of use-cases. One of the most obvious ones is that they will make pidfiles (or "pidfdfiles", I guess) reliable as the unique identifier can be placed into there that won't be recycled. Also a frequent request. Note, I took the chance and simplified path_from_stashed() even further. Instead of passing the inode number explicitly to path_from_stashed() we let the filesystem handle that internally. So path_from_stashed() ends up even simpler than it is now. This is also a good solution allowing the cleanup code to be clean and consistent between 32bit and 64bit. The cleanup path in prepare_anon_dentry() is also switched around so we put the inode before the dentry allocation. This means we only have to call the cleanup handler for the filesystem's inode data once and can rely on ->evict_inode() otherwise.
Aside from having to have a bit of extra code for 32bit it actually ends up a nice cleanup for path_from_stashed() imho. Tested on both 32 and 64bit including error injection. Link: systemd/systemd#31713 [1] Link: https://lore.kernel.org/r/20240312-dingo-sehnlich-b3ecc35c6de7@brauner Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent ce0c1c9 commit 9d9539d

File tree

8 files changed

+78
-93
lines changed

8 files changed

+78
-93
lines changed

fs/Kconfig

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,6 @@ source "fs/proc/Kconfig"
173173
source "fs/kernfs/Kconfig"
174174
source "fs/sysfs/Kconfig"
175175

176-
config FS_PID
177-
bool "Pseudo filesystem for process file descriptors"
178-
depends on 64BIT
179-
default y
180-
help
181-
Pidfs implements advanced features for process file descriptors.
182-
183176
config TMPFS
184177
bool "Tmpfs virtual memory file system support (former shm fs)"
185178
depends on SHMEM

fs/internal.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,8 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
313313
void mnt_idmap_put(struct mnt_idmap *idmap);
314314
struct stashed_operations {
315315
void (*put_data)(void *data);
316-
void (*init_inode)(struct inode *inode, void *data);
316+
int (*init_inode)(struct inode *inode, void *data);
317317
};
318-
int path_from_stashed(struct dentry **stashed, unsigned long ino,
319-
struct vfsmount *mnt, void *data, struct path *path);
318+
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
319+
struct path *path);
320320
void stashed_dentry_prune(struct dentry *dentry);

fs/libfs.c

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2001,34 +2001,40 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
20012001
}
20022002

20032003
static struct dentry *prepare_anon_dentry(struct dentry **stashed,
2004-
unsigned long ino,
20052004
struct super_block *sb,
20062005
void *data)
20072006
{
20082007
struct dentry *dentry;
20092008
struct inode *inode;
20102009
const struct stashed_operations *sops = sb->s_fs_info;
2011-
2012-
dentry = d_alloc_anon(sb);
2013-
if (!dentry)
2014-
return ERR_PTR(-ENOMEM);
2010+
int ret;
20152011

20162012
inode = new_inode_pseudo(sb);
20172013
if (!inode) {
2018-
dput(dentry);
2014+
sops->put_data(data);
20192015
return ERR_PTR(-ENOMEM);
20202016
}
20212017

2022-
inode->i_ino = ino;
20232018
inode->i_flags |= S_IMMUTABLE;
20242019
inode->i_mode = S_IFREG;
20252020
simple_inode_init_ts(inode);
2026-
sops->init_inode(inode, data);
2021+
2022+
ret = sops->init_inode(inode, data);
2023+
if (ret < 0) {
2024+
iput(inode);
2025+
return ERR_PTR(ret);
2026+
}
20272027

20282028
/* Notice when this is changed. */
20292029
WARN_ON_ONCE(!S_ISREG(inode->i_mode));
20302030
WARN_ON_ONCE(!IS_IMMUTABLE(inode));
20312031

2032+
dentry = d_alloc_anon(sb);
2033+
if (!dentry) {
2034+
iput(inode);
2035+
return ERR_PTR(-ENOMEM);
2036+
}
2037+
20322038
/* Store address of location where dentry's supposed to be stashed. */
20332039
dentry->d_fsdata = stashed;
20342040

@@ -2062,7 +2068,6 @@ static struct dentry *stash_dentry(struct dentry **stashed,
20622068
/**
20632069
* path_from_stashed - create path from stashed or new dentry
20642070
* @stashed: where to retrieve or stash dentry
2065-
* @ino: inode number to use
20662071
* @mnt: mnt of the filesystems to use
20672072
* @data: data to store in inode->i_private
20682073
* @path: path to create
@@ -2077,8 +2082,8 @@ static struct dentry *stash_dentry(struct dentry **stashed,
20772082
*
20782083
* Return: On success zero and on failure a negative error is returned.
20792084
*/
2080-
int path_from_stashed(struct dentry **stashed, unsigned long ino,
2081-
struct vfsmount *mnt, void *data, struct path *path)
2085+
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
2086+
struct path *path)
20822087
{
20832088
struct dentry *dentry;
20842089
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
@@ -2091,11 +2096,9 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
20912096
}
20922097

20932098
/* Allocate a new dentry. */
2094-
dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
2095-
if (IS_ERR(dentry)) {
2096-
sops->put_data(data);
2099+
dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
2100+
if (IS_ERR(dentry))
20972101
return PTR_ERR(dentry);
2098-
}
20992102

21002103
/* Added a new dentry. @data is now owned by the filesystem. */
21012104
path->dentry = stash_dentry(stashed, dentry);

fs/nsfs.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
5656
if (!ns)
5757
return -ENOENT;
5858

59-
return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path);
59+
return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
6060
}
6161

6262
struct ns_get_path_task_args {
@@ -101,8 +101,7 @@ int open_related_ns(struct ns_common *ns,
101101
return PTR_ERR(relative);
102102
}
103103

104-
err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
105-
relative, &path);
104+
err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
106105
if (err < 0) {
107106
put_unused_fd(fd);
108107
return err;
@@ -199,11 +198,15 @@ static const struct super_operations nsfs_ops = {
199198
.show_path = nsfs_show_path,
200199
};
201200

202-
static void nsfs_init_inode(struct inode *inode, void *data)
201+
static int nsfs_init_inode(struct inode *inode, void *data)
203202
{
203+
struct ns_common *ns = data;
204+
204205
inode->i_private = data;
205206
inode->i_mode |= S_IRUGO;
206207
inode->i_fop = &ns_file_operations;
208+
inode->i_ino = ns->inum;
209+
return 0;
207210
}
208211

209212
static void nsfs_put_data(void *data)

fs/pidfs.c

Lines changed: 47 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,6 @@
1616

1717
#include "internal.h"
1818

19-
static int pidfd_release(struct inode *inode, struct file *file)
20-
{
21-
#ifndef CONFIG_FS_PID
22-
struct pid *pid = file->private_data;
23-
24-
file->private_data = NULL;
25-
put_pid(pid);
26-
#endif
27-
return 0;
28-
}
29-
3019
#ifdef CONFIG_PROC_FS
3120
/**
3221
* pidfd_show_fdinfo - print information about a pidfd
@@ -120,7 +109,6 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
120109
}
121110

122111
static const struct file_operations pidfs_file_operations = {
123-
.release = pidfd_release,
124112
.poll = pidfd_poll,
125113
#ifdef CONFIG_PROC_FS
126114
.show_fdinfo = pidfd_show_fdinfo,
@@ -131,16 +119,45 @@ struct pid *pidfd_pid(const struct file *file)
131119
{
132120
if (file->f_op != &pidfs_file_operations)
133121
return ERR_PTR(-EBADF);
134-
#ifdef CONFIG_FS_PID
135122
return file_inode(file)->i_private;
136-
#else
137-
return file->private_data;
138-
#endif
139123
}
140124

141-
#ifdef CONFIG_FS_PID
142125
static struct vfsmount *pidfs_mnt __ro_after_init;
143126

127+
#if BITS_PER_LONG == 32
128+
/*
129+
* Provide a fallback mechanism for 32-bit systems so processes remain
130+
* reliably comparable by inode number even on those systems.
131+
*/
132+
static DEFINE_IDA(pidfd_inum_ida);
133+
134+
static int pidfs_inum(struct pid *pid, unsigned long *ino)
135+
{
136+
int ret;
137+
138+
ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
139+
UINT_MAX, GFP_ATOMIC);
140+
if (ret < 0)
141+
return -ENOSPC;
142+
143+
*ino = ret;
144+
return 0;
145+
}
146+
147+
static inline void pidfs_free_inum(unsigned long ino)
148+
{
149+
if (ino > 0)
150+
ida_free(&pidfd_inum_ida, ino);
151+
}
152+
#else
153+
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
154+
{
155+
*ino = pid->ino;
156+
return 0;
157+
}
158+
#define pidfs_free_inum(ino) ((void)(ino))
159+
#endif
160+
144161
/*
145162
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
146163
* implemented. Let's reject it completely until we have a clean
@@ -173,6 +190,7 @@ static void pidfs_evict_inode(struct inode *inode)
173190

174191
clear_inode(inode);
175192
put_pid(pid);
193+
pidfs_free_inum(inode->i_ino);
176194
}
177195

178196
static const struct super_operations pidfs_sops = {
@@ -183,8 +201,10 @@ static const struct super_operations pidfs_sops = {
183201

184202
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
185203
{
186-
return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
187-
d_inode(dentry)->i_ino);
204+
struct inode *inode = d_inode(dentry);
205+
struct pid *pid = inode->i_private;
206+
207+
return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
188208
}
189209

190210
static const struct dentry_operations pidfs_dentry_operations = {
@@ -193,13 +213,19 @@ static const struct dentry_operations pidfs_dentry_operations = {
193213
.d_prune = stashed_dentry_prune,
194214
};
195215

196-
static void pidfs_init_inode(struct inode *inode, void *data)
216+
static int pidfs_init_inode(struct inode *inode, void *data)
197217
{
198218
inode->i_private = data;
199219
inode->i_flags |= S_PRIVATE;
200220
inode->i_mode |= S_IRWXU;
201221
inode->i_op = &pidfs_inode_operations;
202222
inode->i_fop = &pidfs_file_operations;
223+
/*
224+
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This
225+
* avoids collisions with the root inode which is 1 for pseudo
226+
* filesystems.
227+
*/
228+
return pidfs_inum(data, &inode->i_ino);
203229
}
204230

205231
static void pidfs_put_data(void *data)
@@ -240,13 +266,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
240266
struct path path;
241267
int ret;
242268

243-
/*
244-
* Inode numbering for pidfs start at RESERVED_PIDS + 1.
245-
* This avoids collisions with the root inode which is 1
246-
* for pseudo filesystems.
247-
*/
248-
ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
249-
get_pid(pid), &path);
269+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
250270
if (ret < 0)
251271
return ERR_PTR(ret);
252272

@@ -261,30 +281,3 @@ void __init pidfs_init(void)
261281
if (IS_ERR(pidfs_mnt))
262282
panic("Failed to mount pidfs pseudo filesystem");
263283
}
264-
265-
bool is_pidfs_sb(const struct super_block *sb)
266-
{
267-
return sb == pidfs_mnt->mnt_sb;
268-
}
269-
270-
#else /* !CONFIG_FS_PID */
271-
272-
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
273-
{
274-
struct file *pidfd_file;
275-
276-
pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
277-
flags | O_RDWR);
278-
if (IS_ERR(pidfd_file))
279-
return pidfd_file;
280-
281-
get_pid(pid);
282-
return pidfd_file;
283-
}
284-
285-
void __init pidfs_init(void) { }
286-
bool is_pidfs_sb(const struct super_block *sb)
287-
{
288-
return false;
289-
}
290-
#endif

include/linux/pid.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
* find_pid_ns() using the int nr and struct pid_namespace *ns.
4646
*/
4747

48+
#define RESERVED_PIDS 300
49+
4850
struct upid {
4951
int nr;
5052
struct pid_namespace *ns;
@@ -55,10 +57,8 @@ struct pid
5557
refcount_t count;
5658
unsigned int level;
5759
spinlock_t lock;
58-
#ifdef CONFIG_FS_PID
5960
struct dentry *stashed;
60-
unsigned long ino;
61-
#endif
61+
u64 ino;
6262
/* lists of tasks that use this pid */
6363
struct hlist_head tasks[PIDTYPE_MAX];
6464
struct hlist_head inodes;

include/linux/pidfs.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,5 @@
44

55
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
66
void __init pidfs_init(void);
7-
bool is_pidfs_sb(const struct super_block *sb);
87

98
#endif /* _LINUX_PID_FS_H */

kernel/pid.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,17 +62,13 @@ struct pid init_struct_pid = {
6262

6363
int pid_max = PID_MAX_DEFAULT;
6464

65-
#define RESERVED_PIDS 300
66-
6765
int pid_max_min = RESERVED_PIDS + 1;
6866
int pid_max_max = PID_MAX_LIMIT;
69-
#ifdef CONFIG_FS_PID
7067
/*
7168
* Pseudo filesystems start inode numbering after one. We use Reserved
7269
* PIDs as a natural offset.
7370
*/
7471
static u64 pidfs_ino = RESERVED_PIDS;
75-
#endif
7672

7773
/*
7874
* PID-map pages start out as NULL, they get allocated upon
@@ -280,10 +276,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
280276
spin_lock_irq(&pidmap_lock);
281277
if (!(ns->pid_allocated & PIDNS_ADDING))
282278
goto out_unlock;
283-
#ifdef CONFIG_FS_PID
284279
pid->stashed = NULL;
285280
pid->ino = ++pidfs_ino;
286-
#endif
287281
for ( ; upid >= pid->numbers; --upid) {
288282
/* Make the PID visible to find_pid_ns. */
289283
idr_replace(&upid->ns->idr, pid, upid->nr);

0 commit comments

Comments
 (0)