Skip to content

Commit 2401892

Browse files
committed
Merge patch series "acct: don't allow access to internal filesystems"
Christian Brauner <brauner@kernel.org> says: In [1] it was reported that the acct(2) system call can be used to trigger a NULL deref in cases where it is set to write to a file that triggers an internal lookup. This can happen, e.g., when pointing acct(2) to /sys/power/resume. At the point where the write to this file happens, the calling task has already exited and called exit_fs(), but an internal lookup might be triggered through lookup_bdev(). This may trigger a NULL deref when accessing current->fs. This series does two things: - Reorganize the code so that the final write happens from the workqueue but with the caller's credentials. This preserves the (strange) permission model and has almost no regression risk. - Block access to kernel internal filesystems as well as procfs and sysfs in the first place. This API should cease to exist, imho. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1] * patches from https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org: acct: block access to kernel internal filesystems acct: perform last write from workqueue Link: https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents a64dcfb + 890ed45 commit 2401892

File tree

1 file changed

+84
-50
lines changed

1 file changed

+84
-50
lines changed

kernel/acct.c

Lines changed: 84 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -103,48 +103,50 @@ struct bsd_acct_struct {
103103
atomic_long_t count;
104104
struct rcu_head rcu;
105105
struct mutex lock;
106-
int active;
106+
bool active;
107+
bool check_space;
107108
unsigned long needcheck;
108109
struct file *file;
109110
struct pid_namespace *ns;
110111
struct work_struct work;
111112
struct completion done;
113+
acct_t ac;
112114
};
113115

114-
static void do_acct_process(struct bsd_acct_struct *acct);
116+
static void fill_ac(struct bsd_acct_struct *acct);
117+
static void acct_write_process(struct bsd_acct_struct *acct);
115118

116119
/*
117120
* Check the amount of free space and suspend/resume accordingly.
118121
*/
119-
static int check_free_space(struct bsd_acct_struct *acct)
122+
static bool check_free_space(struct bsd_acct_struct *acct)
120123
{
121124
struct kstatfs sbuf;
122125

123-
if (time_is_after_jiffies(acct->needcheck))
124-
goto out;
126+
if (!acct->check_space)
127+
return acct->active;
125128

126129
/* May block */
127130
if (vfs_statfs(&acct->file->f_path, &sbuf))
128-
goto out;
131+
return acct->active;
129132

130133
if (acct->active) {
131134
u64 suspend = sbuf.f_blocks * SUSPEND;
132135
do_div(suspend, 100);
133136
if (sbuf.f_bavail <= suspend) {
134-
acct->active = 0;
137+
acct->active = false;
135138
pr_info("Process accounting paused\n");
136139
}
137140
} else {
138141
u64 resume = sbuf.f_blocks * RESUME;
139142
do_div(resume, 100);
140143
if (sbuf.f_bavail >= resume) {
141-
acct->active = 1;
144+
acct->active = true;
142145
pr_info("Process accounting resumed\n");
143146
}
144147
}
145148

146149
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
147-
out:
148150
return acct->active;
149151
}
150152

@@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
189191
{
190192
struct bsd_acct_struct *acct = to_acct(pin);
191193
mutex_lock(&acct->lock);
192-
do_acct_process(acct);
194+
/*
195+
* Fill the accounting struct with the exiting task's info
196+
* before punting to the workqueue.
197+
*/
198+
fill_ac(acct);
193199
schedule_work(&acct->work);
194200
wait_for_completion(&acct->done);
195201
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
202208
{
203209
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
204210
struct file *file = acct->file;
211+
212+
/* We were fired by acct_pin_kill() which holds acct->lock. */
213+
acct_write_process(acct);
205214
if (file->f_op->flush)
206215
file->f_op->flush(file, NULL);
207216
__fput_sync(file);
@@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
234243
return -EACCES;
235244
}
236245

246+
/* Exclude kernel internal filesystems. */
247+
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
248+
kfree(acct);
249+
filp_close(file, NULL);
250+
return -EINVAL;
251+
}
252+
253+
/* Exclude procfs and sysfs. */
254+
if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
255+
kfree(acct);
256+
filp_close(file, NULL);
257+
return -EINVAL;
258+
}
259+
237260
if (!(file->f_mode & FMODE_CAN_WRITE)) {
238261
kfree(acct);
239262
filp_close(file, NULL);
@@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
430453
* do_exit() or when switching to a different output file.
431454
*/
432455

433-
static void fill_ac(acct_t *ac)
456+
static void fill_ac(struct bsd_acct_struct *acct)
434457
{
435458
struct pacct_struct *pacct = &current->signal->pacct;
459+
struct file *file = acct->file;
460+
acct_t *ac = &acct->ac;
436461
u64 elapsed, run_time;
437462
time64_t btime;
438463
struct tty_struct *tty;
439464

465+
lockdep_assert_held(&acct->lock);
466+
467+
if (time_is_after_jiffies(acct->needcheck)) {
468+
acct->check_space = false;
469+
470+
/* Don't fill in @ac if nothing will be written. */
471+
if (!acct->active)
472+
return;
473+
} else {
474+
acct->check_space = true;
475+
}
476+
440477
/*
441478
* Fill the accounting struct with the needed info as recorded
442479
* by the different kernel functions.
@@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
484521
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
485522
ac->ac_exitcode = pacct->ac_exitcode;
486523
spin_unlock_irq(&current->sighand->siglock);
487-
}
488-
/*
489-
* do_acct_process does all actual work. Caller holds the reference to file.
490-
*/
491-
static void do_acct_process(struct bsd_acct_struct *acct)
492-
{
493-
acct_t ac;
494-
unsigned long flim;
495-
const struct cred *orig_cred;
496-
struct file *file = acct->file;
497524

498-
/*
499-
* Accounting records are not subject to resource limits.
500-
*/
501-
flim = rlimit(RLIMIT_FSIZE);
502-
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
503-
/* Perform file operations on behalf of whoever enabled accounting */
504-
orig_cred = override_creds(file->f_cred);
505-
506-
/*
507-
* First check to see if there is enough free_space to continue
508-
* the process accounting system.
509-
*/
510-
if (!check_free_space(acct))
511-
goto out;
512-
513-
fill_ac(&ac);
514525
/* we really need to bite the bullet and change layout */
515-
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
516-
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
526+
ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
527+
ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
517528
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
518529
/* backward-compatible 16 bit fields */
519-
ac.ac_uid16 = ac.ac_uid;
520-
ac.ac_gid16 = ac.ac_gid;
530+
ac->ac_uid16 = ac->ac_uid;
531+
ac->ac_gid16 = ac->ac_gid;
521532
#elif ACCT_VERSION == 3
522533
{
523534
struct pid_namespace *ns = acct->ns;
524535

525-
ac.ac_pid = task_tgid_nr_ns(current, ns);
536+
ac->ac_pid = task_tgid_nr_ns(current, ns);
526537
rcu_read_lock();
527-
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
528-
ns);
538+
ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
529539
rcu_read_unlock();
530540
}
531541
#endif
542+
}
543+
544+
static void acct_write_process(struct bsd_acct_struct *acct)
545+
{
546+
struct file *file = acct->file;
547+
const struct cred *cred;
548+
acct_t *ac = &acct->ac;
549+
550+
/* Perform file operations on behalf of whoever enabled accounting */
551+
cred = override_creds(file->f_cred);
552+
532553
/*
533-
* Get freeze protection. If the fs is frozen, just skip the write
534-
* as we could deadlock the system otherwise.
554+
* First check to see if there is enough free_space to continue
555+
* the process accounting system. Then get freeze protection. If
556+
* the fs is frozen, just skip the write as we could deadlock
557+
* the system otherwise.
535558
*/
536-
if (file_start_write_trylock(file)) {
559+
if (check_free_space(acct) && file_start_write_trylock(file)) {
537560
/* it's been opened O_APPEND, so position is irrelevant */
538561
loff_t pos = 0;
539-
__kernel_write(file, &ac, sizeof(acct_t), &pos);
562+
__kernel_write(file, ac, sizeof(acct_t), &pos);
540563
file_end_write(file);
541564
}
542-
out:
565+
566+
revert_creds(cred);
567+
}
568+
569+
static void do_acct_process(struct bsd_acct_struct *acct)
570+
{
571+
unsigned long flim;
572+
573+
/* Accounting records are not subject to resource limits. */
574+
flim = rlimit(RLIMIT_FSIZE);
575+
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
576+
fill_ac(acct);
577+
acct_write_process(acct);
543578
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
544-
revert_creds(orig_cred);
545579
}
546580

547581
/**

0 commit comments

Comments
 (0)