Skip to content

Commit 8883957

Browse files
committed
Merge tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
Pull fsnotify pre-content notification support from Jan Kara: "This introduces a new fsnotify event (FS_PRE_ACCESS) that gets generated before a file's contents are accessed. The event is synchronous, so if there is a listener for this event, the kernel waits for a reply. On success, execution continues as usual; on failure, we propagate the error to userspace. This allows userspace to fill in file content on demand from slow storage. The context in which the events are generated has been picked so that we don't hold any locks and thus there's no risk of a deadlock for the userspace handler. The new pre-content event is available only for users with the global CAP_SYS_ADMIN capability (similarly to other parts of fanotify functionality), and it is the administrator's responsibility to make sure the userspace event handler doesn't do stupid stuff that can DoS the system. Based on your feedback from the last submission, the fsnotify code has been improved and now file->f_mode encodes whether a pre-content event needs to be generated for the file, so the fast path when nobody wants pre-content events for the file just grows an additional file->f_mode check. As a bonus, this also removes from the fast path the checks for whether the old FS_ACCESS event needs to be generated. Also, the place where the event is generated during a page fault has been moved, so now filemap_fault() generates the event if and only if there is no uptodate folio in the page cache. 
Also, we have dropped the FS_PRE_MODIFY event, as current real-world users of the pre-content functionality don't really use it, so let's start with the minimal useful feature set." * tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs: (21 commits) fanotify: Fix crash in fanotify_init(2); fs: don't block write during exec on pre-content watched files; fs: enable pre-content events on supported file systems; ext4: add pre-content fsnotify hook for DAX faults; btrfs: disable defrag on pre-content watched files; xfs: add pre-content fsnotify hook for DAX faults; fsnotify: generate pre-content permission event on page fault; mm: don't allow huge faults for files with pre content watches; fanotify: disable readahead if we have pre-content watches; fanotify: allow to set errno in FAN_DENY permission response; fanotify: report file range info with pre-content events; fanotify: introduce FAN_PRE_ACCESS permission event; fsnotify: generate pre-content permission event on truncate; fsnotify: pass optional file access range in pre-content event; fsnotify: introduce pre-content permission events; fanotify: reserve event bit of deprecated FAN_DIR_MODIFY; fanotify: rename a misnamed constant; fanotify: don't skip extra event info if no info_mode is set; fsnotify: check if file is actually being watched for pre-content events on open; fsnotify: opt-in for permission events at file open time; ...
2 parents fb6fec6 + 0c0214d commit 8883957

File tree

28 files changed

+669
-106
lines changed

28 files changed

+669
-106
lines changed

fs/binfmt_elf.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,7 +1257,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
12571257
}
12581258
reloc_func_desc = interp_load_addr;
12591259

1260-
allow_write_access(interpreter);
1260+
exe_file_allow_write_access(interpreter);
12611261
fput(interpreter);
12621262

12631263
kfree(interp_elf_ex);
@@ -1354,7 +1354,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
13541354
kfree(interp_elf_ex);
13551355
kfree(interp_elf_phdata);
13561356
out_free_file:
1357-
allow_write_access(interpreter);
1357+
exe_file_allow_write_access(interpreter);
13581358
if (interpreter)
13591359
fput(interpreter);
13601360
out_free_ph:

fs/binfmt_elf_fdpic.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
394394
goto error;
395395
}
396396

397-
allow_write_access(interpreter);
397+
exe_file_allow_write_access(interpreter);
398398
fput(interpreter);
399399
interpreter = NULL;
400400
}
@@ -467,7 +467,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
467467

468468
error:
469469
if (interpreter) {
470-
allow_write_access(interpreter);
470+
exe_file_allow_write_access(interpreter);
471471
fput(interpreter);
472472
}
473473
kfree(interpreter_name);

fs/btrfs/ioctl.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2544,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
25442544
goto out;
25452545
}
25462546

2547+
/*
2548+
* Don't allow defrag on pre-content watched files, as it could
2549+
* populate the page cache with 0's via readahead.
2550+
*/
2551+
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
2552+
ret = -EINVAL;
2553+
goto out;
2554+
}
2555+
25472556
if (argp) {
25482557
if (copy_from_user(&range, argp, sizeof(range))) {
25492558
ret = -EFAULT;

fs/btrfs/super.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb,
961961
#endif
962962
sb->s_xattr = btrfs_xattr_handlers;
963963
sb->s_time_gran = 1;
964-
sb->s_iflags |= SB_I_CGROUPWB;
964+
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
965965

966966
err = super_setup_bdi(sb);
967967
if (err) {

fs/exec.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
913913
path_noexec(&file->f_path))
914914
return ERR_PTR(-EACCES);
915915

916-
err = deny_write_access(file);
916+
err = exe_file_deny_write_access(file);
917917
if (err)
918918
return ERR_PTR(err);
919919

@@ -928,7 +928,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
928928
* Returns ERR_PTR on failure or allocated struct file on success.
929929
*
930930
* As this is a wrapper for the internal do_open_execat(), callers
931-
* must call allow_write_access() before fput() on release. Also see
931+
* must call exe_file_allow_write_access() before fput() on release. Also see
932932
* do_close_execat().
933933
*/
934934
struct file *open_exec(const char *name)
@@ -1493,7 +1493,7 @@ static void do_close_execat(struct file *file)
14931493
{
14941494
if (!file)
14951495
return;
1496-
allow_write_access(file);
1496+
exe_file_allow_write_access(file);
14971497
fput(file);
14981498
}
14991499

@@ -1822,7 +1822,7 @@ static int exec_binprm(struct linux_binprm *bprm)
18221822
bprm->file = bprm->interpreter;
18231823
bprm->interpreter = NULL;
18241824

1825-
allow_write_access(exec);
1825+
exe_file_allow_write_access(exec);
18261826
if (unlikely(bprm->have_execfd)) {
18271827
if (bprm->executable) {
18281828
fput(exec);

fs/ext4/file.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,9 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
756756
return VM_FAULT_SIGBUS;
757757
}
758758
} else {
759+
result = filemap_fsnotify_fault(vmf);
760+
if (unlikely(result))
761+
return result;
759762
filemap_invalidate_lock_shared(mapping);
760763
}
761764
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);

fs/ext4/super.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5301,6 +5301,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
53015301
/* i_version is always enabled now */
53025302
sb->s_flags |= SB_I_VERSION;
53035303

5304+
/* HSM events are allowed by default. */
5305+
sb->s_iflags |= SB_I_ALLOW_HSM;
5306+
53045307
err = ext4_check_feature_compatibility(sb, es, silent);
53055308
if (err)
53065309
goto failed_mount;

fs/fcntl.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1158,10 +1158,10 @@ static int __init fcntl_init(void)
11581158
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
11591159
* is defined as O_NONBLOCK on some platforms and not on others.
11601160
*/
1161-
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
1161+
BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
11621162
HWEIGHT32(
11631163
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
1164-
__FMODE_EXEC | __FMODE_NONOTIFY));
1164+
__FMODE_EXEC));
11651165

11661166
fasync_cache = kmem_cache_create("fasync_cache",
11671167
sizeof(struct fasync_struct), 0,

fs/notify/fanotify/fanotify.c

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
223223
struct fanotify_perm_event *event,
224224
struct fsnotify_iter_info *iter_info)
225225
{
226-
int ret;
226+
int ret, errno;
227227

228228
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
229229

@@ -262,14 +262,23 @@ static int fanotify_get_response(struct fsnotify_group *group,
262262
ret = 0;
263263
break;
264264
case FAN_DENY:
265+
/* Check custom errno from pre-content events */
266+
errno = fanotify_get_response_errno(event->response);
267+
if (errno) {
268+
ret = -errno;
269+
break;
270+
}
271+
fallthrough;
265272
default:
266273
ret = -EPERM;
267274
}
268275

269276
/* Check if the response should be audited */
270-
if (event->response & FAN_AUDIT)
271-
audit_fanotify(event->response & ~FAN_AUDIT,
272-
&event->audit_rule);
277+
if (event->response & FAN_AUDIT) {
278+
u32 response = event->response &
279+
(FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS);
280+
audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule);
281+
}
273282

274283
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
275284
group, event, ret);
@@ -548,9 +557,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
548557
return &pevent->fae;
549558
}
550559

551-
static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
560+
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
561+
int data_type,
552562
gfp_t gfp)
553563
{
564+
const struct path *path = fsnotify_data_path(data, data_type);
565+
const struct file_range *range =
566+
fsnotify_data_file_range(data, data_type);
554567
struct fanotify_perm_event *pevent;
555568

556569
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
@@ -564,6 +577,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
564577
pevent->hdr.len = 0;
565578
pevent->state = FAN_EVENT_INIT;
566579
pevent->path = *path;
580+
/* NULL ppos means no range info */
581+
pevent->ppos = range ? &range->pos : NULL;
582+
pevent->count = range ? range->count : 0;
567583
path_get(path);
568584

569585
return &pevent->fae;
@@ -801,7 +817,7 @@ static struct fanotify_event *fanotify_alloc_event(
801817
old_memcg = set_active_memcg(group->memcg);
802818

803819
if (fanotify_is_perm_event(mask)) {
804-
event = fanotify_alloc_perm_event(path, gfp);
820+
event = fanotify_alloc_perm_event(data, data_type, gfp);
805821
} else if (fanotify_is_error_event(mask)) {
806822
event = fanotify_alloc_error_event(group, fsid, data,
807823
data_type, &hash);
@@ -909,8 +925,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
909925
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
910926
BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
911927
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
928+
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
912929

913-
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
930+
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
914931

915932
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
916933
mask, data, data_type, dir);

fs/notify/fanotify/fanotify.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event)
425425
struct fanotify_perm_event {
426426
struct fanotify_event fae;
427427
struct path path;
428+
const loff_t *ppos; /* optional file range info */
429+
size_t count;
428430
u32 response; /* userspace answer to the event */
429431
unsigned short state; /* state of the event */
430432
int fd; /* fd we passed to userspace for this event */
@@ -446,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask)
446448
mask & FANOTIFY_PERM_EVENTS;
447449
}
448450

451+
static inline bool fanotify_event_has_access_range(struct fanotify_event *event)
452+
{
453+
if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS))
454+
return false;
455+
456+
return FANOTIFY_PERM(event)->ppos;
457+
}
458+
449459
static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
450460
{
451461
return container_of(fse, struct fanotify_event, fse);
@@ -518,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
518528

519529
return mflags;
520530
}
531+
532+
static inline u32 fanotify_get_response_errno(int res)
533+
{
534+
return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK;
535+
}

0 commit comments

Comments
 (0)