From 9b128c578957a6e2012f0015fc361d9a40e884d3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:18 +0200 Subject: [PATCH 01/37] fanotify: store fsid in mark instead of in connector ANBZ: #33963 commit 7232522e6cafdf466ed7649c14546fd07ccc1978 upstream Some filesystems like fuse and nfs have zero or non-unique fsid. We would like to avoid reporting ambiguous fsid in events, so we need to avoid marking objects with same fsid and different sb. To make this easier to enforce, store the fsid in the marks of the group instead of in the shared conenctor. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-2-amir73il@gmail.com> [ Gao Xiang: needed by "fsnotify: rename fsnotify_{get,put}_sb_connectors()". ] Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.c | 19 +++-------- fs/notify/fanotify/fanotify.h | 10 ++++++ fs/notify/fanotify/fanotify_user.c | 18 ++++++++--- fs/notify/mark.c | 52 +++++------------------------- include/linux/fsnotify_backend.h | 13 +++----- 5 files changed, 42 insertions(+), 70 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 723ff9cad9ed..46eb6345a870 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -844,9 +844,8 @@ static struct fanotify_event *fanotify_alloc_event( } /* - * Get cached fsid of the filesystem containing the object from any connector. - * All connectors are supposed to have the same fsid, but we do not verify that - * here. + * Get cached fsid of the filesystem containing the object from any mark. + * All marks are supposed to have the same fsid, but we do not verify that here. */ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) { @@ -855,17 +854,9 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - struct fsnotify_mark_connector *conn; - - conn = READ_ONCE(mark->connector); - /* Mark is just getting destroyed or created? */ - if (!conn) - continue; - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; - /* Pairs with smp_wmb() in fsnotify_add_mark_list() */ - smp_rmb(); - fsid = conn->fsid; + fsid = FANOTIFY_MARK(mark)->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -1074,7 +1065,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { - kmem_cache_free(fanotify_mark_cache, fsn_mark); + kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark)); } const struct fsnotify_ops fanotify_fsnotify_ops = { diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index e8a3c28c5d12..88367e7b41dc 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -489,6 +489,16 @@ static inline unsigned int fanotify_event_hash_bucket( return event->hash & FANOTIFY_HTABLE_MASK; } +struct fanotify_mark { + struct fsnotify_mark fsn_mark; + __kernel_fsid_t fsid; +}; + +static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) +{ + return container_of(mark, struct fanotify_mark, fsn_mark); +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f4798d613dc2..3e38eabe8775 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1210,6 +1210,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; + struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; @@ -1222,17 +1223,26 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) { + fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fan_mark) { ret = -ENOMEM; goto out_dec_ucounts; } + mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); + /* Cache fsid of filesystem containing the marked object */ + if (fsid) { + fan_mark->fsid = *fsid; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + } else { + fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; + } + + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); if (ret) { fsnotify_put_mark(mark); goto out_dec_ucounts; @@ -1946,7 +1956,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4be6e883d492..c8f804814117 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -563,8 +563,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type, - __kernel_fsid_t *fsid) + unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -576,14 +575,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->flags = 0; conn->type = obj_type; conn->obj = connp; - /* Cache fsid of filesystem containing the object */ - if (fsid) { - conn->fsid = *fsid; - conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID; - } else { - conn->fsid.val[0] = conn->fsid.val[1] = 0; - conn->flags = 0; - } + conn->flags = 0; fsnotify_get_sb_connectors(conn); /* @@ -634,8 +626,7 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector( */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -645,41 +636,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; - /* Backend is expected to check for zero fsid (e.g. tmpfs) */ - if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) - return -ENODEV; - restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type, - fsid); + err = fsnotify_attach_connector_to_object(connp, obj_type); if (err) return err; goto restart; - } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) { - conn->fsid = *fsid; - /* Pairs with smp_rmb() in fanotify_get_fsid() */ - smp_wmb(); - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID; - } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) && - (fsid->val[0] != conn->fsid.val[0] || - fsid->val[1] != conn->fsid.val[1])) { - /* - * Backend is expected to check for non uniform fsid - * (e.g. btrfs), but maybe we missed something? - * Only allow setting conn->fsid once to non zero fsid. - * inotify and non-fid fanotify groups do not set nor test - * conn->fsid. - */ - pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " - "%x.%x != %x.%x\n", __func__, conn->type, - fsid->val[0], fsid->val[1], - conn->fsid.val[0], conn->fsid.val[1]); - err = -EXDEV; - goto out_err; } /* is mark the first mark? */ @@ -729,7 +694,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -749,7 +714,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); if (ret) goto err; @@ -768,14 +733,13 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 575415b51349..0b6b83ea8be8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ -#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ - __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; @@ -530,6 +528,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 +#define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 unsigned int flags; /* flags [mark->lock] */ }; @@ -765,11 +764,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid); + int add_flags); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid); + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, @@ -777,15 +775,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, - NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } /* given a group and a mark, flag mark to be freed when all references are dropped */ -- Gitee From cdcf114d2e3e93ae897bfbf95778fd15e3ff25d8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:19 +0200 Subject: [PATCH 02/37] fanotify: allow "weak" fsid when watching a single filesystem ANBZ: #33963 commit 30ad1938326bf9303ca38090339d948975a626f5 upstream So far, fanotify returns -ENODEV or -EXDEV when trying to set a mark on a filesystem with a "weak" fsid, namely, zero fsid (e.g. fuse), or non-uniform fsid (e.g. btrfs non-root subvol). When group is watching inodes all from the same filesystem (or subvol), allow adding inode marks with "weak" fsid, because there is no ambiguity regarding which filesystem reports the event. The first mark added to a group determines if this group is single or multi filesystem, depending on the fsid at the path of the added mark. If the first mark added has a "strong" fsid, marks with "weak" fsid cannot be added and vice versa. If the first mark added has a "weak" fsid, following marks must have the same "weak" fsid and the same sb as the first mark. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-3-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.c | 15 +--- fs/notify/fanotify/fanotify.h | 6 ++ fs/notify/fanotify/fanotify_user.c | 112 +++++++++++++++++++++++------ include/linux/fsnotify_backend.h | 1 + 4 files changed, 101 insertions(+), 33 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 46eb6345a870..de7f5ce1fef9 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path) hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); } -static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, - __kernel_fsid_t *fsid2) -{ - return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; -} - static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) { return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ @@ -857,7 +851,8 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; fsid = FANOTIFY_MARK(mark)->fsid; - if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) && + WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; } @@ -939,12 +934,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, return 0; } - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { + if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) fsid = fanotify_get_fsid(iter_info); - /* Racing with mark destruction or creation? */ - if (!fsid.val[0] && !fsid.val[1]) - return 0; - } event = fanotify_alloc_event(group, mask, data, data_type, dir, file_name, &fsid, match_mask); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 88367e7b41dc..444507b1ad0a 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,12 @@ static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) return container_of(mark, struct fanotify_mark, fsn_mark); } +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3e38eabe8775..dc469c2fa51b 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -23,7 +23,7 @@ #include -#include "../../mount.h" +#include "../fsnotify.h" #include "../fdinfo.h" #include "fanotify.h" @@ -1203,11 +1203,68 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return recalc; } +struct fan_fsid { + struct super_block *sb; + __kernel_fsid_t id; + bool weak; +}; + +static int fanotify_set_mark_fsid(struct fsnotify_group *group, + struct fsnotify_mark *mark, + struct fan_fsid *fsid) +{ + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *old; + struct super_block *old_sb = NULL; + + FANOTIFY_MARK(mark)->fsid = fsid->id; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + if (fsid->weak) + mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; + + /* First mark added will determine if group is single or multi fsid */ + if (list_empty(&group->marks_list)) + return 0; + + /* Find sb of an existing mark */ + list_for_each_entry(old, &group->marks_list, g_list) { + conn = READ_ONCE(old->connector); + if (!conn) + continue; + old_sb = fsnotify_connector_sb(conn); + if (old_sb) + break; + } + + /* Only detached marks left? */ + if (!old_sb) + return 0; + + /* Do not allow mixing of marks with weak and strong fsid */ + if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) + return -EXDEV; + + /* Allow mixing of marks with strong fsid from different fs */ + if (!fsid->weak) + return 0; + + /* Do not allow mixing marks with weak fsid from different fs */ + if (old_sb != fsid->sb) + return -EXDEV; + + /* Do not allow mixing marks from different btrfs sub-volumes */ + if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, + &FANOTIFY_MARK(mark)->fsid)) + return -EXDEV; + + return 0; +} + static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; struct fanotify_mark *fan_mark; @@ -1236,20 +1293,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, /* Cache fsid of filesystem containing the marked object */ if (fsid) { - fan_mark->fsid = *fsid; - mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + ret = fanotify_set_mark_fsid(group, mark, fsid); + if (ret) + goto out_put_mark; } else { fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); - if (ret) { - fsnotify_put_mark(mark); - goto out_dec_ucounts; - } + if (ret) + goto out_put_mark; return mark; +out_put_mark: + fsnotify_put_mark(mark); out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); @@ -1300,7 +1358,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; @@ -1348,7 +1406,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); @@ -1356,7 +1414,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, static int fanotify_add_sb_mark(struct fsnotify_group *group, struct super_block *sb, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); @@ -1364,7 +1422,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group, static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -1575,20 +1633,25 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return fd; } -static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, + struct fan_fsid *fsid) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(dentry, fsid); + err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; - if (!fsid->val[0] && !fsid->val[1]) - return -ENODEV; + fsid->sb = dentry->d_sb; + if (!fsid->id.val[0] && !fsid->id.val[1]) { + err = -ENODEV; + goto weak; + } /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) @@ -1598,11 +1661,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) if (err) return err; - if (root_fsid.val[0] != fsid->val[0] || - root_fsid.val[1] != fsid->val[1]) - return -EXDEV; + if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { + err = -EXDEV; + goto weak; + } + fsid->weak = false; return 0; + +weak: + /* Allow weak fsid when marking inodes */ + fsid->weak = true; + return (mark_type == FAN_MARK_INODE) ? 0 : err; } /* Check if filesystem can encode a unique fid */ @@ -1686,7 +1756,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - __kernel_fsid_t __fsid, *fsid = NULL; + struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; @@ -1838,7 +1908,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fsid(path.dentry, &__fsid); + ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0b6b83ea8be8..9734b76bc585 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -529,6 +529,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 +#define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; -- Gitee From c8006e5d033b1838d0f81defb757c7d56fc2efcb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:38 +0200 Subject: [PATCH 03/37] fsnotify: split fsnotify_perm() into two hooks ANBZ: #33963 commit 36e28c42187c95eb148873ffb059bfdcb8cdb75b upstream We would like to make changes to the fsnotify access permission hook - add file range arguments and add the pre modify event. In preparation for these changes, split the fsnotify_perm() hook into fsnotify_open_perm() and fsnotify_file_perm(). This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-4-amir73il@gmail.com Signed-off-by: Christian Brauner Signed-off-by: Gao Xiang --- include/linux/fsnotify.h | 34 +++++++++++++++++++--------------- security/security.c | 4 ++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 912b58bb66c5..dfc5972f7644 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -106,29 +106,33 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } -/* Simple call site for access decisions */ -static inline int fsnotify_perm(struct file *file, int mask) +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) { - int ret; - __u32 fsnotify_mask = 0; + __u32 fsnotify_mask = FS_ACCESS_PERM; - if (!(mask & (MAY_READ | MAY_OPEN))) + if (!(perm_mask & MAY_READ)) return 0; - if (mask & MAY_OPEN) { - fsnotify_mask = FS_OPEN_PERM; + return fsnotify_file(file, fsnotify_mask); +} - if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); +/* + * fsnotify_open_perm - permission hook before file open + */ +static inline int fsnotify_open_perm(struct file *file) +{ + int ret; - if (ret) - return ret; - } - } else if (mask & MAY_READ) { - fsnotify_mask = FS_ACCESS_PERM; + if (file->f_flags & __FMODE_EXEC) { + ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + if (ret) + return ret; } - return fsnotify_file(file, fsnotify_mask); + return fsnotify_file(file, FS_OPEN_PERM); } /* diff --git a/security/security.c b/security/security.c index bfd2172ecb3c..815b9ab6ff6d 100644 --- a/security/security.c +++ b/security/security.c @@ -2600,7 +2600,7 @@ int security_file_permission(struct file *file, int mask) if (ret) return ret; - return fsnotify_perm(file, mask); + return fsnotify_file_perm(file, mask); } /** @@ -2869,7 +2869,7 @@ int security_file_open(struct file *file) if (ret) return ret; - return fsnotify_perm(file, MAY_OPEN); + return fsnotify_open_perm(file); } /** -- Gitee From 46c7179bbd49ad56414de7abf011b6b7227032dc Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:39 +0200 Subject: [PATCH 04/37] fsnotify: assert that file_start_write() is not held in permission hooks ANBZ: #33963 commit cb383f06686734ef04daf63a4369566800717b7b upstream filesystem may be modified in the context of fanotify permission events (e.g. by HSM service), so assert that sb freeze protection is not held. If the assertion fails, then the following deadlock would be possible: CPU0 CPU1 CPU2 ------------------------------------------------------------------------- file_start_write()#0 ... fsnotify_perm() fanotify_get_response() => (read event and fill file) ... ... freeze_super() ... sb_wait_write() ... vfs_write() file_start_write()#1 This example demonstrates a use case of an hierarchical storage management (HSM) service that uses fanotify permission events to fill the content of a file before access, while a 3rd process starts fsfreeze. This creates a circular dependeny: file_start_write()#0 => fanotify_get_response => file_start_write()#1 => sb_wait_write() => file_end_write()#0 Where file_end_write()#0 can never be called and none of the threads can make progress. The assertion is checked for both MAY_READ and MAY_WRITE permission hooks in preparation for a pre-modify permission event. The assertion is not checked for an open permission event, because do_open() takes mnt_want_write() in O_TRUNC case, meaning that it is not safe to write to filesystem in the content of an open permission event. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-5-amir73il@gmail.com Signed-off-by: Christian Brauner Signed-off-by: Gao Xiang --- include/linux/fsnotify.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index dfc5972f7644..037f8a9d393c 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -113,6 +113,13 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { __u32 fsnotify_mask = FS_ACCESS_PERM; + /* + * filesystem may be modified in the context of permission events + * (e.g. by HSM filling a file on access), so sb freeze protection + * must not be held. + */ + lockdep_assert_once(file_write_not_started(file)); + if (!(perm_mask & MAY_READ)) return 0; -- Gitee From 6ab67a2acff0df726de2c82128ff30d0ca7b28d8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:40 +0200 Subject: [PATCH 05/37] fsnotify: optionally pass access range in file permission hooks ANBZ: #33963 commit d9e5d31084b024734e64307521414ef0ae1d5333 upstream In preparation for pre-content permission events with file access range, move fsnotify_file_perm() hook out of security_file_permission() and into the callers. Callers that have the access range information call the new hook fsnotify_file_area_perm() with the access range. Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-6-amir73il@gmail.com Signed-off-by: Christian Brauner Signed-off-by: Gao Xiang --- fs/open.c | 4 ++++ fs/read_write.c | 10 ++++++++-- fs/readdir.c | 4 ++++ fs/remap_range.c | 8 +++++++- include/linux/fsnotify.h | 13 +++++++++++-- security/security.c | 8 +------- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/open.c b/fs/open.c index 704b8dfee259..4bdafb322a7c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -303,6 +303,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; diff --git a/fs/read_write.c b/fs/read_write.c index 4a7c722b7952..e7b9567e3049 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -354,6 +354,9 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { + int mask = read_write == READ ? MAY_READ : MAY_WRITE; + int ret; + if (unlikely((ssize_t) count < 0)) return -EINVAL; @@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } } - return security_file_permission(file, - read_write == READ ? MAY_READ : MAY_WRITE); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); diff --git a/fs/readdir.c b/fs/readdir.c index c8c46e294431..278bc0254732 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; + res = fsnotify_file_perm(file, MAY_READ); + if (res) + goto out; + res = down_read_killable(&inode->i_rwsem); if (res) goto out; diff --git a/fs/remap_range.c b/fs/remap_range.c index 0c2c9fab0266..7a59d2a57258 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; + int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, &pos, len); } /* diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 037f8a9d393c..123d8c81edf0 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -107,9 +107,10 @@ static inline int fsnotify_file(struct file *file, __u32 mask) } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_area_perm - permission hook before access to file range */ -static inline int fsnotify_file_perm(struct file *file, int perm_mask) +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) { __u32 fsnotify_mask = FS_ACCESS_PERM; @@ -126,6 +127,14 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) return fsnotify_file(file, fsnotify_mask); } +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); +} + /* * fsnotify_open_perm - permission hook before file open */ diff --git a/security/security.c b/security/security.c index 815b9ab6ff6d..cb60fdcf78eb 100644 --- a/security/security.c +++ b/security/security.c @@ -2594,13 +2594,7 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir, */ int security_file_permission(struct file *file, int mask) { - int ret; - - ret = call_int_hook(file_permission, 0, file, mask); - if (ret) - return ret; - - return fsnotify_file_perm(file, mask); + return call_int_hook(file_permission, 0, file, mask); } /** -- Gitee From b885fed35b870e205609ded3a08b27a8539cb31f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 9 Jan 2024 20:22:45 +0200 Subject: [PATCH 06/37] fsnotify: compile out fsnotify permission hooks if !FANOTIFY_ACCESS_PERMISSIONS ANBZ: #33963 commit 7ea26f9460c6c76b1d6e36f39fce34b16cb88300 upstream The depency of FANOTIFY_ACCESS_PERMISSIONS on SECURITY made sure that the fsnotify permission hooks were never called when SECURITY was disabled. Moving the fsnotify permission hook out of the secutiy hook broke that optimisation. Reported-and-tested-by: Jens Axboe Closes: https://lore.kernel.org/linux-fsdevel/53682ece-f0e7-48de-9a1c-879ee34b0449@kernel.dk/ Fixes: d9e5d31084b0 ("fsnotify: optionally pass access range in file permission hooks") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240109182245.38884-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner Signed-off-by: Gao Xiang --- include/linux/fsnotify.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 123d8c81edf0..418c83f26565 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -106,6 +106,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * fsnotify_file_area_perm - permission hook before access to file range */ @@ -151,6 +152,24 @@ static inline int fsnotify_open_perm(struct file *file) return fsnotify_file(file, FS_OPEN_PERM); } +#else +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) +{ + return 0; +} + +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return 0; +} + +static inline int fsnotify_open_perm(struct file *file) +{ + return 0; +} +#endif + /* * fsnotify_link_count - inode's link count changed */ -- Gitee From dd7c21b1a3cd939f00157aa52f11435e6e21df76 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 16 Jan 2024 13:32:47 +0200 Subject: [PATCH 07/37] fsnotify: optimize the case of no parent watcher ANBZ: #33963 commit 082fd1ea1f98e6bb1189213a2404ddd774de3843 upstream If parent inode is not watching, check for the event in masks of sb/mount/inode masks early to optimize out most of the code in __fsnotify_parent() and avoid calling fsnotify(). Jens has reported that this optimization improves BW and IOPS in an io_uring benchmark by more than 10% and reduces perf reported CPU usage. before: + 4.51% io_uring [kernel.vmlinux] [k] fsnotify + 3.67% io_uring [kernel.vmlinux] [k] __fsnotify_parent after: + 2.37% io_uring [kernel.vmlinux] [k] __fsnotify_parent Reported-and-tested-by: Jens Axboe Link: https://lore.kernel.org/linux-fsdevel/b45bd8ff-5654-4e67-90a6-aad5e6759e0b@kernel.dk/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240116113247.758848-1-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9cc4ebb53504..374d5dbc0130 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -152,7 +152,7 @@ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, } /* Are inode/sb/mount interested in parent and name info with this event? */ -static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, +static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; @@ -171,13 +171,22 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); - if (mnt) - marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask); + marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } +/* Are there any inode/mount/sb objects that are interested in this event? */ +static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, + __u32 mask) +{ + __u32 marks_mask = inode->i_fsnotify_mask | mnt_mask | + inode->i_sb->s_fsnotify_mask; + + return mask & marks_mask & ALL_FSNOTIFY_EVENTS; +} + /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with @@ -190,7 +199,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); - struct mount *mnt = path ? real_mount(path->mnt) : NULL; + __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; @@ -201,16 +210,13 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, struct qstr *file_name = NULL; int ret = 0; - /* - * Do inode/sb/mount care about parent and name info on non-dir? - * Do they care about any event at all? - */ - if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks && - (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched) + /* Optimize the likely case of nobody watching this path */ + if (likely(!parent_watched && + !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; - parent_needed = fsnotify_event_needs_parent(inode, mnt, mask); + parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; -- Gitee From d8467002c392bd0c32b9828a4f25cdf8ef093412 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 24 Jan 2024 16:29:33 +0100 Subject: [PATCH 08/37] fsnotify: Add fsnotify_sb_has_watchers() helper ANBZ: #33963 commit b7dbaace39713025f1fd33407c89651a0c09f667 upstream Instead of opencoded checks for number of fsnotify connectors add a helper fsnotify_sb_has_watchers(). Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Gao Xiang --- include/linux/fsnotify.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 418c83f26565..f79853d2778b 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,6 +17,12 @@ #include #include +/* Are there any inode/mount/sb objects that are being watched at all? */ +static inline bool fsnotify_sb_has_watchers(struct super_block *sb) +{ + return atomic_long_read(&sb->s_fsnotify_connectors); +} + /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may @@ -30,7 +36,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { - if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); @@ -44,7 +50,7 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, static inline void fsnotify_inode(struct inode *inode, __u32 mask) { - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) @@ -59,7 +65,7 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, { struct inode *inode = d_inode(dentry); - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { -- Gitee From 1835278c2542d374daaf6b5d40d91d396c92822b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:45 +0200 Subject: [PATCH 09/37] fsnotify: rename fsnotify_{get,put}_sb_connectors() ANBZ: #33963 commit d2f277e26f521ccf6fb438463b41dba6123caabe upstream Instead of counting the number of connectors in an sb, we would like to count the number of watched objects per priority group. As a start, create an accessor fsnotify_sb_watched_objects() to s_fsnotify_connectors and rename the fsnotify_{get,put}_sb_connectors() helpers to fsnotify_{get,put}_sb_watchers() to better describes the counter. Increment the counter at the end of fsnotify_attach_connector_to_object() if connector was attached instead of decrementing it on race to connect. This is fine, because fsnotify_delete_sb() cannot be running in parallel to fsnotify_attach_connector_to_object() which requires a reference to a filesystem object. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-2-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 4 +- fs/notify/mark.c | 67 ++++++++++++++++++-------------- include/linux/fsnotify.h | 2 +- include/linux/fsnotify_backend.h | 5 +++ 4 files changed, 45 insertions(+), 33 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 374d5dbc0130..c50ce0ea6ec1 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -92,8 +92,8 @@ void fsnotify_sb_delete(struct super_block *sb) fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ - wait_var_event(&sb->s_fsnotify_connectors, - !atomic_long_read(&sb->s_fsnotify_connectors)); + wait_var_event(fsnotify_sb_watched_objects(sb), + !atomic_long_read(fsnotify_sb_watched_objects(sb))); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c8f804814117..d5c069f45495 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -116,10 +116,43 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) return *fsnotify_conn_mask_p(conn); } +static void fsnotify_get_sb_watched_objects(struct super_block *sb) +{ + atomic_long_inc(fsnotify_sb_watched_objects(sb)); +} + +static void fsnotify_put_sb_watched_objects(struct super_block *sb) +{ + if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb))) + wake_up_var(fsnotify_sb_watched_objects(sb)); +} + static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); - atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); + fsnotify_get_sb_watched_objects(inode->i_sb); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + fsnotify_put_sb_watched_objects(inode->i_sb); + iput(inode); +} + +static void fsnotify_get_sb_watchers(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + fsnotify_get_sb_watched_objects(sb); +} + +static void fsnotify_put_sb_watchers(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + fsnotify_put_sb_watched_objects(sb); } /* @@ -239,31 +272,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static void fsnotify_put_inode_ref(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - -static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb) - atomic_long_inc(&sb->s_fsnotify_connectors); -} - -static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -287,7 +295,7 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } - fsnotify_put_sb_connectors(conn); + fsnotify_put_sb_watchers(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -575,8 +583,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->flags = 0; conn->type = obj_type; conn->obj = connp; - conn->flags = 0; - fsnotify_get_sb_connectors(conn); /* * cmpxchg() provides the barrier so that readers of *connp can see @@ -584,10 +590,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ - fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); + return 0; } + fsnotify_get_sb_watchers(conn); return 0; } diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index f79853d2778b..b22069e450f8 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -20,7 +20,7 @@ /* Are there any inode/mount/sb objects that are being watched at all? */ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) { - return atomic_long_read(&sb->s_fsnotify_connectors); + return atomic_long_read(fsnotify_sb_watched_objects(sb)); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9734b76bc585..3c7b4f719561 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -483,6 +483,11 @@ struct fsnotify_mark_connector { struct hlist_head list; }; +static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) +{ + return &sb->s_fsnotify_connectors; +} + /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events -- Gitee From b630530f1ce6d923aa5b91f28a84f1df7f55b8d6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:46 +0200 Subject: [PATCH 10/37] fsnotify: create helpers to get sb and connp from object ANBZ: #33963 commit f115815d75332f9dabb0d7c29c8f67b0f26889c5 upstream In preparation to passing an object pointer to add/remove/find mark helpers, create helpers to get sb and connp by object type. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-3-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fsnotify.h | 15 +++++++++++++++ fs/notify/mark.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 2b4267de86e6..df7d4b1fcd68 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb( return container_of(conn->obj, struct super_block, s_fsnotify_marks); } +static inline struct super_block *fsnotify_object_sb(void *obj, + enum fsnotify_obj_type obj_type) +{ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return ((struct inode *)obj)->i_sb; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return ((struct vfsmount *)obj)->mnt_sb; + case FSNOTIFY_OBJ_TYPE_SB: + return (struct super_block *)obj; + default: + return NULL; + } +} + static inline struct super_block *fsnotify_connector_sb( struct fsnotify_mark_connector *conn) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d5c069f45495..1091f3ba2374 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,6 +97,21 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) refcount_inc(&mark->refcnt); } +static fsnotify_connp_t *fsnotify_object_connp(void *obj, + enum fsnotify_obj_type obj_type) +{ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return &((struct inode *)obj)->i_fsnotify_marks; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return &real_mount(obj)->mnt_fsnotify_marks; + case FSNOTIFY_OBJ_TYPE_SB: + return &((struct super_block *)obj)->s_fsnotify_marks; + default: + return NULL; + } +} + static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) -- Gitee From 96d6bf346795ea5dd207fcf9e4bb3045d556e343 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:47 +0200 Subject: [PATCH 11/37] fsnotify: create a wrapper fsnotify_find_inode_mark() ANBZ: #33963 commit 230d97d39ee2eb9030309f04f98615aaeb420dac upstream In preparation to passing an object pointer to fsnotify_find_mark(), add a wrapper fsnotify_find_inode_mark() and use it where possible. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-4-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/nfsd/filecache.c | 4 ++-- fs/notify/dnotify/dnotify.c | 4 ++-- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 7 +++++++ kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2419cf330210..00fa6969fb53 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -162,8 +162,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) do { fsnotify_group_lock(nfsd_file_fsnotify_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, - nfsd_file_fsnotify_group); + mark = fsnotify_find_inode_mark(inode, + nfsd_file_fsnotify_group); if (mark) { nfm = nfsd_file_mark_get(container_of(mark, struct nfsd_file_mark, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..c5a19554ea85 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -163,7 +163,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1c4bfdab008d..16871e8475e6 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -545,7 +545,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int create = (arg & IN_MASK_CREATE); int ret; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3c7b4f719561..f5bd10880aca 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -791,6 +791,13 @@ static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } +static inline struct fsnotify_mark *fsnotify_find_inode_mark( + struct inode *inode, + struct fsnotify_group *group) +{ + return fsnotify_find_mark(&inode->i_fsnotify_marks, group); +} + /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84..328e6ec7c90a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -463,7 +463,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) int n; fsnotify_group_lock(audit_tree_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + mark = fsnotify_find_inode_mark(inode, audit_tree_group); if (!mark) return create_chunk(inode, tree); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7a98cd176a12..7f358740e958 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -90,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); + entry = fsnotify_find_inode_mark(inode, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); -- Gitee From b20e08f151c8abc3312374e678348112e30a9d69 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:48 +0200 Subject: [PATCH 12/37] fanotify: merge two checks regarding add of ignore mark ANBZ: #33963 commit b5cae086cc2fde629495d988106d70e44c90cb20 upstream There are two similar checks for adding an ignore mark without FAN_MARK_IGNORED_SURV_MODIFY, one for the old FAN_MARK_IGNORED flag and one for the new FAN_MARK_IGNORE flag. Merge the two checks into a single location. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-5-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 35 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index dc469c2fa51b..7f261d4fa331 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1424,18 +1424,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags, struct fan_fsid *fsid) { - pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); - - /* - * If some other task has this inode open for write we should not add - * an ignore mask, unless that ignore mask is supposed to survive - * modification changes anyway. - */ - if ((flags & FANOTIFY_MARK_IGNORE_BITS) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && - inode_is_open_for_write(inode)) - return 0; - return fanotify_add_mark(group, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); } @@ -1925,12 +1913,23 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; - ret = mnt ? -EINVAL : -EISDIR; - /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ - if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode)) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) - goto path_put_and_out; + /* + * If some other task has this inode open for write we should not add + * an ignore mask, unless that ignore mask is supposed to survive + * modification changes anyway. + */ + if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode))) + goto path_put_and_out; + + ret = 0; + if (inode && inode_is_open_for_write(inode)) + goto path_put_and_out; + } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { -- Gitee From 7a785e2d18c8a1acedb8cb5187db75694cf59590 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:49 +0200 Subject: [PATCH 13/37] fsnotify: pass object pointer and type to fsnotify mark helpers ANBZ: #33963 commit 687c217c6aa2c24e6ddf98cc7f86a5b986e5918d upstream Instead of passing fsnotify_connp_t, pass the pointer to the marked object. Store the object pointer in the connector and move the definition of fsnotify_connp_t to internal fsnotify subsystem API, so it is no longer used by fsnotify backends. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-6-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 95 +++++++----------------------- fs/notify/fsnotify.h | 23 ++++---- fs/notify/mark.c | 28 +++++---- include/linux/fsnotify_backend.h | 33 ++++------- 4 files changed, 59 insertions(+), 120 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7f261d4fa331..d90184ab340e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1088,7 +1088,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, } static int fanotify_remove_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, __u32 mask, + void *obj, unsigned int obj_type, __u32 mask, unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; @@ -1096,7 +1096,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group, int destroy_mark; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { fsnotify_group_unlock(group); return -ENOENT; @@ -1117,30 +1117,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group, return 0; } -static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - mask, flags, umask); -} - -static int fanotify_remove_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, - flags, umask); -} - -static int fanotify_remove_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, - flags, umask); -} - static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { @@ -1261,7 +1237,7 @@ static int fanotify_set_mark_fsid(struct fsnotify_group *group, } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, + void *obj, unsigned int obj_type, unsigned int fan_flags, struct fan_fsid *fsid) @@ -1300,7 +1276,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); + ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); if (ret) goto out_put_mark; @@ -1356,7 +1332,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, } static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int obj_type, + void *obj, unsigned int obj_type, __u32 mask, unsigned int fan_flags, struct fan_fsid *fsid) { @@ -1365,9 +1341,9 @@ static int fanotify_add_mark(struct fsnotify_group *group, int ret = 0; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, obj_type, + fsn_mark = fanotify_add_new_mark(group, obj, obj_type, fan_flags, fsid); if (IS_ERR(fsn_mark)) { fsnotify_group_unlock(group); @@ -1404,30 +1380,6 @@ static int fanotify_add_mark(struct fsnotify_group *group, return ret; } -static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); -} - -static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); -} - -static int fanotify_add_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, struct fan_fsid *fsid) -{ - return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); -} - static struct fsnotify_event *fanotify_alloc_overflow_event(void) { struct fanotify_event *oevent; @@ -1750,6 +1702,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; + void *obj; u32 umask = 0; int ret; @@ -1908,10 +1861,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) + if (mark_type == FAN_MARK_INODE) { inode = path.dentry->d_inode; - else + obj = inode; + } else { mnt = path.mnt; + if (mark_type == FAN_MARK_MOUNT) + obj = mnt; + else + obj = mnt->mnt_sb; + } /* * If some other task has this inode open for write we should not add @@ -1947,26 +1906,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, - flags, fsid); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, - flags, fsid); - else - ret = fanotify_add_inode_mark(group, inode, mask, - flags, fsid); + ret = fanotify_add_mark(group, obj, obj_type, mask, flags, + fsid); break; case FAN_MARK_REMOVE: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, - flags, umask); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, - flags, umask); - else - ret = fanotify_remove_inode_mark(group, inode, mask, - flags, umask); + ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, + umask); break; default: ret = -EINVAL; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index df7d4b1fcd68..6ae038f8e88b 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -9,22 +9,28 @@ #include "../mount.h" +/* + * fsnotify_connp_t is what we embed in objects which connector can be attached + * to. + */ +typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; + static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct inode, i_fsnotify_marks); + return conn->obj; } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct mount, mnt_fsnotify_marks); + return real_mount(conn->obj); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct super_block, s_fsnotify_marks); + return conn->obj; } static inline struct super_block *fsnotify_object_sb(void *obj, @@ -45,16 +51,7 @@ static inline struct super_block *fsnotify_object_sb(void *obj, static inline struct super_block *fsnotify_connector_sb( struct fsnotify_mark_connector *conn) { - switch (conn->type) { - case FSNOTIFY_OBJ_TYPE_INODE: - return fsnotify_conn_inode(conn)->i_sb; - case FSNOTIFY_OBJ_TYPE_VFSMOUNT: - return fsnotify_conn_mount(conn)->mnt.mnt_sb; - case FSNOTIFY_OBJ_TYPE_SB: - return fsnotify_conn_sb(conn); - default: - return NULL; - } + return fsnotify_object_sb(conn->obj, conn->type); } /* destroy all events sitting in this groups notification queue */ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 1091f3ba2374..cd639300bd03 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -291,6 +291,7 @@ static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { + fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct inode *inode = NULL; *type = conn->type; @@ -311,7 +312,7 @@ static void *fsnotify_detach_connector_from_object( } fsnotify_put_sb_watchers(conn); - rcu_assign_pointer(*(conn->obj), NULL); + rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -586,7 +587,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type) + void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -597,7 +598,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->type = obj_type; - conn->obj = connp; + conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see @@ -646,24 +647,25 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector( * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ -static int fsnotify_add_mark_list(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; + fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; + connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type); + err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; @@ -715,7 +717,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int obj_type, + void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; @@ -736,7 +738,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); + ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; @@ -754,14 +756,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, +int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); + ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } @@ -771,12 +773,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark); * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, +struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { + fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; + if (!connp) + return NULL; + conn = fsnotify_grab_connector(connp); if (!conn) return NULL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f5bd10880aca..7047a43cb2f9 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -456,13 +456,6 @@ FSNOTIFY_ITER_FUNCS(sb, SB) type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) -/* - * fsnotify_connp_t is what we embed in objects which connector can be attached - * to. fsnotify_connp_t * is how we refer from connector back to object. - */ -struct fsnotify_mark_connector; -typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; - /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this @@ -476,7 +469,7 @@ struct fsnotify_mark_connector { unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ - fsnotify_connp_t *obj; + void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; @@ -765,37 +758,35 @@ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, - struct fsnotify_group *group); +struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, + struct fsnotify_group *group); /* attach the mark to the object */ -extern int fsnotify_add_mark(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags); -extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags); +int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags); +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { - return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags); + return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, + add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { - return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags); + return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, + add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { - return fsnotify_find_mark(&inode->i_fsnotify_marks, group); + return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ -- Gitee From bb4baf75c94dfafbfa5621a4602eb1460791d2a8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:50 +0200 Subject: [PATCH 14/37] fsnotify: create helper fsnotify_update_sb_watchers() ANBZ: #33963 commit c9d4603b054f9a63c07c7012040af4c80adb2c60 upstream We would like to count watched objects by priority group, so we will need to update the watched object counter after adding/removing marks. Create a helper fsnotify_update_sb_watchers() and call it after attaching/detaching a mark, instead of fsnotify_{get,put}_sb_watchers() only after attaching/detaching a connector. Soon, we will use this helper to count watched objects by the highest watching priority group. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-7-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/mark.c | 36 +++++++++++++++++++------------- include/linux/fsnotify_backend.h | 1 + 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/notify/mark.c b/fs/notify/mark.c index cd639300bd03..5c0d358aca07 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -154,20 +154,23 @@ static void fsnotify_put_inode_ref(struct inode *inode) iput(inode); } -static void fsnotify_get_sb_watchers(struct fsnotify_mark_connector *conn) +/* + * Grab or drop watched objects reference depending on whether the connector + * is attached and has any marks attached. + */ +static void fsnotify_update_sb_watchers(struct super_block *sb, + struct fsnotify_mark_connector *conn) { - struct super_block *sb = fsnotify_connector_sb(conn); + bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; + bool has_marks = conn->obj && !hlist_empty(&conn->list); - if (sb) + if (has_marks && !is_watched) { + conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); -} - -static void fsnotify_put_sb_watchers(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb) + } else if (!has_marks && is_watched) { + conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); + } } /* @@ -292,6 +295,7 @@ static void *fsnotify_detach_connector_from_object( unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); + struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; @@ -311,10 +315,10 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } - fsnotify_put_sb_watchers(conn); rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -366,6 +370,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { + struct super_block *sb = fsnotify_connector_sb(conn); + + /* Update watched objects after detaching mark */ + if (sb) + fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } @@ -607,10 +616,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); - return 0; } - - fsnotify_get_sb_watchers(conn); return 0; } @@ -650,6 +656,7 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector( static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { + struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; @@ -699,6 +706,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7047a43cb2f9..eb43944ac3b0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -465,6 +465,7 @@ FSNOTIFY_ITER_FUNCS(sb, SB) struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ +#define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { -- Gitee From 7d92eef5ee59cde20284d41383e3bad0c6b02f08 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:51 +0200 Subject: [PATCH 15/37] fsnotify: lazy attach fsnotify_sb_info state to sb ANBZ: #33963 commit 07a3b8d0bf726a1e49b050bbc6bd72f031e505fe upstream Define a container struct fsnotify_sb_info to hold per-sb state, including the reference to sb marks connector. Allocate the fsnotify_sb_info state before attaching connector to any object on the sb and free it only when killing sb. This state is going to be used for storing per priority watched objects counters. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-8-amir73il@gmail.com> Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 16 +++++++++++++--- fs/notify/fsnotify.h | 9 ++++++++- fs/notify/mark.c | 32 +++++++++++++++++++++++++++++++- include/linux/fs.h | 8 ++++---- include/linux/fsnotify_backend.h | 17 +++++++++++++++++ 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c50ce0ea6ec1..8bf7d43ae780 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -89,11 +89,18 @@ static void fsnotify_unmount_inodes(struct super_block *sb) void fsnotify_sb_delete(struct super_block *sb) { + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return; + fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); + kfree(sbinfo); } /* @@ -503,6 +510,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; @@ -539,7 +547,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (!sb->s_fsnotify_marks && + if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) @@ -566,8 +574,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); + if (sbinfo) { + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = + fsnotify_first_mark(&sbinfo->sb_marks); + } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 6ae038f8e88b..663759ed6fbc 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,6 +54,13 @@ static inline struct super_block *fsnotify_connector_sb( return fsnotify_object_sb(conn->obj, conn->type); } +static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + return sbinfo ? &sbinfo->sb_marks : NULL; +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -79,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { - fsnotify_destroy_marks(&sb->s_fsnotify_marks); + fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 5c0d358aca07..65fe4d74b997 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -106,7 +106,7 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: - return &((struct super_block *)obj)->s_fsnotify_marks; + return fsnotify_sb_marks(obj); default: return NULL; } @@ -595,6 +595,26 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +static int fsnotify_attach_info_to_sb(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo; + + /* sb info is freed on fsnotify_sb_delete() */ + sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + /* + * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() + * will observe an initialized structure + */ + if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { + /* Someone else created sbinfo for us */ + kfree(sbinfo); + } + return 0; +} + static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { @@ -666,6 +686,16 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; + /* + * Attach the sb info before attaching a connector to any object on sb. + * The sb info will remain attached as long as sb lives. + */ + if (!fsnotify_sb_info(sb)) { + err = fsnotify_attach_info_to_sb(sb); + if (err) + return err; + } + connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index f58eefd376b0..a458c9fa9307 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -71,6 +71,8 @@ struct fscrypt_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; +struct fsnotify_mark_connector; +struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; @@ -640,8 +642,6 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -struct fsnotify_mark_connector; - /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -1246,7 +1246,7 @@ struct super_block { /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_marks together for cache efficiency. They are frequently + * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ @@ -1258,7 +1258,7 @@ struct super_block { time64_t s_time_max; #ifdef CONFIG_FSNOTIFY __u32 s_fsnotify_mask; - struct fsnotify_mark_connector __rcu *s_fsnotify_marks; + struct fsnotify_sb_info *s_fsnotify_info; #endif /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index eb43944ac3b0..7c88409d8660 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -477,6 +477,23 @@ struct fsnotify_mark_connector { struct hlist_head list; }; +/* + * Container for per-sb fsnotify state (sb marks and more). + * Attached lazily on first marked object on the sb and freed when killing sb. + */ +struct fsnotify_sb_info { + struct fsnotify_mark_connector __rcu *sb_marks; +}; + +static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) +{ +#ifdef CONFIG_FSNOTIFY + return READ_ONCE(sb->s_fsnotify_info); +#else + return NULL; +#endif +} + static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &sb->s_fsnotify_connectors; -- Gitee From be0bf3ff8bb6d630d6de7acd27cb037d074f97f7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:52 +0200 Subject: [PATCH 16/37] fsnotify: move s_fsnotify_connectors into fsnotify_sb_info ANBZ: #33963 commit cb5d4f48c10445c97a22af0bd8b9cf0ed6cc8036 upstream Move the s_fsnotify_connectors counter into the per-sb fsnotify state. Suggested-by: Christian Brauner Signed-off-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-9-amir73il@gmail.com> [ Gao Xiang: needed by "fsnotify: optimize the case of no permission event watchers". ] Signed-off-by: Gao Xiang --- include/linux/fs.h | 6 ------ include/linux/fsnotify.h | 8 +++++++- include/linux/fsnotify_backend.h | 7 ++++++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index a458c9fa9307..9c9dcd960c13 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1299,12 +1299,6 @@ struct super_block { /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; - /* - * Number of inode/mount/sb objects that are being watched, note that - * inodes objects are currently double-accounted. - */ - atomic_long_t s_fsnotify_connectors; - /* Read-only state of the superblock is being changed */ int s_readonly_remount; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b22069e450f8..01c429230862 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -20,7 +20,13 @@ /* Are there any inode/mount/sb objects that are being watched at all? */ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) { - return atomic_long_read(fsnotify_sb_watched_objects(sb)); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return false; + + return atomic_long_read(&sbinfo->watched_objects); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7c88409d8660..df1faff6bf9b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -483,6 +483,11 @@ struct fsnotify_mark_connector { */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; + /* + * Number of inode/mount/sb objects that are being watched in this sb. + * Note that inodes objects are currently double-accounted. + */ + atomic_long_t watched_objects; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) @@ -496,7 +501,7 @@ static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { - return &sb->s_fsnotify_connectors; + return &fsnotify_sb_info(sb)->watched_objects; } /* -- Gitee From a45898fa62254c5db61cdcd893d8855e7404ba9b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:53 +0200 Subject: [PATCH 17/37] fsnotify: use an enum for group priority constants ANBZ: #33963 commit 477cf917dd02853ba78a73cdeb6548889e5f8cd7 upstream And use meaningfull names for the constants. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-10-amir73il@gmail.com> [ Gao Xiang: needed by "fanotify: introduce FAN_PRE_ACCESS permission event". ] Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 11 +++++------ include/linux/fsnotify_backend.h | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d90184ab340e..d20f68613bd5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1528,13 +1528,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) INIT_LIST_HEAD(&group->fanotify_data.access_list); switch (class) { case FAN_CLASS_NOTIF: - group->priority = FS_PRIO_0; + group->priority = FSNOTIFY_PRIO_NORMAL; break; case FAN_CLASS_CONTENT: - group->priority = FS_PRIO_1; + group->priority = FSNOTIFY_PRIO_CONTENT; break; case FAN_CLASS_PRE_CONTENT: - group->priority = FS_PRIO_2; + group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: fd = -EINVAL; @@ -1786,12 +1786,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* - * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not - * allowed to set permissions events. + * Permission events require minimum priority FAN_CLASS_CONTENT. */ ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && - group->priority == FS_PRIO_0) + group->priority < FSNOTIFY_PRIO_CONTENT) goto fput_and_out; if (mask & FAN_FS_ERROR && diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index df1faff6bf9b..4258a10be152 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -176,6 +176,17 @@ struct fsnotify_event { struct list_head list; }; +/* + * fsnotify group priorities. + * Events are sent in order from highest priority to lowest priority. + */ +enum fsnotify_group_prio { + FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ + FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ + FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ + __FSNOTIFY_PRIO_NUM +}; + /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. @@ -201,14 +212,7 @@ struct fsnotify_group { wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ - /* - * Valid fsnotify group priorities. Events are send in order from highest - * priority to lowest priority. We default to the lowest priority. - */ - #define FS_PRIO_0 0 /* normal notifiers, no permissions */ - #define FS_PRIO_1 1 /* fanotify content based access control */ - #define FS_PRIO_2 2 /* fanotify pre-content access */ - unsigned int priority; + enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ -- Gitee From 001c01c945b87c36a93b68deb3de9d5652b466e3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:54 +0200 Subject: [PATCH 18/37] fsnotify: optimize the case of no permission event watchers ANBZ: #33963 commit a5e57b4d370c6d320e5bfb0c919fe00aee29e039 upstream Commit e43de7f0862b ("fsnotify: optimize the case of no marks of any type") optimized the case where there are no fsnotify watchers on any of the filesystem's objects. It is quite common for a system to have a single local filesystem and it is quite common for the system to have some inotify watches on some config files or directories, so the optimization of no marks at all is often not in effect. Permission event watchers, which require high priority group are more rare, so optimizing the case of no marks og high priority groups can improve performance for more systems, especially for performance sensitive io workloads. Count per-sb watched objects by high priority groups and use that the optimize out the call to __fsnotify_parent() and fsnotify() in fsnotify permission hooks. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-11-amir73il@gmail.com> [ Gao Xiang: needed by "fsnotify: opt-in for permission events at file open time". ] Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 3 +++ fs/notify/mark.c | 30 +++++++++++++++++++++++++++--- include/linux/fsnotify.h | 19 ++++++++++++++++--- include/linux/fsnotify_backend.h | 11 ++++++++--- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8bf7d43ae780..4d3fd169e5aa 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,6 +100,9 @@ void fsnotify_sb_delete(struct super_block *sb) /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT)); kfree(sbinfo); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 65fe4d74b997..5e170e713088 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -161,13 +161,36 @@ static void fsnotify_put_inode_ref(struct inode *inode) static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; - bool has_marks = conn->obj && !hlist_empty(&conn->list); + struct fsnotify_mark *first_mark = NULL; + unsigned int highest_prio = 0; - if (has_marks && !is_watched) { + if (conn->obj) + first_mark = hlist_entry_safe(conn->list.first, + struct fsnotify_mark, obj_list); + if (first_mark) + highest_prio = first_mark->group->priority; + if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) + highest_prio = 0; + + /* + * If the highest priority of group watching this object is prio, + * then watched object has a reference on counters [0..prio]. + * Update priority >= 1 watched objects counters. + */ + for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) + atomic_long_inc(&sbinfo->watched_objects[p]); + for (unsigned int p = conn->prio; p > highest_prio; p--) + atomic_long_dec(&sbinfo->watched_objects[p]); + conn->prio = highest_prio; + + /* Update priority >= 0 (a.k.a total) watched objects counter */ + BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); + if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); - } else if (!has_marks && is_watched) { + } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } @@ -626,6 +649,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; + conn->prio = 0; conn->type = obj_type; conn->obj = obj; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 01c429230862..278620e063ab 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,8 +17,9 @@ #include #include -/* Are there any inode/mount/sb objects that are being watched at all? */ -static inline bool fsnotify_sb_has_watchers(struct super_block *sb) +/* Are there any inode/mount/sb objects watched with priority prio or above? */ +static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb, + int prio) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); @@ -26,7 +27,13 @@ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) if (!sbinfo) return false; - return atomic_long_read(&sbinfo->watched_objects); + return atomic_long_read(&sbinfo->watched_objects[prio]); +} + +/* Are there any inode/mount/sb objects that are being watched at all? */ +static inline bool fsnotify_sb_has_watchers(struct super_block *sb) +{ + return fsnotify_sb_has_priority_watchers(sb, 0); } /* @@ -115,6 +122,12 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return 0; path = &file->f_path; + /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ + if (mask & ALL_FSNOTIFY_PERM_EVENTS && + !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, + FSNOTIFY_PRIO_CONTENT)) + return 0; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4258a10be152..64b204d00c82 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -468,7 +468,8 @@ FSNOTIFY_ITER_FUNCS(sb, SB) */ struct fsnotify_mark_connector { spinlock_t lock; - unsigned short type; /* Type of object [lock] */ + unsigned char type; /* Type of object [lock] */ + unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ @@ -490,8 +491,12 @@ struct fsnotify_sb_info { /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. + * + * The value in watched_objects[prio] is the number of objects that are + * watched by groups of priority >= prio, so watched_objects[0] is the + * total number of watched objects in this sb. */ - atomic_long_t watched_objects; + atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) @@ -505,7 +510,7 @@ static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { - return &fsnotify_sb_info(sb)->watched_objects; + return &fsnotify_sb_info(sb)->watched_objects[0]; } /* -- Gitee From 2d5ababac9f35fd994878a8a330d8f9160ad2d75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 7 Apr 2024 15:16:55 +0200 Subject: [PATCH 19/37] fs: use bit shifts for FMODE_* flags ANBZ: #33963 commit 0a4f544d83990e4b7326020a802d64956fa366de upstream Make it easier to see what bits are still available. Link: https://lore.kernel.org/r/20240406061604.GA538574@ZenIV Signed-off-by: Christian Brauner Signed-off-by: Gao Xiang --- include/linux/fs.h | 57 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9c9dcd960c13..957fbf10a632 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -110,21 +110,24 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ /* file is open for reading */ -#define FMODE_READ ((__force fmode_t)0x1) +#define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ -#define FMODE_WRITE ((__force fmode_t)0x2) +#define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ -#define FMODE_LSEEK ((__force fmode_t)0x4) +#define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ -#define FMODE_PREAD ((__force fmode_t)0x8) +#define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ -#define FMODE_PWRITE ((__force fmode_t)0x10) +#define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)0x20) +#define FMODE_EXEC ((__force fmode_t)(1 << 5)) + +/* FMODE_* bits 6 to 8 */ + /* 32bit hashes as llseek() offset (for directories) */ -#define FMODE_32BITHASH ((__force fmode_t)0x200) +#define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ -#define FMODE_64BITHASH ((__force fmode_t)0x400) +#define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. @@ -132,51 +135,53 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ -#define FMODE_NOCMTIME ((__force fmode_t)0x800) +#define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ -#define FMODE_RANDOM ((__force fmode_t)0x1000) +#define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* File is huge (eg. /dev/mem): treat loff_t as unsigned */ -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ -#define FMODE_PATH ((__force fmode_t)0x4000) +#define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ -#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) +#define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ -#define FMODE_WRITER ((__force fmode_t)0x10000) +#define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ -#define FMODE_CAN_READ ((__force fmode_t)0x20000) +#define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ -#define FMODE_CAN_WRITE ((__force fmode_t)0x40000) +#define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) -#define FMODE_OPENED ((__force fmode_t)0x80000) -#define FMODE_CREATED ((__force fmode_t)0x100000) +#define FMODE_OPENED ((__force fmode_t)(1 << 19)) +#define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ -#define FMODE_STREAM ((__force fmode_t)0x200000) +#define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ -#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) + +#define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -#define FMODE_NOREUSE ((__force fmode_t)0x800000) +/* FMODE_* bit 24 */ /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)0x2000000) +#define FMODE_BACKING ((__force fmode_t)(1 << 25)) /* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ -#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) +#define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * Attribute flags. These should be or-ed together to figure out what -- Gitee From c31a76994cabf25f438050e7c343b7f1a423d0a7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sat, 12 Oct 2024 17:22:48 -0700 Subject: [PATCH 20/37] fsnotify, lsm: Decouple fsnotify from lsm ANBZ: #33963 commit 1cda52f1b4611f4daa9d89e69d9428fb4137dc3f upstream Currently, fsnotify_open_perm() is called from security_file_open(). This is a a bit unexpected and creates otherwise unnecessary dependency of CONFIG_FANOTIFY_ACCESS_PERMISSIONS on CONFIG_SECURITY. Fix this by calling fsnotify_open_perm() directly. Signed-off-by: Song Liu Acked-by: Paul Moore Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241013002248.3984442-1-song@kernel.org Signed-off-by: Gao Xiang --- fs/notify/fanotify/Kconfig | 1 - fs/open.c | 4 ++++ security/security.c | 9 +-------- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index a511f9d8677b..0e36aaf379b7 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -15,7 +15,6 @@ config FANOTIFY config FANOTIFY_ACCESS_PERMISSIONS bool "fanotify permissions checking" depends on FANOTIFY - depends on SECURITY default n help Say Y here is you want fanotify listeners to be able to make permissions diff --git a/fs/open.c b/fs/open.c index 4bdafb322a7c..f9b0b0c15ec0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -939,6 +939,10 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + error = fsnotify_open_perm(f); + if (error) + goto cleanup_all; + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/security/security.c b/security/security.c index cb60fdcf78eb..18258ca42437 100644 --- a/security/security.c +++ b/security/security.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -2857,13 +2856,7 @@ int security_file_receive(struct file *file) */ int security_file_open(struct file *file) { - int ret; - - ret = call_int_hook(file_open, 0, file); - if (ret) - return ret; - - return fsnotify_open_perm(file); + return call_int_hook(file_open, 0, file); } /** -- Gitee From 6af9e9073d77294cc751e0e5246f1cf117e16d2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2024 10:30:14 -0500 Subject: [PATCH 21/37] fs: get rid of __FMODE_NONOTIFY kludge ANBZ: #33963 commit ebe559609d7829b52c6642b581860760984faf9d upstream All it takes to get rid of the __FMODE_NONOTIFY kludge is switching fanotify from anon_inode_getfd() to anon_inode_getfile_fmode() and adding a dentry_open_nonotify() helper to be used by fanotify on the other path. That's it - no more weird shit in OPEN_FMODE(), etc. Signed-off-by: Al Viro Link: https://lore.kernel.org/linux-fsdevel/20241113043003.GH3387508@ZenIV/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/d1231137e7b661a382459e79a764259509a4115d.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/fcntl.c | 4 ++-- fs/notify/fanotify/fanotify_user.c | 25 ++++++++++++++++--------- fs/open.c | 23 +++++++++++++++++++---- include/linux/fs.h | 6 +++--- include/uapi/asm-generic/fcntl.h | 1 - 5 files changed, 40 insertions(+), 19 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 1484f062ee65..52efa1860f42 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1021,10 +1021,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | - __FMODE_EXEC | __FMODE_NONOTIFY)); + __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d20f68613bd5..808c60b24679 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -102,8 +102,7 @@ static void __init fanotify_sysctls_init(void) * * Internal and external open flags are stored together in field f_flags of * struct file. Only external open flags shall be allowed in event_f_flags. - * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be - * excluded. + * Internal flags like FMODE_EXEC shall be excluded. */ #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ O_ACCMODE | O_APPEND | O_NONBLOCK | \ @@ -260,12 +259,11 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; /* - * we need a new file handle for the userspace program so it can read even if it was - * originally opened O_WRONLY. + * We provide an fd for the userspace program, so it could access the + * file without generating fanotify events itself. */ - new_file = dentry_open(path, - group->fanotify_data.f_flags | __FMODE_NONOTIFY, - current_cred()); + new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, + current_cred()); if (IS_ERR(new_file)) { put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); @@ -1416,6 +1414,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; + struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -1484,7 +1483,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; - f_flags = O_RDWR | __FMODE_NONOTIFY; + f_flags = O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) @@ -1562,10 +1561,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; } - fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + fd = get_unused_fd_flags(f_flags); if (fd < 0) goto out_destroy_group; + file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, + f_flags, FMODE_NONOTIFY); + if (IS_ERR(file)) { + fd = PTR_ERR(file); + put_unused_fd(fd); + goto out_destroy_group; + } + fd_install(fd, file); return fd; out_destroy_group: diff --git a/fs/open.c b/fs/open.c index f9b0b0c15ec0..8edf260e1c34 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1115,6 +1115,23 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + f->f_mode |= FMODE_NONOTIFY; + error = vfs_open(path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} + /** * dentry_create - Create and open a file * @path: path to create @@ -1249,7 +1266,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); @@ -1257,9 +1274,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) "struct open_flags doesn't yet handle flags > 32 bits"); /* - * Strip flags that either shouldn't be set by userspace like - * FMODE_NONOTIFY or that aren't relevant in determining struct - * open_flags like O_CLOEXEC. + * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; diff --git a/include/linux/fs.h b/include/linux/fs.h index 957fbf10a632..eed8da9368e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2677,6 +2677,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct file *backing_file_open(const struct path *user_path, int flags, @@ -3562,11 +3564,9 @@ struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 80f37a0d40d7..613475285643 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -6,7 +6,6 @@ /* * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 * These cannot be used by userspace O_* until internal and external open * flags are split. * -Eric Paris -- Gitee From 9a8a7f67ca0a86289f49e7e120c3772c33ca627c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:15 -0500 Subject: [PATCH 22/37] fsnotify: opt-in for permission events at file open time ANBZ: #33963 commit a94204f4d48e28a711b7ed10399f749286c433e3 upstream Legacy inotify/fanotify listeners can add watches for events on inode, parent or mount and expect to get events (e.g. FS_MODIFY) on files that were already open at the time of setting up the watches. fanotify permission events are typically used by Anti-malware sofware, that is watching the entire mount and it is not common to have more that one Anti-malware engine installed on a system. To reduce the overhead of the fsnotify_file_perm() hooks on every file access, relax the semantics of the legacy FAN_ACCESS_PERM event to generate events only if there were *any* permission event listeners on the filesystem at the time that the file was opened. The new semantic is implemented by extending the FMODE_NONOTIFY bit into two FMODE_NONOTIFY_* bits, that are used to store a mode for which of the events types to report. This is going to apply to the new fanotify pre-content events in order to reduce the cost of the new pre-content event vfs hooks. [Thanks to Bert Karwatzki for reporting a bug in this code with CONFIG_FANOTIFY_ACCESS_PERMISSIONS disabled] Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wj8L=mtcRTi=NECHMGfZQgXOp_uix1YVh04fEmrKaMnXA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/5ea5f8e283d1edb55aa79c35187bfe344056af14.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 38 +++++++++++++++++++++++++++++++++++ fs/open.c | 8 +++++++- include/linux/fs.h | 43 +++++++++++++++++++++++++++++++++++----- include/linux/fsnotify.h | 39 +++++++++++++++++++++--------------- 4 files changed, 106 insertions(+), 22 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4d3fd169e5aa..942f46faae50 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -616,6 +616,44 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, } EXPORT_SYMBOL_GPL(fsnotify); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +/* + * At open time we check fsnotify_sb_has_priority_watchers() and set the + * FMODE_NONOTIFY_ mode bits accordignly. + * Later, fsnotify permission hooks do not check if there are permission event + * watches, but that there were permission event watches at open time. + */ +void file_set_fsnotify_mode(struct file *file) +{ + struct super_block *sb = file->f_path.dentry->d_sb; + + /* Is it a file opened by fanotify? */ + if (FMODE_FSNOTIFY_NONE(file->f_mode)) + return; + + /* + * Permission events is a super set of pre-content events, so if there + * are no permission event watchers, there are also no pre-content event + * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. + */ + if (likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY_PERM; + return; + } + + /* + * If there are permission event watchers but no pre-content event + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + */ + if (likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + return; + } +} +#endif + static __init int fsnotify_init(void) { int ret; diff --git a/fs/open.c b/fs/open.c index 8edf260e1c34..ce5acb2778fb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -911,7 +911,7 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED; + f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; f->f_op = &empty_fops; return 0; } @@ -939,6 +939,12 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + /* + * Set FMODE_NONOTIFY_* bits according to existing permission watches. + * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't + * change anything. + */ + file_set_fsnotify_mode(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; diff --git a/include/linux/fs.h b/include/linux/fs.h index eed8da9368e2..41f198709c46 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,13 +166,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -/* FMODE_* bit 24 */ - /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)(1 << 25)) +#define FMODE_BACKING ((__force fmode_t)(1 << 24)) + +/* + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) -/* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) +/* + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) @@ -183,6 +190,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) +/* + * The two FMODE_NONOTIFY* define which fsnotify events should not be generated + * for a file. These are the possible values of (f->f_mode & + * FMODE_FSNOTIFY_MASK) and their meaning: + * + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. + */ +#define FMODE_FSNOTIFY_MASK \ + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) + +#define FMODE_FSNOTIFY_NONE(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +#define FMODE_FSNOTIFY_PERM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) +#define FMODE_FSNOTIFY_HSM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0) +#else +#define FMODE_FSNOTIFY_PERM(mode) 0 +#define FMODE_FSNOTIFY_HSM(mode) 0 +#endif + + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 278620e063ab..8d1849137a96 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } -static inline int fsnotify_file(struct file *file, __u32 mask) +static inline int fsnotify_path(const struct path *path, __u32 mask) { - const struct path *path; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); +} +static inline int fsnotify_file(struct file *file, __u32 mask) +{ /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) + if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; - path = &file->f_path; - /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ - if (mask & ALL_FSNOTIFY_PERM_EVENTS && - !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, - FSNOTIFY_PRIO_CONTENT)) - return 0; - - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); + return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + +void file_set_fsnotify_mode(struct file *file); + /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { - __u32 fsnotify_mask = FS_ACCESS_PERM; - /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection @@ -150,7 +147,10 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, if (!(perm_mask & MAY_READ)) return 0; - return fsnotify_file(file, fsnotify_mask); + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* @@ -168,16 +168,23 @@ static inline int fsnotify_open_perm(struct file *file) { int ret; + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } - return fsnotify_file(file, FS_OPEN_PERM); + return fsnotify_path(&file->f_path, FS_OPEN_PERM); } #else +static inline void file_set_fsnotify_mode(struct file *file) +{ +} + static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { -- Gitee From 09d59f1d11fba5c37081e5b76008bbe7441ddacb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:16 -0500 Subject: [PATCH 23/37] fsnotify: check if file is actually being watched for pre-content events on open ANBZ: #33963 commit 318652e07fa5b1743d08eeccd69a1f47f2c15710 upstream So far, we set FMODE_NONOTIFY_ flags at open time if we know that there are no permission event watchers at all on the filesystem, but lack of FMODE_NONOTIFY_ flags does not mean that the file is actually watched. For pre-content events, it is possible to optimize things so that we don't bother trying to send pre-content events if file was not watched (through sb, mnt, parent or inode itself) on open. Set FMODE_NONOTIFY_ flags according to that. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/2ddcc9f8d1fde48d085318a6b5a889289d8871d8.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 29 ++++++++++++++++++++++++++--- include/linux/fsnotify_backend.h | 3 +++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 942f46faae50..bb3809fc5b6e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -187,7 +187,7 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, return mask & marks_mask; } -/* Are there any inode/mount/sb objects that are interested in this event? */ +/* Are there any inode/mount/sb objects that watch for these events? */ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { @@ -625,7 +625,9 @@ EXPORT_SYMBOL_GPL(fsnotify); */ void file_set_fsnotify_mode(struct file *file) { - struct super_block *sb = file->f_path.dentry->d_sb; + struct dentry *dentry = file->f_path.dentry, *parent; + struct super_block *sb = dentry->d_sb; + __u32 mnt_mask, p_mask; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) @@ -646,11 +648,32 @@ void file_set_fsnotify_mode(struct file *file) * If there are permission event watchers but no pre-content event * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. */ - if (likely(!fsnotify_sb_has_priority_watchers(sb, + if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || + likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; return; } + + /* + * OK, there are some pre-content watchers. Check if anybody is + * watching for pre-content events on *this* file. + */ + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); + if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, + FSNOTIFY_PRE_CONTENT_EVENTS))) + return; + + /* Is parent watching for pre-content events on this file? */ + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { + parent = dget_parent(dentry); + p_mask = fsnotify_inode_watches_children(d_inode(parent)); + dput(parent); + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + return; + } + /* Nobody watching for pre-content events from this file */ + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; } #endif diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 64b204d00c82..70b9c5f8f28c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -77,6 +77,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Pre-content events can be used to fill file content */ +#define FSNOTIFY_PRE_CONTENT_EVENTS 0 + #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) -- Gitee From 54898e4107aa2cbdcbe8027abd19e3a7756a7d6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:17 -0500 Subject: [PATCH 24/37] fanotify: don't skip extra event info if no info_mode is set ANBZ: #33963 commit b82c6f5930f65c510f5b6b4b0d7d1913a6dda3db upstream Previously we would only include optional information if you requested it via an FAN_ flag at fanotify_init time (FAN_REPORT_FID for example). However this isn't necessary as the event length is encoded in the metadata, and if the user doesn't want to consume the information they don't have to. With the PRE_ACCESS events we will always generate range information, so drop this check in order to allow this extra information to be exported without needing to have another flag. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/afcbc4e4139dee076ef1757918b037d3b48c3edb.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 808c60b24679..db0e2dabc2b8 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -160,9 +160,6 @@ static size_t fanotify_event_len(unsigned int info_mode, int fh_len; int dot_len = 0; - if (!info_mode) - return event_len; - if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; @@ -756,12 +753,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; - if (info_mode) { - ret = copy_info_records_to_user(event, info, info_mode, pidfd, - buf, count); - if (ret < 0) - goto out_close_fd; - } + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); + if (ret < 0) + goto out_close_fd; if (f) fd_install(fd, f); -- Gitee From 538a8820558b815c35b1bbb0067dcdee5a2f6dce Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:18 -0500 Subject: [PATCH 25/37] fanotify: rename a misnamed constant ANBZ: #33963 commit 4edcb9f7b7179ef87ca16440da50ff01f05f268c upstream FANOTIFY_PIDFD_INFO_HDR_LEN is not the length of the header. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/8776ab90fe538225aeb561c560296bafd16b97c4.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index db0e2dabc2b8..d1cd2e2f5fca 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -119,7 +119,7 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) -#define FANOTIFY_PIDFD_INFO_HDR_LEN \ +#define FANOTIFY_PIDFD_INFO_LEN \ sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) @@ -174,14 +174,14 @@ static size_t fanotify_event_len(unsigned int info_mode, dot_len = 1; } - if (info_mode & FAN_REPORT_PIDFD) - event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fanotify_event_has_object_fh(event)) { fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (info_mode & FAN_REPORT_PIDFD) + event_len += FANOTIFY_PIDFD_INFO_LEN; + return event_len; } @@ -503,7 +503,7 @@ static int copy_pidfd_info_to_user(int pidfd, size_t count) { struct fanotify_event_info_pidfd info = { }; - size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + size_t info_len = FANOTIFY_PIDFD_INFO_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; -- Gitee From af94a7f2d3d495af6e742a7fe98dbf954bdeafd3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:19 -0500 Subject: [PATCH 26/37] fanotify: reserve event bit of deprecated FAN_DIR_MODIFY ANBZ: #33963 commit 0a076036b631f086a6bce93a45eaa216f234f121 upstream Avoid reusing it, because we would like to reserve it for future FAN_PATH_MODIFY pre-content event. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/632d9f80428e2e7a6b6a8ccc2925d87c92bbb518.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- include/linux/fsnotify_backend.h | 1 + include/uapi/linux/fanotify.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 70b9c5f8f28c..7016fc388f6d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -55,6 +55,7 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ +/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ /* * Set on inode mark that cares about things that happen to its children. diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 2aae6dd706a9..73849868311f 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -25,6 +25,7 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ +/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ -- Gitee From 90749630cacd191146d9c2905491abea2879b698 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:20 -0500 Subject: [PATCH 27/37] fsnotify: introduce pre-content permission events ANBZ: #33963 commit f156524e5d72c81792eee81f828784dc8a37a7f2 upstream The new FS_PRE_ACCESS permission event is similar to FS_ACCESS_PERM, but it meant for a different use case of filling file content before access to a file range, so it has slightly different semantics. Generate FS_PRE_ACCESS/FS_ACCESS_PERM as two seperate events, so content scanners could inspect the content filled by pre-content event handler. Unlike FS_ACCESS_PERM, FS_PRE_ACCESS is also called before a file is modified by syscalls as write() and fallocate(). FS_ACCESS_PERM is reported also on blockdev and pipes, but the new pre-content events are only reported for regular files and dirs. The pre-content events are meant to be used by hierarchical storage managers that want to fill the content of files on first access. There are some specific requirements from filesystems that could be used with pre-content events, so add a flag for fs to opt-in for pre-content events explicitly before they can be used. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b934c5e3af205abc4e0e4709f6486815937ddfdf.1731684329.git.josef@toxicpanda.com Conflicts: include/linux/fs.h Signed-off-by: Gao Xiang --- fs/notify/fsnotify.c | 2 +- include/linux/fs.h | 3 +++ include/linux/fsnotify.h | 19 ++++++++++++++++++- include/linux/fsnotify_backend.h | 11 ++++++++--- security/selinux/hooks.c | 3 ++- 5 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index bb3809fc5b6e..c05db5b10e42 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -681,7 +681,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/include/linux/fs.h b/include/linux/fs.h index 41f198709c46..0f757ba2029b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1218,6 +1218,9 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8d1849137a96..d91aa064f0e4 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -144,12 +144,29 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, */ lockdep_assert_once(file_write_not_started(file)); - if (!(perm_mask & MAY_READ)) + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) return 0; + /* + * read()/write() and other types of access generate pre-content events. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + + if (ret) + return ret; + } + + if (!(perm_mask & MAY_READ)) + return 0; + + /* + * read() also generates the legacy FS_ACCESS_PERM event, so content + * scanners can inspect the content filled by pre-content event. + */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7016fc388f6d..e63ee3c89ef8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -57,6 +57,8 @@ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -78,11 +80,14 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Content events can be used to inspect file content */ +#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ + FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ -#define FSNOTIFY_PRE_CONTENT_EVENTS 0 +#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) -#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ - FS_OPEN_EXEC_PERM) +#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ + FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8a8e4d51f298..e07298c94d14 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3390,7 +3390,8 @@ static int selinux_path_notify(const struct path *path, u64 mask, perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ - if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS | + FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); -- Gitee From 15cc0eabc0995dfeef6d0282fa74294cd59e339f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:21 -0500 Subject: [PATCH 28/37] fsnotify: pass optional file access range in pre-content event ANBZ: #33963 commit 9740d17162deca7138fad7dcf3ef52324832c32b upstream We would like to add file range information to pre-content events. Pass a struct file_range with offset and length to event handler along with pre-content permission event. The offset and length are aligned to page size, but we may need to align them to minimum folio size for filesystems with large block size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/88eddee301231d814aede27fb4d5b41ae37c9702.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.c | 11 +++++++-- fs/notify/fanotify/fanotify.h | 2 ++ fs/notify/fsnotify.c | 18 ++++++++++++++ include/linux/fsnotify.h | 4 ++-- include/linux/fsnotify_backend.h | 40 ++++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index de7f5ce1fef9..a44ee94e0841 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -553,9 +553,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } -static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, +static struct fanotify_event *fanotify_alloc_perm_event(const void *data, + int data_type, gfp_t gfp) { + const struct path *path = fsnotify_data_path(data, data_type); + const struct file_range *range = + fsnotify_data_file_range(data, data_type); struct fanotify_perm_event *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); @@ -569,6 +573,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; + /* NULL ppos means no range info */ + pevent->ppos = range ? &range->pos : NULL; + pevent->count = range ? range->count : 0; path_get(path); return &pevent->fae; @@ -806,7 +813,7 @@ static struct fanotify_event *fanotify_alloc_event( old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { - event = fanotify_alloc_perm_event(path, gfp); + event = fanotify_alloc_perm_event(data, data_type, gfp); } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, data_type, &hash); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 444507b1ad0a..d457628d5a4a 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; + const loff_t *ppos; /* optional file range info */ + size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c05db5b10e42..d76f7f1659b3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -197,6 +197,24 @@ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } +/* Report pre-content event with optional range info */ +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count) +{ + struct file_range range; + + /* Report page aligned range only when pos is known */ + if (!ppos) + return fsnotify_path(path, FS_PRE_ACCESS); + + range.path = path; + range.pos = PAGE_ALIGN_DOWN(*ppos); + range.count = PAGE_ALIGN(*ppos + count) - range.pos; + + return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, + FSNOTIFY_EVENT_FILE_RANGE); +} + /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d91aa064f0e4..87044acf8e79 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -154,7 +154,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; @@ -171,7 +171,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e63ee3c89ef8..3fb6dc029fec 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -298,6 +298,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, + FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, @@ -310,6 +311,17 @@ struct fs_error_report { struct super_block *sb; }; +struct file_range { + const struct path *path; + loff_t pos; + size_t count; +}; + +static inline const struct path *file_range_path(const struct file_range *range) +{ + return range->path; +} + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -319,6 +331,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_FILE_RANGE: + return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: @@ -334,6 +348,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry; default: return NULL; } @@ -345,6 +361,8 @@ static inline const struct path *fsnotify_data_path(const void *data, switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data); default: return NULL; } @@ -360,6 +378,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: @@ -379,6 +399,18 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +static inline const struct file_range *fsnotify_data_file_range( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_FILE_RANGE: + return (struct file_range *)data; + default: + return NULL; + } +} + /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not @@ -866,9 +898,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count); #else +static inline int fsnotify_pre_content(const struct path *path, + const loff_t *ppos, size_t count) +{ + return 0; +} + static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) -- Gitee From de4764270f295f5a03ecfee2036e68352327450e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:22 -0500 Subject: [PATCH 29/37] fsnotify: generate pre-content permission event on truncate ANBZ: #33963 commit 4acf3bc76e521b47acebcefc6312c97992f4ca29 upstream Generate FS_PRE_ACCESS event before truncate, without sb_writers held. Move the security hooks also before sb_start_write() to conform with other security hooks (e.g. in write, fallocate). The event will have a range info of the page surrounding the new size to provide an opportunity to fill the conetnt at the end of file before truncating to non-page aligned size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/23af8201db6ac2efdea94f09ab067d81ba5de7a7.1731684329.git.josef@toxicpanda.com Conflicts: fs/open.c [ Gao Xiang: no need to backport upstream commit 5f0d594c602f. ] Signed-off-by: Gao Xiang --- fs/open.c | 26 +++++++++++++++++++------- include/linux/fsnotify.h | 20 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/fs/open.c b/fs/open.c index ce5acb2778fb..98c1e2c38807 100644 --- a/fs/open.c +++ b/fs/open.c @@ -82,14 +82,18 @@ long vfs_truncate(const struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); - if (error) - goto out; - idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto mnt_drop_write_and_out; + return error; + + error = fsnotify_truncate_perm(path, length); + if (error) + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -115,7 +119,7 @@ long vfs_truncate(const struct path *path, loff_t length) put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); @@ -188,8 +192,16 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; - sb_start_write(inode->i_sb); + error = security_file_truncate(f.file); + if (error) + goto out_putf; + + error = fsnotify_truncate_perm(&f.file->f_path, length); + if (error) + goto out_putf; + + sb_start_write(inode->i_sb); if (!error) error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 87044acf8e79..1a9ef8f6784d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_truncate_perm - permission hook before file truncate + */ +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + struct inode *inode = d_inode(path->dentry); + + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || + !fsnotify_sb_has_priority_watchers(inode->i_sb, + FSNOTIFY_PRIO_PRE_CONTENT)) + return 0; + + return fsnotify_pre_content(path, &length, 0); +} + /* * fsnotify_file_perm - permission hook before file access (unknown range) */ @@ -208,6 +223,11 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + return 0; +} + static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; -- Gitee From 9df7a32a057c0f09c7799dce5ed35bf0cce101b6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:23 -0500 Subject: [PATCH 30/37] fanotify: introduce FAN_PRE_ACCESS permission event ANBZ: #33963 commit 4f8afa33817a6420398d1c177c6e220a05081f51 upstream Similar to FAN_ACCESS_PERM permission event, but it is only allowed with class FAN_CLASS_PRE_CONTENT and only allowed on regular files and dirs. Unlike FAN_ACCESS_PERM, it is safe to write to the file being accessed in the context of the event handler. This pre-content event is meant to be used by hierarchical storage managers that want to fill the content of files on first read access. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b80986f8d5b860acea2c9a73c0acd93587be5fe4.1731684329.git.josef@toxicpanda.com Conflicts: fs/notify/fanotify/fanotify_user.c [ Gao Xiang: don't backport upstream commit 8152f8201088 for now. ] Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.c | 3 ++- fs/notify/fanotify/fanotify_user.c | 35 +++++++++++++++++++++++++----- include/linux/fanotify.h | 14 ++++++++---- include/uapi/linux/fanotify.h | 2 ++ 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a44ee94e0841..e6cdb5191a2c 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -921,8 +921,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); BUILD_BUG_ON(FAN_RENAME != FS_RENAME); + BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d1cd2e2f5fca..6e4bdd80ada7 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1294,7 +1294,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) } static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, - unsigned int fan_flags) + __u32 mask, unsigned int fan_flags) { /* * Non evictable mark cannot be downgraded to evictable mark. @@ -1321,6 +1321,11 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST; + /* For now pre-content events are not generated for directories */ + mask |= fsn_mark->mask; + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EEXIST; + return 0; } @@ -1347,7 +1352,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, /* * Check if requested mark flags conflict with an existing mark flags. */ - ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); if (ret) goto out; @@ -1647,11 +1652,23 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + bool is_dir = d_is_dir(path->dentry); /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || (mask & FAN_RENAME) || (flags & FAN_MARK_IGNORE); + /* + * Filesystems need to opt-into pre-content evnets (a.k.a HSM) + * and they are only supported on regular files and directories. + */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { + if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) + return -EOPNOTSUPP; + if (!is_dir && !d_is_reg(path->dentry)) + return -EINVAL; + } + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1684,7 +1701,7 @@ static int fanotify_events_supported(struct fsnotify_group *group, * but because we always allowed it, error only when using new APIs. */ if (strict_dir_events && mark_type == FAN_MARK_INODE && - !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR; return 0; @@ -1788,11 +1805,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* - * Permission events require minimum priority FAN_CLASS_CONTENT. + * Permission events are not allowed for FAN_CLASS_NOTIF. + * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. */ ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && - group->priority < FSNOTIFY_PRIO_CONTENT) + group->priority == FSNOTIFY_PRIO_NORMAL) + goto fput_and_out; + else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && + group->priority == FSNOTIFY_PRIO_CONTENT) goto fput_and_out; if (mask & FAN_FS_ERROR && @@ -1827,6 +1848,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; + /* Pre-content events are not currently generated for directories. */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + goto fput_and_out; + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 89ff45bd6f01..c747af064d2c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -89,6 +89,16 @@ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ FAN_RENAME) +/* Content events can be used to inspect file content */ +#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ + FAN_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) + +/* Events that require a permission response from user */ +#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ + FANOTIFY_PRE_CONTENT_EVENTS) + /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) @@ -104,10 +114,6 @@ FANOTIFY_INODE_EVENTS | \ FANOTIFY_ERROR_EVENTS) -/* Events that require a permission response from user */ -#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ - FAN_OPEN_EXEC_PERM) - /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 73849868311f..ee4a9cfafa94 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -27,6 +27,8 @@ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ #define FAN_RENAME 0x10000000 /* File was renamed */ -- Gitee From 671023e14f204dc800c3da5651607c95842b6dac Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:24 -0500 Subject: [PATCH 31/37] fanotify: report file range info with pre-content events ANBZ: #33963 commit 870499bc1d4dc04cba1f63dd5e7bc02b983e2458 upstream With group class FAN_CLASS_PRE_CONTENT, report offset and length info along with FAN_PRE_ACCESS pre-content events. This information is meant to be used by hierarchical storage managers that want to fill partial content of files on first access to range. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b90a9e6c809dd3cad5684da90f23ea93ec6ce8c8.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.h | 8 +++++++ fs/notify/fanotify/fanotify_user.c | 38 ++++++++++++++++++++++++++++++ include/uapi/linux/fanotify.h | 8 +++++++ 3 files changed, 54 insertions(+) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index d457628d5a4a..32ba63d2c028 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -448,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask) mask & FANOTIFY_PERM_EVENTS; } +static inline bool fanotify_event_has_access_range(struct fanotify_event *event) +{ + if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS)) + return false; + + return FANOTIFY_PERM(event)->ppos; +} + static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event, fse); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6e4bdd80ada7..bd88e79b5f48 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -123,6 +123,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) +#define FANOTIFY_RANGE_INFO_LEN \ + (sizeof(struct fanotify_event_info_range)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -182,6 +184,9 @@ static size_t fanotify_event_len(unsigned int info_mode, if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; + if (fanotify_event_has_access_range(event)) + event_len += FANOTIFY_RANGE_INFO_LEN; + return event_len; } @@ -518,6 +523,30 @@ static int copy_pidfd_info_to_user(int pidfd, return info_len; } +static size_t copy_range_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); + struct fanotify_event_info_range info = { }; + size_t info_len = FANOTIFY_RANGE_INFO_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + if (WARN_ON_ONCE(!pevent->ppos)) + return -EINVAL; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; + info.hdr.len = info_len; + info.offset = *(pevent->ppos); + info.count = pevent->count; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, int pidfd, @@ -639,6 +668,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_event_has_access_range(event)) { + ret = copy_range_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index ee4a9cfafa94..06b4a17b82f8 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -146,6 +146,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 +#define FAN_EVENT_INFO_TYPE_RANGE 6 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -192,6 +193,13 @@ struct fanotify_event_info_error { __u32 error_count; }; +struct fanotify_event_info_range { + struct fanotify_event_info_header hdr; + __u32 pad; + __u64 offset; + __u64 count; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. -- Gitee From 9cd92d4d5567c511ba3494dbab1c31a12f978a49 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:25 -0500 Subject: [PATCH 32/37] fanotify: allow to set errno in FAN_DENY permission response ANBZ: #33963 commit b4b2ff4f61ded819bfa22e50fdec7693f51cbbee upstream With FAN_DENY response, user trying to perform the filesystem operation gets an error with errno set to EPERM. It is useful for hierarchical storage management (HSM) service to be able to deny access for reasons more diverse than EPERM, for example EAGAIN, if HSM could retry the operation later. Allow fanotify groups with priority FAN_CLASSS_PRE_CONTENT to responsd to permission events with the response value FAN_DENY_ERRNO(errno), instead of FAN_DENY to return a custom error. Limit custom error values to errors expected on read(2)/write(2) and open(2) of regular files. This list could be extended in the future. Userspace can test for legitimate values of FAN_DENY_ERRNO(errno) by writing a response to an fanotify group fd with a value of FAN_NOFD in the fd field of the response. The change in fanotify_response is backward compatible, because errno is written in the high 8 bits of the 32bit response field and old kernels reject respose value with high bits set. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/1e5fb6af84b69ca96b5c849fa5f10bdf4d1dc414.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify.c | 17 +++++++++++++---- fs/notify/fanotify/fanotify.h | 5 +++++ fs/notify/fanotify/fanotify_user.c | 29 +++++++++++++++++++++++++++-- include/linux/fanotify.h | 4 +++- include/uapi/linux/fanotify.h | 7 +++++++ 5 files changed, 55 insertions(+), 7 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e6cdb5191a2c..d8eec50e5407 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -224,7 +224,7 @@ static int fanotify_get_response(struct fsnotify_group *group, struct fanotify_perm_event *event, struct fsnotify_iter_info *iter_info) { - int ret; + int ret, errno; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -261,14 +261,23 @@ static int fanotify_get_response(struct fsnotify_group *group, ret = 0; break; case FAN_DENY: + /* Check custom errno from pre-content events */ + errno = fanotify_get_response_errno(event->response); + if (errno) { + ret = -errno; + break; + } + fallthrough; default: ret = -EPERM; } /* Check if the response should be audited */ - if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT, - &event->audit_rule); + if (event->response & FAN_AUDIT) { + u32 response = event->response & + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS); + audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule); + } pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 32ba63d2c028..997c010899f5 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -528,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) return mflags; } + +static inline u32 fanotify_get_response_errno(int res) +{ + return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK; +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bd88e79b5f48..37e9bbaf565a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -329,11 +329,12 @@ static int process_access_response(struct fsnotify_group *group, struct fanotify_perm_event *event; int fd = response_struct->fd; u32 response = response_struct->response; + int errno = fanotify_get_response_errno(response); int ret = info_len; struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, - group, fd, response, info, info_len); + pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", + __func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the @@ -344,7 +345,31 @@ static int process_access_response(struct fsnotify_group *group, switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: + if (errno) + return -EINVAL; + break; case FAN_DENY: + /* Custom errno is supported only for pre-content groups */ + if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) + return -EINVAL; + + /* + * Limit errno to values expected on open(2)/read(2)/write(2) + * of regular files. + */ + switch (errno) { + case 0: + case EIO: + case EPERM: + case EBUSY: + case ETXTBSY: + case EAGAIN: + case ENOSPC: + case EDQUOT: + break; + default: + return -EINVAL; + } break; default: return -EINVAL; diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index c747af064d2c..78f660ebc318 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -132,7 +132,9 @@ /* These masks check for invalid bits in permission responses. */ #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) -#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) +#define FANOTIFY_RESPONSE_VALID_MASK \ + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 06b4a17b82f8..9f09cbf9ed6b 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -235,6 +235,13 @@ struct fanotify_response_info_audit_rule { /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 +/* errno other than EPERM can specified in upper byte of deny response */ +#define FAN_ERRNO_BITS 8 +#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS) +#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1) +#define FAN_DENY_ERRNO(err) \ + (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT)) + #define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ #define FAN_INFO 0x20 /* Bitmask to indicate additional information */ -- Gitee From 49b9661bba0564e82b2627a2d5e62da3c93ae5a1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 12 Mar 2025 08:38:47 +0100 Subject: [PATCH 33/37] fsnotify: add pre-content hooks on mmap() ANBZ: #33963 commit 066e053fe208a3b83ee89dc5a192146add688861 upstream Pre-content hooks in page faults introduces potential deadlock of HSM handler in userspace with filesystem freezing. The requirement with pre-content event is that for every accessed file range an event covering at least this range will be generated at least once before the file data is accesses. In preparation to disabling pre-content event hooks on page faults, add pre-content hooks at mmap() variants for the entire mmaped range, so HSM can fill content when user requests to map a portion of the file. Note that exec() variant also calls vm_mmap_pgoff() internally to map code sections, so pre-content hooks are also generated in this case. Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/ Suggested-by: Josef Bacik Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250312073852.2123409-2-amir73il@gmail.com Signed-off-by: Gao Xiang --- include/linux/fsnotify.h | 21 +++++++++++++++++++++ mm/util.c | 3 +++ 2 files changed, 24 insertions(+) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784d..ae4d57316499 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_mmap_perm - permission hook before mmap of file range + */ +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + /* + * mmap() generates only pre-content events. + */ + if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) + return 0; + + return fsnotify_pre_content(&file->f_path, &off, len); +} + /* * fsnotify_truncate_perm - permission hook before file truncate */ @@ -223,6 +238,12 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + return 0; +} + static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; diff --git a/mm/util.c b/mm/util.c index 278f0d285005..ee4838ab7124 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -571,6 +572,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; -- Gitee From 53457139f46a052b12b147e4a209650c3d9092e2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:31 -0500 Subject: [PATCH 34/37] btrfs: disable defrag on pre-content watched files ANBZ: #33963 commit b722e40be2bda7a688f74e1a794121e84f717fdc upstream We queue up inodes to be defrag'ed asynchronously, which means we do not have their original file for readahead. This means that the code to skip readahead on pre-content watched files will not run, and we could potentially read in empty pages. Handle this corner case by disabling defrag on files that are currently being watched for pre-content events. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/4cc5bcea13db7904174353d08e85157356282a59.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/btrfs/ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7222ef921442..6c259b522dff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2627,6 +2627,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; -- Gitee From 5e177152addfd7838dc09f744207b44501dddf49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:32 -0500 Subject: [PATCH 35/37] fs: enable pre-content events on supported file systems ANBZ: #33963 commit 5121711eb8dbcbed70b1db429a4665f413844164 upstream Now that all the code has been added for pre-content events, and the various file systems that need the page fault hooks for fsnotify have been updated, add SB_I_ALLOW_HSM to the supported file systems. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/46960dcb2725fa0317895ed66a8409ba1c306a82.1731684329.git.josef@toxicpanda.com Signed-off-by: Gao Xiang --- fs/btrfs/super.c | 2 +- fs/ext4/super.c | 3 +++ fs/xfs/xfs_super.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 55dfa57d51a7..46a53e3c406f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1152,7 +1152,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; err = super_setup_bdi(sb); if (err) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bc6225ad494f..d56b3f83fdf0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5342,6 +5342,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; + /* HSM events are allowed by default. */ + sb->s_iflags |= SB_I_ALLOW_HSM; + err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e470bc5c1951..af6c948c87f0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1699,7 +1699,7 @@ xfs_fs_fill_super( sb->s_time_max = XFS_LEGACY_TIME_MAX; } trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; set_posix_acl_flag(sb); -- Gitee From 5ac4798656c972b8b33ae22f35eaa7dc0ebaa804 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Jan 2025 12:08:42 +0100 Subject: [PATCH 36/37] fanotify: Fix crash in fanotify_init(2) ANBZ: #33963 commit 0c0214df28f0dba8de084cb4dedc0c459dfbc083 upstream The rrror handling in fanotify_init(2) is buggy and overwrites 'fd' before calling put_unused_fd() leading to possible access beyond the end of fd bitmap. Fix it. Reported-by: syzbot+6a3aa63412255587b21b@syzkaller.appspotmail.com Fixes: ebe559609d78 ("fs: get rid of __FMODE_NONOTIFY kludge") Signed-off-by: Jan Kara Signed-off-by: Gao Xiang --- fs/notify/fanotify/fanotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 37e9bbaf565a..e7ec57892266 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1631,8 +1631,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, f_flags, FMODE_NONOTIFY); if (IS_ERR(file)) { - fd = PTR_ERR(file); put_unused_fd(fd); + fd = PTR_ERR(file); goto out_destroy_group; } fd_install(fd, file); -- Gitee From 9995622ce4c632266d096ed47ad2cb416f317db3 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:52:36 +0100 Subject: [PATCH 37/37] fsnotify: pass correct offset to fsnotify_mmap_perm() ANBZ: #33963 commit 28bba2c2935e219d6cb6946e16b9a0b7c47913be upstream fsnotify_mmap_perm() requires a byte offset for the file about to be mmap'ed. But it is called from vm_mmap_pgoff(), which has a page offset. Previously the conversion was done incorrectly so let's fix it, being careful not to overflow on 32-bit platforms. Discovered during code review. Link: https://lkml.kernel.org/r/20251003155238.2147410-1-ryan.roberts@arm.com Fixes: 066e053fe208 ("fsnotify: add pre-content hooks on mmap()") Signed-off-by: Ryan Roberts Reviewed-by: Kiryl Shutsemau Cc: Amir Goldstein Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton [ Gao Xiang: fix ltp `fs/fanotify24`. ] Signed-off-by: Gao Xiang --- mm/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index ee4838ab7124..d72855e78fd0 100644 --- a/mm/util.c +++ b/mm/util.c @@ -566,6 +566,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { + loff_t off = (loff_t)pgoff << PAGE_SHIFT; unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; @@ -573,7 +574,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) - ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); + ret = fsnotify_mmap_perm(file, prot, off, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; -- Gitee