diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7222ef921442ac6db3468827a0393552d42541cd..6c259b522dff068321e4b95c8e4b24cd5b509aca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2627,6 +2627,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 55dfa57d51a783f8f3056c34f9277f81db7666cf..46a53e3c406f04ea1930db615800dc141b73f34e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1152,7 +1152,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; err = super_setup_bdi(sb); if (err) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bc6225ad494f7f66500bf7794dd673ab29c4ae42..d56b3f83fdf0295152a7ed93c45811a5ee02b68a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5342,6 +5342,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; + /* HSM events are allowed by default. */ + sb->s_iflags |= SB_I_ALLOW_HSM; + err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; diff --git a/fs/fcntl.c b/fs/fcntl.c index 1484f062ee65e3d3de2ae43286007ebf8cc2326b..52efa1860f429e92b6c723ab02a426233d94fb3f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1021,10 +1021,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | - __FMODE_EXEC | __FMODE_NONOTIFY)); + __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2419cf33021058a5bfc32034040420ab44df86c5..00fa6969fb5337db7e39399b9a66b87ffed46ec0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -162,8 +162,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) do { fsnotify_group_lock(nfsd_file_fsnotify_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, - nfsd_file_fsnotify_group); + mark = fsnotify_find_inode_mark(inode, + nfsd_file_fsnotify_group); if (mark) { nfm = nfsd_file_mark_get(container_of(mark, struct nfsd_file_mark, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f70938f54ffac25c71caefd1583f00..c5a19554ea85e15645331f50ddafed9f8befcbf0 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -163,7 +163,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index a511f9d8677ba02076e5a71a755b77e87db2d843..0e36aaf379b7369843422d359506f3862a8abbd6 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -15,7 +15,6 @@ config FANOTIFY config FANOTIFY_ACCESS_PERMISSIONS bool "fanotify permissions checking" depends on FANOTIFY - depends on SECURITY default n help Say Y here is you want fanotify listeners to be able to make permissions diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 723ff9cad9ed9234efebd843871005eb521ef9c0..d8eec50e5407248aff15c627b5ec97010754bf73 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path) hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); } -static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, - __kernel_fsid_t *fsid2) -{ - return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; -} - static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) { return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ @@ -230,7 +224,7 @@ static int fanotify_get_response(struct fsnotify_group *group, struct fanotify_perm_event *event, struct fsnotify_iter_info *iter_info) { - int ret; + int ret, errno; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -267,14 +261,23 @@ static int fanotify_get_response(struct fsnotify_group *group, ret = 0; break; case FAN_DENY: + /* Check custom errno from pre-content events */ + errno = fanotify_get_response_errno(event->response); + if (errno) { + ret = -errno; + break; + } + fallthrough; default: ret = -EPERM; } /* Check if the response should be audited */ - if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT, - &event->audit_rule); + if (event->response & FAN_AUDIT) { + u32 response = event->response & + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS); + audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule); + } pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -559,9 +562,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } -static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, +static struct fanotify_event *fanotify_alloc_perm_event(const void *data, + int data_type, gfp_t gfp) { + const struct path *path = fsnotify_data_path(data, data_type); + const struct file_range *range = + fsnotify_data_file_range(data, data_type); struct fanotify_perm_event *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); @@ -575,6 +582,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; + /* NULL ppos means no range info */ + pevent->ppos = range ? &range->pos : NULL; + pevent->count = range ? range->count : 0; path_get(path); return &pevent->fae; @@ -812,7 +822,7 @@ static struct fanotify_event *fanotify_alloc_event( old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { - event = fanotify_alloc_perm_event(path, gfp); + event = fanotify_alloc_perm_event(data, data_type, gfp); } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, data_type, &hash); @@ -844,9 +854,8 @@ static struct fanotify_event *fanotify_alloc_event( } /* - * Get cached fsid of the filesystem containing the object from any connector. - * All connectors are supposed to have the same fsid, but we do not verify that - * here. + * Get cached fsid of the filesystem containing the object from any mark. + * All marks are supposed to have the same fsid, but we do not verify that here. */ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) { @@ -855,18 +864,11 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - struct fsnotify_mark_connector *conn; - - conn = READ_ONCE(mark->connector); - /* Mark is just getting destroyed or created? */ - if (!conn) - continue; - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; - /* Pairs with smp_wmb() in fsnotify_add_mark_list() */ - smp_rmb(); - fsid = conn->fsid; - if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + fsid = FANOTIFY_MARK(mark)->fsid; + if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) && + WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; } @@ -928,8 +930,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); BUILD_BUG_ON(FAN_RENAME != FS_RENAME); + BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); @@ -948,12 +951,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, return 0; } - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { + if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) fsid = fanotify_get_fsid(iter_info); - /* Racing with mark destruction or creation? */ - if (!fsid.val[0] && !fsid.val[1]) - return 0; - } event = fanotify_alloc_event(group, mask, data, data_type, dir, file_name, &fsid, match_mask); @@ -1074,7 +1073,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { - kmem_cache_free(fanotify_mark_cache, fsn_mark); + kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark)); } const struct fsnotify_ops fanotify_fsnotify_ops = { diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index e8a3c28c5d12032c473512016af20958285616c2..997c010899f5c348b6505d2495e81fd7d3a61412 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; + const loff_t *ppos; /* optional file range info */ + size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ @@ -446,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask) mask & FANOTIFY_PERM_EVENTS; } +static inline bool fanotify_event_has_access_range(struct fanotify_event *event) +{ + if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS)) + return false; + + return FANOTIFY_PERM(event)->ppos; +} + static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event, fse); @@ -489,6 +499,22 @@ static inline unsigned int fanotify_event_hash_bucket( return event->hash & FANOTIFY_HTABLE_MASK; } +struct fanotify_mark { + struct fsnotify_mark fsn_mark; + __kernel_fsid_t fsid; +}; + +static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) +{ + return container_of(mark, struct fanotify_mark, fsn_mark); +} + +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; @@ -502,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) return mflags; } + +static inline u32 fanotify_get_response_errno(int res) +{ + return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK; +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f4798d613dc229465ae083e56077c49084017d31..e7ec5789226669f25f5cd6d3cdcc80fa055b8c8d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -23,7 +23,7 @@ #include -#include "../../mount.h" +#include "../fsnotify.h" #include "../fdinfo.h" #include "fanotify.h" @@ -102,8 +102,7 @@ static void __init fanotify_sysctls_init(void) * * Internal and external open flags are stored together in field f_flags of * struct file. Only external open flags shall be allowed in event_f_flags. - * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be - * excluded. + * Internal flags like FMODE_EXEC shall be excluded. */ #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ O_ACCMODE | O_APPEND | O_NONBLOCK | \ @@ -120,10 +119,12 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) -#define FANOTIFY_PIDFD_INFO_HDR_LEN \ +#define FANOTIFY_PIDFD_INFO_LEN \ sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) +#define FANOTIFY_RANGE_INFO_LEN \ + (sizeof(struct fanotify_event_info_range)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -161,9 +162,6 @@ static size_t fanotify_event_len(unsigned int info_mode, int fh_len; int dot_len = 0; - if (!info_mode) - return event_len; - if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; @@ -178,14 +176,17 @@ static size_t fanotify_event_len(unsigned int info_mode, dot_len = 1; } - if (info_mode & FAN_REPORT_PIDFD) - event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fanotify_event_has_object_fh(event)) { fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (info_mode & FAN_REPORT_PIDFD) + event_len += FANOTIFY_PIDFD_INFO_LEN; + + if (fanotify_event_has_access_range(event)) + event_len += FANOTIFY_RANGE_INFO_LEN; + return event_len; } @@ -260,12 +261,11 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; /* - * we need a new file handle for the userspace program so it can read even if it was - * originally opened O_WRONLY. + * We provide an fd for the userspace program, so it could access the + * file without generating fanotify events itself. */ - new_file = dentry_open(path, - group->fanotify_data.f_flags | __FMODE_NONOTIFY, - current_cred()); + new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, + current_cred()); if (IS_ERR(new_file)) { put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); @@ -329,11 +329,12 @@ static int process_access_response(struct fsnotify_group *group, struct fanotify_perm_event *event; int fd = response_struct->fd; u32 response = response_struct->response; + int errno = fanotify_get_response_errno(response); int ret = info_len; struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, - group, fd, response, info, info_len); + pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", + __func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the @@ -344,7 +345,31 @@ static int process_access_response(struct fsnotify_group *group, switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: + if (errno) + return -EINVAL; + break; case FAN_DENY: + /* Custom errno is supported only for pre-content groups */ + if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) + return -EINVAL; + + /* + * Limit errno to values expected on open(2)/read(2)/write(2) + * of regular files. + */ + switch (errno) { + case 0: + case EIO: + case EPERM: + case EBUSY: + case ETXTBSY: + case EAGAIN: + case ENOSPC: + case EDQUOT: + break; + default: + return -EINVAL; + } break; default: return -EINVAL; @@ -508,7 +533,7 @@ static int copy_pidfd_info_to_user(int pidfd, size_t count) { struct fanotify_event_info_pidfd info = { }; - size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + size_t info_len = FANOTIFY_PIDFD_INFO_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; @@ -523,6 +548,30 @@ static int copy_pidfd_info_to_user(int pidfd, return info_len; } +static size_t copy_range_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); + struct fanotify_event_info_range info = { }; + size_t info_len = FANOTIFY_RANGE_INFO_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + if (WARN_ON_ONCE(!pevent->ppos)) + return -EINVAL; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; + info.hdr.len = info_len; + info.offset = *(pevent->ppos); + info.count = pevent->count; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, int pidfd, @@ -644,6 +693,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_event_has_access_range(event)) { + ret = copy_range_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -758,12 +816,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; - if (info_mode) { - ret = copy_info_records_to_user(event, info, info_mode, pidfd, - buf, count); - if (ret < 0) - goto out_close_fd; - } + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); + if (ret < 0) + goto out_close_fd; if (f) fd_install(fd, f); @@ -1088,7 +1144,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, } static int fanotify_remove_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, __u32 mask, + void *obj, unsigned int obj_type, __u32 mask, unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; @@ -1096,7 +1152,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group, int destroy_mark; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { fsnotify_group_unlock(group); return -ENOENT; @@ -1117,30 +1173,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group, return 0; } -static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - mask, flags, umask); -} - -static int fanotify_remove_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, - flags, umask); -} - -static int fanotify_remove_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, __u32 umask) -{ - return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, - flags, umask); -} - static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { @@ -1203,13 +1235,71 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return recalc; } +struct fan_fsid { + struct super_block *sb; + __kernel_fsid_t id; + bool weak; +}; + +static int fanotify_set_mark_fsid(struct fsnotify_group *group, + struct fsnotify_mark *mark, + struct fan_fsid *fsid) +{ + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *old; + struct super_block *old_sb = NULL; + + FANOTIFY_MARK(mark)->fsid = fsid->id; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + if (fsid->weak) + mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; + + /* First mark added will determine if group is single or multi fsid */ + if (list_empty(&group->marks_list)) + return 0; + + /* Find sb of an existing mark */ + list_for_each_entry(old, &group->marks_list, g_list) { + conn = READ_ONCE(old->connector); + if (!conn) + continue; + old_sb = fsnotify_connector_sb(conn); + if (old_sb) + break; + } + + /* Only detached marks left? */ + if (!old_sb) + return 0; + + /* Do not allow mixing of marks with weak and strong fsid */ + if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) + return -EXDEV; + + /* Allow mixing of marks with strong fsid from different fs */ + if (!fsid->weak) + return 0; + + /* Do not allow mixing marks with weak fsid from different fs */ + if (old_sb != fsid->sb) + return -EXDEV; + + /* Do not allow mixing marks from different btrfs sub-volumes */ + if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, + &FANOTIFY_MARK(mark)->fsid)) + return -EXDEV; + + return 0; +} + static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, + void *obj, unsigned int obj_type, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; + struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; @@ -1222,24 +1312,34 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) { + fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fan_mark) { ret = -ENOMEM; goto out_dec_ucounts; } + mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); - if (ret) { - fsnotify_put_mark(mark); - goto out_dec_ucounts; + /* Cache fsid of filesystem containing the marked object */ + if (fsid) { + ret = fanotify_set_mark_fsid(group, mark, fsid); + if (ret) + goto out_put_mark; + } else { + fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } + ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); + if (ret) + goto out_put_mark; + return mark; +out_put_mark: + fsnotify_put_mark(mark); out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); @@ -1257,7 +1357,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) } static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, - unsigned int fan_flags) + __u32 mask, unsigned int fan_flags) { /* * Non evictable mark cannot be downgraded to evictable mark. @@ -1284,22 +1384,27 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST; + /* For now pre-content events are not generated for directories */ + mask |= fsn_mark->mask; + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EEXIST; + return 0; } static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int obj_type, + void *obj, unsigned int obj_type, __u32 mask, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; int ret = 0; fsnotify_group_lock(group); - fsn_mark = fsnotify_find_mark(connp, group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, obj_type, + fsn_mark = fanotify_add_new_mark(group, obj, obj_type, fan_flags, fsid); if (IS_ERR(fsn_mark)) { fsnotify_group_unlock(group); @@ -1310,7 +1415,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, /* * Check if requested mark flags conflict with an existing mark flags. */ - ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); if (ret) goto out; @@ -1336,42 +1441,6 @@ static int fanotify_add_mark(struct fsnotify_group *group, return ret; } -static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) -{ - return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); -} - -static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) -{ - return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); -} - -static int fanotify_add_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) -{ - pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); - - /* - * If some other task has this inode open for write we should not add - * an ignore mask, unless that ignore mask is supposed to survive - * modification changes anyway. - */ - if ((flags & FANOTIFY_MARK_IGNORE_BITS) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && - inode_is_open_for_write(inode)) - return 0; - - return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); -} - static struct fsnotify_event *fanotify_alloc_overflow_event(void) { struct fanotify_event *oevent; @@ -1408,6 +1477,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; + struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -1476,7 +1546,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; - f_flags = O_RDWR | __FMODE_NONOTIFY; + f_flags = O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) @@ -1520,13 +1590,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) INIT_LIST_HEAD(&group->fanotify_data.access_list); switch (class) { case FAN_CLASS_NOTIF: - group->priority = FS_PRIO_0; + group->priority = FSNOTIFY_PRIO_NORMAL; break; case FAN_CLASS_CONTENT: - group->priority = FS_PRIO_1; + group->priority = FSNOTIFY_PRIO_CONTENT; break; case FAN_CLASS_PRE_CONTENT: - group->priority = FS_PRIO_2; + group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: fd = -EINVAL; @@ -1554,10 +1624,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; } - fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + fd = get_unused_fd_flags(f_flags); if (fd < 0) goto out_destroy_group; + file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, + f_flags, FMODE_NONOTIFY); + if (IS_ERR(file)) { + put_unused_fd(fd); + fd = PTR_ERR(file); + goto out_destroy_group; + } + fd_install(fd, file); return fd; out_destroy_group: @@ -1565,20 +1643,25 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return fd; } -static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, + struct fan_fsid *fsid) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(dentry, fsid); + err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; - if (!fsid->val[0] && !fsid->val[1]) - return -ENODEV; + fsid->sb = dentry->d_sb; + if (!fsid->id.val[0] && !fsid->id.val[1]) { + err = -ENODEV; + goto weak; + } /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) @@ -1588,11 +1671,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) if (err) return err; - if (root_fsid.val[0] != fsid->val[0] || - root_fsid.val[1] != fsid->val[1]) - return -EXDEV; + if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { + err = -EXDEV; + goto weak; + } + fsid->weak = false; return 0; + +weak: + /* Allow weak fsid when marking inodes */ + fsid->weak = true; + return (mark_type == FAN_MARK_INODE) ? 0 : err; } /* Check if filesystem can encode a unique fid */ @@ -1625,11 +1715,23 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + bool is_dir = d_is_dir(path->dentry); /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || (mask & FAN_RENAME) || (flags & FAN_MARK_IGNORE); + /* + * Filesystems need to opt-into pre-content evnets (a.k.a HSM) + * and they are only supported on regular files and directories. + */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { + if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) + return -EOPNOTSUPP; + if (!is_dir && !d_is_reg(path->dentry)) + return -EINVAL; + } + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1662,7 +1764,7 @@ static int fanotify_events_supported(struct fsnotify_group *group, * but because we always allowed it, error only when using new APIs. */ if (strict_dir_events && mark_type == FAN_MARK_INODE && - !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR; return 0; @@ -1676,12 +1778,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - __kernel_fsid_t __fsid, *fsid = NULL; + struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; + void *obj; u32 umask = 0; int ret; @@ -1765,12 +1868,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* - * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not - * allowed to set permissions events. + * Permission events are not allowed for FAN_CLASS_NOTIF. + * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. */ ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && - group->priority == FS_PRIO_0) + group->priority == FSNOTIFY_PRIO_NORMAL) + goto fput_and_out; + else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && + group->priority == FSNOTIFY_PRIO_CONTENT) goto fput_and_out; if (mask & FAN_FS_ERROR && @@ -1805,6 +1911,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; + /* Pre-content events are not currently generated for directories. */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + goto fput_and_out; + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) @@ -1828,7 +1938,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fsid(path.dentry, &__fsid); + ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out; @@ -1840,17 +1950,34 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) + if (mark_type == FAN_MARK_INODE) { inode = path.dentry->d_inode; - else + obj = inode; + } else { mnt = path.mnt; + if (mark_type == FAN_MARK_MOUNT) + obj = mnt; + else + obj = mnt->mnt_sb; + } - ret = mnt ? -EINVAL : -EISDIR; - /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ - if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode)) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) - goto path_put_and_out; + /* + * If some other task has this inode open for write we should not add + * an ignore mask, unless that ignore mask is supposed to survive + * modification changes anyway. + */ + if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode))) + goto path_put_and_out; + + ret = 0; + if (inode && inode_is_open_for_write(inode)) + goto path_put_and_out; + } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { @@ -1868,26 +1995,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, - flags, fsid); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, - flags, fsid); - else - ret = fanotify_add_inode_mark(group, inode, mask, - flags, fsid); + ret = fanotify_add_mark(group, obj, obj_type, mask, flags, + fsid); break; case FAN_MARK_REMOVE: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, - flags, umask); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, - flags, umask); - else - ret = fanotify_remove_inode_mark(group, inode, mask, - flags, umask); + ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, + umask); break; default: ret = -EINVAL; @@ -1946,7 +2059,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9cc4ebb535046413e6c05340d76c64381b59654b..d76f7f1659b37990adf53039df1af10299ae1e6c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -89,11 +89,21 @@ static void fsnotify_unmount_inodes(struct super_block *sb) void fsnotify_sb_delete(struct super_block *sb) { + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return; + fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ - wait_var_event(&sb->s_fsnotify_connectors, - !atomic_long_read(&sb->s_fsnotify_connectors)); + wait_var_event(fsnotify_sb_watched_objects(sb), + !atomic_long_read(fsnotify_sb_watched_objects(sb))); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); + WARN_ON(fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT)); + kfree(sbinfo); } /* @@ -152,7 +162,7 @@ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, } /* Are inode/sb/mount interested in parent and name info with this event? */ -static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, +static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; @@ -171,13 +181,40 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); - if (mnt) - marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask); + marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } +/* Are there any inode/mount/sb objects that watch for these events? */ +static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, + __u32 mask) +{ + __u32 marks_mask = inode->i_fsnotify_mask | mnt_mask | + inode->i_sb->s_fsnotify_mask; + + return mask & marks_mask & ALL_FSNOTIFY_EVENTS; +} + +/* Report pre-content event with optional range info */ +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count) +{ + struct file_range range; + + /* Report page aligned range only when pos is known */ + if (!ppos) + return fsnotify_path(path, FS_PRE_ACCESS); + + range.path = path; + range.pos = PAGE_ALIGN_DOWN(*ppos); + range.count = PAGE_ALIGN(*ppos + count) - range.pos; + + return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, + FSNOTIFY_EVENT_FILE_RANGE); +} + /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with @@ -190,7 +227,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); - struct mount *mnt = path ? real_mount(path->mnt) : NULL; + __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; @@ -201,16 +238,13 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, struct qstr *file_name = NULL; int ret = 0; - /* - * Do inode/sb/mount care about parent and name info on non-dir? - * Do they care about any event at all? - */ - if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks && - (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched) + /* Optimize the likely case of nobody watching this path */ + if (likely(!parent_watched && + !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; - parent_needed = fsnotify_event_needs_parent(inode, mnt, mask); + parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; @@ -497,6 +531,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; @@ -533,7 +568,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (!sb->s_fsnotify_marks && + if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) @@ -560,8 +595,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); + if (sbinfo) { + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = + fsnotify_first_mark(&sbinfo->sb_marks); + } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); @@ -597,11 +634,72 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, } EXPORT_SYMBOL_GPL(fsnotify); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +/* + * At open time we check fsnotify_sb_has_priority_watchers() and set the + * FMODE_NONOTIFY_ mode bits accordignly. + * Later, fsnotify permission hooks do not check if there are permission event + * watches, but that there were permission event watches at open time. + */ +void file_set_fsnotify_mode(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry, *parent; + struct super_block *sb = dentry->d_sb; + __u32 mnt_mask, p_mask; + + /* Is it a file opened by fanotify? */ + if (FMODE_FSNOTIFY_NONE(file->f_mode)) + return; + + /* + * Permission events is a super set of pre-content events, so if there + * are no permission event watchers, there are also no pre-content event + * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. + */ + if (likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY_PERM; + return; + } + + /* + * If there are permission event watchers but no pre-content event + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + */ + if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || + likely(!fsnotify_sb_has_priority_watchers(sb, + FSNOTIFY_PRIO_PRE_CONTENT))) { + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + return; + } + + /* + * OK, there are some pre-content watchers. Check if anybody is + * watching for pre-content events on *this* file. + */ + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); + if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, + FSNOTIFY_PRE_CONTENT_EVENTS))) + return; + + /* Is parent watching for pre-content events on this file? */ + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { + parent = dget_parent(dentry); + p_mask = fsnotify_inode_watches_children(d_inode(parent)); + dput(parent); + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + return; + } + /* Nobody watching for pre-content events from this file */ + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; +} +#endif + static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 2b4267de86e6b27cdb3d7bd96f7e9e6bc9676d62..663759ed6fbc1b87144471f53faef6ea55ad6f7a 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -9,39 +9,58 @@ #include "../mount.h" +/* + * fsnotify_connp_t is what we embed in objects which connector can be attached + * to. + */ +typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; + static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct inode, i_fsnotify_marks); + return conn->obj; } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct mount, mnt_fsnotify_marks); + return real_mount(conn->obj); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { - return container_of(conn->obj, struct super_block, s_fsnotify_marks); + return conn->obj; } -static inline struct super_block *fsnotify_connector_sb( - struct fsnotify_mark_connector *conn) +static inline struct super_block *fsnotify_object_sb(void *obj, + enum fsnotify_obj_type obj_type) { - switch (conn->type) { + switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: - return fsnotify_conn_inode(conn)->i_sb; + return ((struct inode *)obj)->i_sb; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: - return fsnotify_conn_mount(conn)->mnt.mnt_sb; + return ((struct vfsmount *)obj)->mnt_sb; case FSNOTIFY_OBJ_TYPE_SB: - return fsnotify_conn_sb(conn); + return (struct super_block *)obj; default: return NULL; } } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + return fsnotify_object_sb(conn->obj, conn->type); +} + +static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + return sbinfo ? &sbinfo->sb_marks : NULL; +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -67,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { - fsnotify_destroy_marks(&sb->s_fsnotify_marks); + fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } /* diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1c4bfdab008d9b4232cdbf5604bf42f87f5288eb..16871e8475e62d04f747a1187cc4087574313821 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -545,7 +545,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int create = (arg & IN_MASK_CREATE); int ret; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4be6e883d492f65e6a6e005a33b1960da9b7dfe6..5e170e71308868c7ab352847b524e8f6a4fb22cb 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,6 +97,21 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) refcount_inc(&mark->refcnt); } +static fsnotify_connp_t *fsnotify_object_connp(void *obj, + enum fsnotify_obj_type obj_type) +{ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return &((struct inode *)obj)->i_fsnotify_marks; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return &real_mount(obj)->mnt_fsnotify_marks; + case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_sb_marks(obj); + default: + return NULL; + } +} + static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) @@ -116,10 +131,69 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) return *fsnotify_conn_mask_p(conn); } +static void fsnotify_get_sb_watched_objects(struct super_block *sb) +{ + atomic_long_inc(fsnotify_sb_watched_objects(sb)); +} + +static void fsnotify_put_sb_watched_objects(struct super_block *sb) +{ + if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb))) + wake_up_var(fsnotify_sb_watched_objects(sb)); +} + static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); - atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); + fsnotify_get_sb_watched_objects(inode->i_sb); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + fsnotify_put_sb_watched_objects(inode->i_sb); + iput(inode); +} + +/* + * Grab or drop watched objects reference depending on whether the connector + * is attached and has any marks attached. + */ +static void fsnotify_update_sb_watchers(struct super_block *sb, + struct fsnotify_mark_connector *conn) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; + struct fsnotify_mark *first_mark = NULL; + unsigned int highest_prio = 0; + + if (conn->obj) + first_mark = hlist_entry_safe(conn->list.first, + struct fsnotify_mark, obj_list); + if (first_mark) + highest_prio = first_mark->group->priority; + if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) + highest_prio = 0; + + /* + * If the highest priority of group watching this object is prio, + * then watched object has a reference on counters [0..prio]. + * Update priority >= 1 watched objects counters. + */ + for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) + atomic_long_inc(&sbinfo->watched_objects[p]); + for (unsigned int p = conn->prio; p > highest_prio; p--) + atomic_long_dec(&sbinfo->watched_objects[p]); + conn->prio = highest_prio; + + /* Update priority >= 0 (a.k.a total) watched objects counter */ + BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); + if (first_mark && !is_watched) { + conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; + fsnotify_get_sb_watched_objects(sb); + } else if (!first_mark && is_watched) { + conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; + fsnotify_put_sb_watched_objects(sb); + } } /* @@ -239,35 +313,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static void fsnotify_put_inode_ref(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - -static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb) - atomic_long_inc(&sb->s_fsnotify_connectors); -} - -static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) -{ - struct super_block *sb = fsnotify_connector_sb(conn); - - if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) - wake_up_var(&sb->s_fsnotify_connectors); -} - static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { + fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); + struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; @@ -287,10 +338,10 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } - fsnotify_put_sb_connectors(conn); - rcu_assign_pointer(*(conn->obj), NULL); + rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -342,6 +393,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { + struct super_block *sb = fsnotify_connector_sb(conn); + + /* Update watched objects after detaching mark */ + if (sb) + fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } @@ -562,9 +618,28 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +static int fsnotify_attach_info_to_sb(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo; + + /* sb info is freed on fsnotify_sb_delete() */ + sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + /* + * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() + * will observe an initialized structure + */ + if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { + /* Someone else created sbinfo for us */ + kfree(sbinfo); + } + return 0; +} + static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type, - __kernel_fsid_t *fsid) + void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -574,17 +649,9 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; + conn->prio = 0; conn->type = obj_type; - conn->obj = connp; - /* Cache fsid of filesystem containing the object */ - if (fsid) { - conn->fsid = *fsid; - conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID; - } else { - conn->fsid.val[0] = conn->fsid.val[1] = 0; - conn->flags = 0; - } - fsnotify_get_sb_connectors(conn); + conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see @@ -592,10 +659,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ - fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } - return 0; } @@ -632,54 +697,39 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector( * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ -static int fsnotify_add_mark_list(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, - unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags) { + struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; + fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; - /* Backend is expected to check for zero fsid (e.g. tmpfs) */ - if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) - return -ENODEV; + /* + * Attach the sb info before attaching a connector to any object on sb. + * The sb info will remain attached as long as sb lives. + */ + if (!fsnotify_sb_info(sb)) { + err = fsnotify_attach_info_to_sb(sb); + if (err) + return err; + } + connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type, - fsid); + err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; - } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) { - conn->fsid = *fsid; - /* Pairs with smp_rmb() in fanotify_get_fsid() */ - smp_wmb(); - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID; - } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) && - (fsid->val[0] != conn->fsid.val[0] || - fsid->val[1] != conn->fsid.val[1])) { - /* - * Backend is expected to check for non uniform fsid - * (e.g. btrfs), but maybe we missed something? - * Only allow setting conn->fsid once to non zero fsid. - * inotify and non-fid fanotify groups do not set nor test - * conn->fsid. - */ - pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " - "%x.%x != %x.%x\n", __func__, conn->type, - fsid->val[0], fsid->val[1], - conn->fsid.val[0], conn->fsid.val[1]); - err = -EXDEV; - goto out_err; } /* is mark the first mark? */ @@ -710,6 +760,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone @@ -728,8 +779,8 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + void *obj, unsigned int obj_type, + int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -749,7 +800,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; @@ -767,15 +818,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid) +int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } @@ -785,12 +835,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark); * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, +struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { + fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; + if (!connp) + return NULL; + conn = fsnotify_grab_connector(connp); if (!conn) return NULL; diff --git a/fs/open.c b/fs/open.c index 704b8dfee2593f378a0109a41a137f51dac5beeb..98c1e2c3880725f1f64ef81b865a8ef1150a388b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -82,14 +82,18 @@ long vfs_truncate(const struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); - if (error) - goto out; - idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto mnt_drop_write_and_out; + return error; + + error = fsnotify_truncate_perm(path, length); + if (error) + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -115,7 +119,7 @@ long vfs_truncate(const struct path *path, loff_t length) put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); @@ -188,8 +192,16 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; - sb_start_write(inode->i_sb); + error = security_file_truncate(f.file); + if (error) + goto out_putf; + + error = fsnotify_truncate_perm(&f.file->f_path, length); + if (error) + goto out_putf; + + sb_start_write(inode->i_sb); if (!error) error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); @@ -303,6 +315,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -907,7 +923,7 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED; + f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; f->f_op = &empty_fops; return 0; } @@ -935,6 +951,16 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + /* + * Set FMODE_NONOTIFY_* bits according to existing permission watches. + * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't + * change anything. + */ + file_set_fsnotify_mode(f); + error = fsnotify_open_perm(f); + if (error) + goto cleanup_all; + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; @@ -1107,6 +1133,23 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + f->f_mode |= FMODE_NONOTIFY; + error = vfs_open(path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} + /** * dentry_create - Create and open a file * @path: path to create @@ -1241,7 +1284,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); @@ -1249,9 +1292,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) "struct open_flags doesn't yet handle flags > 32 bits"); /* - * Strip flags that either shouldn't be set by userspace like - * FMODE_NONOTIFY or that aren't relevant in determining struct - * open_flags like O_CLOEXEC. + * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; diff --git a/fs/read_write.c b/fs/read_write.c index 4a7c722b795297ef80d022fa7601933282e41adb..e7b9567e304913e0ee76e4dfddbb23400663c090 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -354,6 +354,9 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { + int mask = read_write == READ ? MAY_READ : MAY_WRITE; + int ret; + if (unlikely((ssize_t) count < 0)) return -EINVAL; @@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } } - return security_file_permission(file, - read_write == READ ? MAY_READ : MAY_WRITE); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); diff --git a/fs/readdir.c b/fs/readdir.c index c8c46e29443139d1e496bbb2236e4faf6af9effa..278bc025473274e77ca924f801ddfd5333d7c2cf 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; + res = fsnotify_file_perm(file, MAY_READ); + if (res) + goto out; + res = down_read_killable(&inode->i_rwsem); if (res) goto out; diff --git a/fs/remap_range.c b/fs/remap_range.c index 0c2c9fab026605ade59fd9351080763d8e589590..7a59d2a572587f4a90f0dee44b8455e36d0fa1bb 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; + int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, &pos, len); } /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e470bc5c1951bc605614c42dafb018dd6dcaba05..af6c948c87f0f65fa04122f000bce35e4500ed0e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1699,7 +1699,7 @@ xfs_fs_fill_super( sb->s_time_max = XFS_LEGACY_TIME_MAX; } trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; set_posix_acl_flag(sb); diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 89ff45bd6f01baad88eb8328675e27133fc797a6..78f660ebc3180d5a7697370fcbdcec3c2fc3d7ad 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -89,6 +89,16 @@ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ FAN_RENAME) +/* Content events can be used to inspect file content */ +#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ + FAN_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) + +/* Events that require a permission response from user */ +#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ + FANOTIFY_PRE_CONTENT_EVENTS) + /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) @@ -104,10 +114,6 @@ FANOTIFY_INODE_EVENTS | \ FANOTIFY_ERROR_EVENTS) -/* Events that require a permission response from user */ -#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ - FAN_OPEN_EXEC_PERM) - /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) @@ -126,7 +132,9 @@ /* These masks check for invalid bits in permission responses. */ #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) -#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) +#define FANOTIFY_RESPONSE_VALID_MASK \ + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS diff --git a/include/linux/fs.h b/include/linux/fs.h index f58eefd376b0c7af9c1ed90dbaf24dfb934136ec..0f757ba2029bdc149af9284b65144c71f1d0105b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -71,6 +71,8 @@ struct fscrypt_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; +struct fsnotify_mark_connector; +struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; @@ -108,21 +110,24 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ /* file is open for reading */ -#define FMODE_READ ((__force fmode_t)0x1) +#define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ -#define FMODE_WRITE ((__force fmode_t)0x2) +#define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ -#define FMODE_LSEEK ((__force fmode_t)0x4) +#define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ -#define FMODE_PREAD ((__force fmode_t)0x8) +#define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ -#define FMODE_PWRITE ((__force fmode_t)0x10) +#define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)0x20) +#define FMODE_EXEC ((__force fmode_t)(1 << 5)) + +/* FMODE_* bits 6 to 8 */ + /* 32bit hashes as llseek() offset (for directories) */ -#define FMODE_32BITHASH ((__force fmode_t)0x200) +#define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ -#define FMODE_64BITHASH ((__force fmode_t)0x400) +#define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. @@ -130,51 +135,86 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ -#define FMODE_NOCMTIME ((__force fmode_t)0x800) +#define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ -#define FMODE_RANDOM ((__force fmode_t)0x1000) +#define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* File is huge (eg. /dev/mem): treat loff_t as unsigned */ -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ -#define FMODE_PATH ((__force fmode_t)0x4000) +#define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ -#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) +#define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ -#define FMODE_WRITER ((__force fmode_t)0x10000) +#define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ -#define FMODE_CAN_READ ((__force fmode_t)0x20000) +#define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ -#define FMODE_CAN_WRITE ((__force fmode_t)0x40000) +#define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) -#define FMODE_OPENED ((__force fmode_t)0x80000) -#define FMODE_CREATED ((__force fmode_t)0x100000) +#define FMODE_OPENED ((__force fmode_t)(1 << 19)) +#define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ -#define FMODE_STREAM ((__force fmode_t)0x200000) +#define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ -#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) -#define FMODE_NOREUSE ((__force fmode_t)0x800000) +#define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)0x2000000) +#define FMODE_BACKING ((__force fmode_t)(1 << 24)) -/* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) + +/* + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ -#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) +#define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) + +/* + * The two FMODE_NONOTIFY* define which fsnotify events should not be generated + * for a file. These are the possible values of (f->f_mode & + * FMODE_FSNOTIFY_MASK) and their meaning: + * + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. + */ +#define FMODE_FSNOTIFY_MASK \ + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) + +#define FMODE_FSNOTIFY_NONE(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +#define FMODE_FSNOTIFY_PERM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) +#define FMODE_FSNOTIFY_HSM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0) +#else +#define FMODE_FSNOTIFY_PERM(mode) 0 +#define FMODE_FSNOTIFY_HSM(mode) 0 +#endif + /* * Attribute flags. These should be or-ed together to figure out what @@ -640,8 +680,6 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -struct fsnotify_mark_connector; - /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -1180,6 +1218,9 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { @@ -1246,7 +1287,7 @@ struct super_block { /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_marks together for cache efficiency. They are frequently + * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ @@ -1258,7 +1299,7 @@ struct super_block { time64_t s_time_max; #ifdef CONFIG_FSNOTIFY __u32 s_fsnotify_mask; - struct fsnotify_mark_connector __rcu *s_fsnotify_marks; + struct fsnotify_sb_info *s_fsnotify_info; #endif /* @@ -1299,12 +1340,6 @@ struct super_block { /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; - /* - * Number of inode/mount/sb objects that are being watched, note that - * inodes objects are currently double-accounted. - */ - atomic_long_t s_fsnotify_connectors; - /* Read-only state of the superblock is being changed */ int s_readonly_remount; @@ -2678,6 +2713,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct file *backing_file_open(const struct path *user_path, int flags, @@ -3563,11 +3600,9 @@ struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 912b58bb66c548a6e3d7393c52d88571a773f31b..ae4d57316499bf5250db632e5db395128b671ae6 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,6 +17,25 @@ #include #include +/* Are there any inode/mount/sb objects watched with priority prio or above? */ +static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb, + int prio) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return false; + + return atomic_long_read(&sbinfo->watched_objects[prio]); +} + +/* Are there any inode/mount/sb objects that are being watched at all? */ +static inline bool fsnotify_sb_has_watchers(struct super_block *sb) +{ + return fsnotify_sb_has_priority_watchers(sb, 0); +} + /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may @@ -30,7 +49,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { - if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); @@ -44,7 +63,7 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, static inline void fsnotify_inode(struct inode *inode, __u32 mask) { - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) @@ -59,7 +78,7 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, { struct inode *inode = d_inode(dentry); - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { @@ -89,47 +108,157 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } -static inline int fsnotify_file(struct file *file, __u32 mask) +static inline int fsnotify_path(const struct path *path, __u32 mask) { - const struct path *path; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); +} +static inline int fsnotify_file(struct file *file, __u32 mask) +{ /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) + if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; - path = &file->f_path; - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); + return fsnotify_path(&file->f_path, mask); } -/* Simple call site for access decisions */ -static inline int fsnotify_perm(struct file *file, int mask) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + +void file_set_fsnotify_mode(struct file *file); + +/* + * fsnotify_file_area_perm - permission hook before access to file range + */ +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) { - int ret; - __u32 fsnotify_mask = 0; + /* + * filesystem may be modified in the context of permission events + * (e.g. by HSM filling a file on access), so sb freeze protection + * must not be held. + */ + lockdep_assert_once(file_write_not_started(file)); - if (!(mask & (MAY_READ | MAY_OPEN))) + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; - if (mask & MAY_OPEN) { - fsnotify_mask = FS_OPEN_PERM; + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; - if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + /* + * read()/write() and other types of access generate pre-content events. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + int ret = fsnotify_pre_content(&file->f_path, ppos, count); - if (ret) - return ret; - } - } else if (mask & MAY_READ) { - fsnotify_mask = FS_ACCESS_PERM; + if (ret) + return ret; } - return fsnotify_file(file, fsnotify_mask); + if (!(perm_mask & MAY_READ)) + return 0; + + /* + * read() also generates the legacy FS_ACCESS_PERM event, so content + * scanners can inspect the content filled by pre-content event. + */ + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); +} + +/* + * fsnotify_mmap_perm - permission hook before mmap of file range + */ +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + /* + * mmap() generates only pre-content events. + */ + if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) + return 0; + + return fsnotify_pre_content(&file->f_path, &off, len); +} + +/* + * fsnotify_truncate_perm - permission hook before file truncate + */ +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + struct inode *inode = d_inode(path->dentry); + + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || + !fsnotify_sb_has_priority_watchers(inode->i_sb, + FSNOTIFY_PRIO_PRE_CONTENT)) + return 0; + + return fsnotify_pre_content(path, &length, 0); +} + +/* + * fsnotify_file_perm - permission hook before file access (unknown range) + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); +} + +/* + * fsnotify_open_perm - permission hook before file open + */ +static inline int fsnotify_open_perm(struct file *file) +{ + int ret; + + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + + if (file->f_flags & __FMODE_EXEC) { + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); + if (ret) + return ret; + } + + return fsnotify_path(&file->f_path, FS_OPEN_PERM); +} + +#else +static inline void file_set_fsnotify_mode(struct file *file) +{ +} + +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) +{ + return 0; +} + +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + return 0; +} + +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + return 0; +} + +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return 0; +} + +static inline int fsnotify_open_perm(struct file *file) +{ + return 0; } +#endif /* * fsnotify_link_count - inode's link count changed diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 575415b5134972f69e60f5b14c8edee823fc24cd..3fb6dc029fec8566f2a602f3e90b0c95afd2034a 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -55,6 +55,9 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ +/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ + +#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ /* * Set on inode mark that cares about things that happen to its children. @@ -77,8 +80,14 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) -#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ - FS_OPEN_EXEC_PERM) +/* Content events can be used to inspect file content */ +#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ + FS_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) + +#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ + FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching @@ -176,6 +185,17 @@ struct fsnotify_event { struct list_head list; }; +/* + * fsnotify group priorities. + * Events are sent in order from highest priority to lowest priority. + */ +enum fsnotify_group_prio { + FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ + FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ + FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ + __FSNOTIFY_PRIO_NUM +}; + /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. @@ -201,14 +221,7 @@ struct fsnotify_group { wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ - /* - * Valid fsnotify group priorities. Events are send in order from highest - * priority to lowest priority. We default to the lowest priority. - */ - #define FS_PRIO_0 0 /* normal notifiers, no permissions */ - #define FS_PRIO_1 1 /* fanotify content based access control */ - #define FS_PRIO_2 2 /* fanotify pre-content access */ - unsigned int priority; + enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ @@ -285,6 +298,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, + FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, @@ -297,6 +311,17 @@ struct fs_error_report { struct super_block *sb; }; +struct file_range { + const struct path *path; + loff_t pos; + size_t count; +}; + +static inline const struct path *file_range_path(const struct file_range *range) +{ + return range->path; +} + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -306,6 +331,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_FILE_RANGE: + return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: @@ -321,6 +348,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry; default: return NULL; } @@ -332,6 +361,8 @@ static inline const struct path *fsnotify_data_path(const void *data, switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data); default: return NULL; } @@ -347,6 +378,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: @@ -366,6 +399,18 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +static inline const struct file_range *fsnotify_data_file_range( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_FILE_RANGE: + return (struct file_range *)data; + default: + return NULL; + } +} + /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not @@ -456,13 +501,6 @@ FSNOTIFY_ITER_FUNCS(sb, SB) type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) -/* - * fsnotify_connp_t is what we embed in objects which connector can be attached - * to. fsnotify_connp_t * is how we refer from connector back to object. - */ -struct fsnotify_mark_connector; -typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; - /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this @@ -471,20 +509,51 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; */ struct fsnotify_mark_connector { spinlock_t lock; - unsigned short type; /* Type of object [lock] */ -#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 + unsigned char type; /* Type of object [lock] */ + unsigned char prio; /* Highest priority group */ +#define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ - __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ - fsnotify_connp_t *obj; + void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; +/* + * Container for per-sb fsnotify state (sb marks and more). + * Attached lazily on first marked object on the sb and freed when killing sb. + */ +struct fsnotify_sb_info { + struct fsnotify_mark_connector __rcu *sb_marks; + /* + * Number of inode/mount/sb objects that are being watched in this sb. + * Note that inodes objects are currently double-accounted. + * + * The value in watched_objects[prio] is the number of objects that are + * watched by groups of priority >= prio, so watched_objects[0] is the + * total number of watched objects in this sb. + */ + atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; +}; + +static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) +{ +#ifdef CONFIG_FSNOTIFY + return READ_ONCE(sb->s_fsnotify_info); +#else + return NULL; +#endif +} + +static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) +{ + return &fsnotify_sb_info(sb)->watched_objects[0]; +} + /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events @@ -530,6 +599,8 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 +#define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 +#define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; @@ -760,32 +831,35 @@ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, - struct fsnotify_group *group); +struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, + struct fsnotify_group *group); /* attach the mark to the object */ -extern int fsnotify_add_mark(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid); -extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid); +int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags); +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { - return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); + return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, + add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { - return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, - NULL); + return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, + add_flags); +} + +static inline struct fsnotify_mark *fsnotify_find_inode_mark( + struct inode *inode, + struct fsnotify_group *group) +{ + return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ @@ -824,9 +898,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count); #else +static inline int fsnotify_pre_content(const struct path *path, + const loff_t *ppos, size_t count) +{ + return 0; +} + static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 80f37a0d40d7d4efefe406f73598f0f150bd9973..613475285643b3354c31b3b2567ccf8bccbd7148 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -6,7 +6,6 @@ /* * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 * These cannot be used by userspace O_* until internal and external open * flags are split. * -Eric Paris diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 2aae6dd706a91ff0358d73b0ed797e33c80f06ee..9f09cbf9ed6b193fef3967bfe7105bd24304a2f6 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -25,6 +25,9 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ +/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ + +#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ @@ -143,6 +146,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 +#define FAN_EVENT_INFO_TYPE_RANGE 6 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -189,6 +193,13 @@ struct fanotify_event_info_error { __u32 error_count; }; +struct fanotify_event_info_range { + struct fanotify_event_info_header hdr; + __u32 pad; + __u64 offset; + __u64 count; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. @@ -224,6 +235,13 @@ struct fanotify_response_info_audit_rule { /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 +/* errno other than EPERM can specified in upper byte of deny response */ +#define FAN_ERRNO_BITS 8 +#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS) +#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1) +#define FAN_DENY_ERRNO(err) \ + (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT)) + #define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ #define FAN_INFO 0x20 /* Bitmask to indicate additional information */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84df3a986140a054075ed5cb7764ab..328e6ec7c90adfe105f8d2a1a08cff68ba5c6233 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -463,7 +463,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) int n; fsnotify_group_lock(audit_tree_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + mark = fsnotify_find_inode_mark(inode, audit_tree_group); if (!mark) return create_chunk(inode, tree); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7a98cd176a127d2ef1726024a34ad9674b54c28f..7f358740e95826a8fd52e8ec2a2a5a0b92643497 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -90,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); + entry = fsnotify_find_inode_mark(inode, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); diff --git a/mm/util.c b/mm/util.c index 278f0d285005c6c6c39135fe0ca0ea9c7cb594c4..d72855e78fd0dbe5488507b17954b4dbff39bf86 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -565,12 +566,15 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { + loff_t off = (loff_t)pgoff << PAGE_SHIFT; unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, off, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; diff --git a/security/security.c b/security/security.c index bfd2172ecb3cbdd81df1c526c83db788a7448945..18258ca42437301a6be2bf1b63bf2168165a1a3b 100644 --- a/security/security.c +++ b/security/security.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -2594,13 +2593,7 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir, */ int security_file_permission(struct file *file, int mask) { - int ret; - - ret = call_int_hook(file_permission, 0, file, mask); - if (ret) - return ret; - - return fsnotify_perm(file, mask); + return call_int_hook(file_permission, 0, file, mask); } /** @@ -2863,13 +2856,7 @@ int security_file_receive(struct file *file) */ int security_file_open(struct file *file) { - int ret; - - ret = call_int_hook(file_open, 0, file); - if (ret) - return ret; - - return fsnotify_perm(file, MAY_OPEN); + return call_int_hook(file_open, 0, file); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8a8e4d51f298da57a1e40489af1eb94a48b7c2db..e07298c94d14e361b2c9cfb3a3115b2a059396e7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3390,7 +3390,8 @@ static int selinux_path_notify(const struct path *path, u64 mask, perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ - if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS | + FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm);