diff --git a/arch/arm64/include/asm/vkernel.h b/arch/arm64/include/asm/vkernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..31feb69670754a376d01939fef0122404a1abed8
--- /dev/null
+++ b/arch/arm64/include/asm/vkernel.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#ifndef __ASM__VKERNEL_H
+#define __ASM__VKERNEL_H
+
+/* Handler type stored in a vkernel syscall table; same as the arm64 native type. */
+#define sys_call_vk_t syscall_fn_t
+
+/*
+ * Per-CPU record of the task (and its vkernel) whose syscall is being
+ * dispatched through a vkernel table. invoke_syscall() sets both before
+ * calling the handler and resets them to NULL afterwards.
+ */
+DECLARE_PER_CPU(struct task_struct *, current_syscall_task);
+DECLARE_PER_CPU(struct vkernel *, current_syscall_vk);
+
+/* Read this CPU's current_syscall_task (NULL outside a vkernel syscall). */
+static __always_inline struct task_struct *get_current_syscall_task(void)
+{
+	return this_cpu_read_8(current_syscall_task);
+}
+
+/* Read this CPU's current_syscall_vk (NULL outside a vkernel syscall). */
+static __always_inline struct vkernel *get_current_syscall_vk(void)
+{
+	return this_cpu_read_8(current_syscall_vk);
+}
+
+#endif
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index f090e39f69bc4a0507beb745e8bbbd690596b812..ff0a13710f9edf6a93dbc32e7339711702fed4a8 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -7,6 +7,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_VKERNEL
+#include
+#endif
 
 #include
 #include
@@ -42,13 +45,31 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
 			   const syscall_fn_t syscall_table[])
 {
 	long ret;
+#ifdef CONFIG_VKERNEL
+	struct vkernel *vk;
+#endif
 
 	add_random_kstack_offset();
 
 	if (scno < sc_nr) {
 		syscall_fn_t syscall_fn;
+#ifdef CONFIG_VKERNEL
+		/*
+		 * A task that belongs to a vkernel is dispatched through that
+		 * vkernel's private table; the per-CPU task/vk record is set
+		 * before the handler is called and cleared right after it.
+		 * NOTE(review): the syscall_table argument is ignored on this
+		 * path - confirm that no caller passes a table other than the
+		 * one vk->syscall.table was copied from.
+		 */
+		vk = vkernel_find_vk_by_task(current);
+		if (!vk) {
+			syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
+			ret = __invoke_syscall(regs, syscall_fn);
+		} else {
+			syscall_fn = (vk->syscall.table)[array_index_nospec(scno, sc_nr)];
+			this_cpu_write(current_syscall_task, current);
+			this_cpu_write(current_syscall_vk, vk);
+			ret = __invoke_syscall(regs, syscall_fn);
+			this_cpu_write(current_syscall_vk, NULL);
+			this_cpu_write(current_syscall_task, NULL);
+		}
+#else
 		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
 		ret = __invoke_syscall(regs, syscall_fn);
+#endif
 	} else {
 		ret = do_ni_syscall(regs, scno);
 	}
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 57d2bd04e5edfbf014d9ca5640ea7b93dc8e0cec..31b93f33dc89c93a8d532f54c4d21858f1e14104 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -20,6 +20,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_VKERNEL
+#include
+#endif
 
 #ifdef CONFIG_XEN_PV
 #include
@@ -46,10 +49,26 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 	 * numbers for comparisons.
 	 */
 	unsigned int unr = nr;
+#ifdef CONFIG_VKERNEL
+	struct vkernel *vk;
+#endif
 
 	if (likely(unr < NR_syscalls)) {
 		unr = array_index_nospec(unr, NR_syscalls);
+#ifdef CONFIG_VKERNEL
+		/*
+		 * Same scheme as on arm64: tasks with a vkernel go through
+		 * vk->syscall.table, bracketed by the per-CPU task/vk record;
+		 * all other tasks index sys_call_table directly.
+		 */
+		vk = vkernel_find_vk_by_task(current);
+		if (!vk)
+			regs->ax = sys_call_table[unr](regs);
+		else {
+			this_cpu_write(current_syscall_task, current);
+			this_cpu_write(current_syscall_vk, vk);
+			regs->ax = (vk->syscall.table)[unr](regs);
+			this_cpu_write(current_syscall_vk, NULL);
+			this_cpu_write(current_syscall_task, NULL);
+		}
+#else
 		regs->ax = x64_sys_call(regs, unr);
+#endif
 		return true;
 	}
 	return false;
diff --git a/arch/x86/include/asm/vkernel.h b/arch/x86/include/asm/vkernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f46c3262e3c356214392eccd113f2fdd61182047
--- /dev/null
+++ b/arch/x86/include/asm/vkernel.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#ifndef __ASM_X86_VKERNEL_H
+#define __ASM_X86_VKERNEL_H
+
+/* Handler type stored in a vkernel syscall table; same as the x86 native type. */
+#define sys_call_vk_t sys_call_ptr_t
+
+/* Per-CPU record of the task / vkernel of the syscall being dispatched. */
+DECLARE_PER_CPU(struct task_struct *, current_syscall_task);
+DECLARE_PER_CPU(struct vkernel *, current_syscall_vk);
+
+/*
+ * Read this CPU's current_syscall_task.
+ * NOTE(review): this_cpu_read_stable() presumably allows the compiler to
+ * reuse an earlier read - confirm that is acceptable for a value that is
+ * rewritten around every vkernel syscall.
+ */
+static __always_inline struct task_struct *get_current_syscall_task(void)
+{
+	return this_cpu_read_stable(current_syscall_task);
+}
+
+/* Read this CPU's current_syscall_vk; see the note on the task accessor. */
+static __always_inline struct vkernel *get_current_syscall_vk(void)
+{
+	return this_cpu_read_stable(current_syscall_vk);
+}
+
+
+#endif
diff --git a/drivers/Makefile b/drivers/Makefile
index f36e00dfd1bd681d78c3e9ce2c6f90707e15ee88..3f87c2905025174c21a77ea584602b5d873ef5ba 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -202,3 +202,5 @@ obj-$(CONFIG_DRM_ACCEL) += accel/
 obj-$(CONFIG_CDX_BUS) += cdx/
 
 obj-$(CONFIG_S390) += s390/
+
+obj-$(CONFIG_VKERNEL_DRIVER) += vkernel/
diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..1fd838c2196725abdab0551a8e98ed8e8ea1f297
--- /dev/null
+++ b/drivers/vkernel/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_VKERNEL_DRIVER) += vkernel.o
+
+ccflags-y := -I$(srctree)/drivers/vkernel/include
+
+vkernel-y := vkernel_main.o syscall.o
+vkernel-y += fs/acl.o
+vkernel-y += mm/mm.o
+vkernel-y += security/capability.o
+vkernel-y += sched/cpu.o
+vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o sysctl/raw.o
+vkernel-y += utils/kallsyms.o
diff --git a/drivers/vkernel/fs/acl.c b/drivers/vkernel/fs/acl.c
new file mode 100644
index 0000000000000000000000000000000000000000..8205cc3ef6f74fbf0004c97520bf5f7c4c1f1991
--- /dev/null
+++ b/drivers/vkernel/fs/acl.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+
+#include "fs.h"
+
+/*
+ * Paths that receive a default ACL rule. def_path[i] is paired with
+ * def_mode[i], so the two tables must stay the same length and in the
+ * same order.
+ */
+static char *def_path[] = {
+	/* open, access, append, read, exec */
+	"/proc/sys/abi",
+	"/proc/sys/debug",
+	"/proc/sys/dev",
+	"/proc/sys/fs",
+	"/proc/sys/net",
+	"/proc/sys/user",
+	"/proc/sys/vm",
+	/* open, read, exec */
+	"/sys/kernel",
+	"/sys/power",
+	"/sys/class",
+	"/sys/devices",
+	"/sys/dev",
+	"/sys/bus",
+	"/sys/block",
+	"/sys/module",
+	"/sys/firmware",
+	"/sys/fs/pstore",
+	"/sys/fs/bpf",
+	"/sys/fs/fuse",
+	"/sys/fs/ext4",
+	/* open */
+	"/proc/sysrq-trigger",
+	"/sys/kernel/security",
+	/* nop */
+	"/sys/fs/cgroup",
+	"/dev/vkernel",
+};
+
+/*
+ * Mode for each entry of def_path, one row per group of paths.
+ *
+ * The table used to carry 25 values for 24 paths, which shifted every mode
+ * after the /sys/firmware row by one: /proc/sysrq-trigger got 0x8024 and
+ * /sys/fs/cgroup got 0x8020 instead of the "open" and "nop" values their
+ * group comments ask for.
+ * NOTE(review): the surplus value is assumed to be one 0x8025, leaving the
+ * four /sys/fs entries with 0x8024 - confirm against the intended policy.
+ */
+static unsigned short def_mode[] = {
+	/* /proc/sys/{abi,debug,dev,fs,net,user,vm} */
+	0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d,
+	/* /sys/{kernel,power,class,devices,dev,bus,block,module,firmware} */
+	0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025,
+	/* /sys/fs/{pstore,bpf,fuse,ext4} */
+	0x8024, 0x8024, 0x8024, 0x8024,
+	/* /proc/sysrq-trigger, /sys/kernel/security */
+	0x8020, 0x8020,
+	/* /sys/fs/cgroup, /dev/vkernel */
+	0x0000, 0x0000,
+};
+
+/* Catch any future drift between the two tables at build time. */
+static_assert(ARRAY_SIZE(def_path) == ARRAY_SIZE(def_mode));
+
+/* Slab cache backing every struct vkernel_acl_node. */
+static struct kmem_cache *acl_node_cache;
+
+/*
+ * Create the ACL node slab cache.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+int vk_acl_init(void)
+{
+	acl_node_cache = kmem_cache_create("vkernel_acl_node",
+			sizeof(struct vkernel_acl_node), 0, SLAB_ACCOUNT, NULL);
+	if (!acl_node_cache) {
+		pr_err("failed to create slab for acl node\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Destroy the ACL node slab cache created by vk_acl_init(). */
+void vk_acl_uninit(void)
+{
+	kmem_cache_destroy(acl_node_cache);
+}
+
+/*
+ * Initialise @acl with an empty node list and a zeroed hash table of
+ * 1 << @bits buckets; the ACL starts inactive.
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated.
+ */
+int vk_init_acl(struct vkernel_acl *acl, unsigned int bits)
+{
+
+	acl->ht = kcalloc(
+		1UL << bits, sizeof(struct hlist_head), GFP_KERNEL);
+	if (!acl->ht)
+		return -ENOMEM;
+
+	acl->bits = bits;
+	INIT_LIST_HEAD(&acl->nodes);
+	acl->active = false;
+
+	return 0;
+}
+
+/*
+ * Release everything owned by @acl: every node on acl->nodes is unhashed
+ * (if hashed), unlinked and freed, then the hash table itself is freed.
+ * Safe to call on an ACL whose table was never allocated or was already
+ * released, because acl->ht is reset to NULL.
+ */
+void vk_uninit_acl(struct vkernel_acl *acl)
+{
+	struct hlist_head *ht = acl->ht;
+	struct vkernel_acl_node *node;
+	struct vkernel_acl_node *tmp;
+
+	/*
+	 * Only the table pointer says whether there is anything to free; a
+	 * table created with bits == 0 (one bucket) must be released too.
+	 */
+	if (!ht)
+		return;
+
+	acl->active = false;
+	list_for_each_entry_safe(node, tmp, &acl->nodes, link) {
+		if (!hlist_unhashed(&node->hash))
+			hlist_del(&node->hash);
+		list_del(&node->link);
+		kmem_cache_free(acl_node_cache, node);
+	}
+	INIT_LIST_HEAD(&acl->nodes);
+
+	acl->bits = 0;
+	/* Do not leave a dangling table pointer behind */
+	acl->ht = NULL;
+	kfree(ht);
+}
+
+/* inode hash, copy from inode.c */
+static unsigned long inode_hash(struct inode *inode, unsigned long shift)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long hashval = inode->i_ino;
+	unsigned long tmp;
+
+	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+			L1_CACHE_BYTES;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> shift);
+	return tmp;
+}
+
+/*
+ * Find the hashed ACL node for @inode, matching on inode number and
+ * super block. Returns NULL when @inode has no rule in @acl.
+ */
+static struct vkernel_acl_node *vk_acl_node_get(struct vkernel_acl *acl, struct inode *inode)
+{
+	struct hlist_head *ht = acl->ht;
+	struct vkernel_acl_node *node;
+	unsigned long key = inode_hash(inode, acl->bits);
+
+	hlist_for_each_entry(node, &ht[hash_min(key, acl->bits)], hash) {
+		if (inode->i_ino == node->ino && inode->i_sb == node->sb)
+			return node;
+	}
+
+	return NULL;
+}
+
+/*
+ * Remove the rule hashed for @inode, if any. The node stays on acl->nodes;
+ * only its hash linkage and inode identity are cleared.
+ * Returns 0 when a node was removed, -1 when @inode had no rule.
+ */
+static int vk_acl_node_del(struct vkernel_acl *acl, struct inode *inode)
+{
+	struct vkernel_acl_node *node;
+
+	node = vk_acl_node_get(acl, inode);
+	if (!node)
+		return -1;
+
+	/*
+	 * The node outlives this call and other code tests it with
+	 * hlist_unhashed(). Plain hlist_del() leaves poison pointers behind,
+	 * which makes the node look hashed and gets it unlinked a second
+	 * time, so reinitialise the linkage instead.
+	 */
+	hlist_del_init(&node->hash);
+	node->ino = 0;
+	node->sb = NULL;
+
+	return 0;
+}
+
+/*
+ * Hash @node under @inode, replacing any rule already hashed for the same
+ * inode. Always returns 0.
+ */
+static int vk_acl_node_add(struct vkernel_acl *acl, struct inode *inode,
+		struct vkernel_acl_node *node)
+{
+	struct hlist_head *ht = acl->ht;
+	unsigned long key = inode_hash(inode, acl->bits);
+
+	/* Remove old rule if exists */
+	vk_acl_node_del(acl, inode);
+
+	node->ino = inode->i_ino;
+	node->sb = inode->i_sb;
+	hlist_add_head(&node->hash, &ht[hash_min(key, acl->bits)]);
+
+	return 0;
+}
+
+/*
+ * Inode from file->f_inode may be destroyed at following access
+ * Using kern_path is also unstable, is there a better way?
+ *
+ * The inode is returned with an extra reference taken by ihold(); the
+ * caller must release it with iput() when it is done with the inode.
+ */
+static struct inode *kern_path_to_inode(const char *filename)
+{
+	struct path path;
+	struct inode *inode;
+	int ret;
+
+	ret = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_OPEN, &path);
+	if (ret)
+		return NULL;
+
+	/* Pin the inode before the path reference that keeps it alive goes */
+	inode = path.dentry->d_inode;
+	if (inode)
+		ihold(inode);
+	path_put(&path);
+
+	return inode;
+}
+
+/*
+ * Resolve node->path and hash @node under the resulting inode, tagging the
+ * inode with IOP_VKERNEL_DIR or IOP_VKERNEL_REG. A path that cannot be
+ * resolved only produces a warning. Always returns 0.
+ */
+static int vk_activate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node)
+{
+	struct inode *inode;
+
+	inode = kern_path_to_inode(node->path);
+	if (!inode) {
+		pr_warn("vkernel: cannot set acl, no such file or directory %s\n", node->path);
+		return 0;
+	}
+
+	if (!vk_acl_node_add(acl, inode, node)) {
+		if (S_ISDIR(inode->i_mode))
+			inode->i_opflags |= IOP_VKERNEL_DIR;
+		else
+			inode->i_opflags |= IOP_VKERNEL_REG;
+	}
+
+	pr_debug("activate acl, path %s mode 0x%x ino %lu\n", node->path, node->mode, inode->i_ino);
+	iput(inode);
+
+	return 0;
+}
+
+/*
+ * Resolve node->path and drop the rule hashed for the resulting inode,
+ * clearing the matching IOP_VKERNEL_* flag when a rule was removed.
+ * Returns -EINVAL if the path cannot be resolved, 0 otherwise.
+ */
+int vk_deactivate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node)
+{
+	struct inode *inode;
+
+	inode = kern_path_to_inode(node->path);
+	if (!inode)
+		return -EINVAL;
+
+	if (!vk_acl_node_del(acl, inode)) {
+		if (S_ISDIR(inode->i_mode))
+			inode->i_opflags &= ~IOP_VKERNEL_DIR;
+		else
+			inode->i_opflags &= ~IOP_VKERNEL_REG;
+	}
+	iput(inode);
+
+	return 0;
+}
+
+/*
+ * Mark @acl active and activate every node that is not hashed yet. If
+ * another caller holds the activation mutex, return without doing anything.
+ */
+static void vk_activate_acl_all(struct vkernel_acl *acl)
+{
+	static DEFINE_MUTEX(vk_activate_lock);
+	struct vkernel_acl_node *node;
+
+	/* Failure on trylock means someone is doing this job */
+	if (!mutex_trylock(&vk_activate_lock))
+		return;
+
+	acl->active = true;
+	list_for_each_entry(node, &acl->nodes, link) {
+		if (hlist_unhashed(&node->hash))
+			vk_activate_acl(acl, node);
+	}
+
+	mutex_unlock(&vk_activate_lock);
+}
+
+/*
+ * Check @mask against the rule hashed for @inode in vk->acl.
+ * Returns -EACCES when @mask asks for a read/write/exec bit that the rule's
+ * mode does not contain, 0 otherwise (including when there is no rule).
+ */
+static int vk_permission(struct vkernel *vk, struct inode *inode, int mask)
+{
+	struct vkernel_acl_node *node;
+
+	node = vk_acl_node_get(&vk->acl, inode);
+	if (node) {
+		if ((mask & ~(node->mode) & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) {
+			pr_err("vkernel: permision denied, pid %d mask 0x%x vmode 0x%x path %s\n",
+				current->pid, mask, node->mode, node->path);
+			return -EACCES;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Note: some filesystems or inodes may define their own permission hook.
+ * In such cases, vkernel permission check will be skipped.
+ */
+int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap,
+		struct inode *inode, int mask)
+{
+	int ret = 0;
+
+	/* Activate acl at first check */
+	if (unlikely(!vk->acl.active))
+		vk_activate_acl_all(&vk->acl);
+
+	if (inode->i_opflags & (IOP_VKERNEL_REG|IOP_VKERNEL_DIR))
+		ret = vk_permission(vk, inode, mask);
+
+	return ret;
+}
+
+/*
+ * Add a rule for @path with @mode to @acl. The rule is hashed right away
+ * when the ACL is already active, otherwise on first activation.
+ * Returns 0 on success, -ENOMEM if no node can be allocated and
+ * -ENAMETOOLONG if @path does not fit in VKERNEL_PATH_MAX bytes including
+ * its terminating NUL.
+ */
+int vkernel_set_acl(struct vkernel_acl *acl, char *path, unsigned short mode)
+{
+	struct vkernel_acl_node *node;
+
+	pr_debug("set acl, path %s mode 0x%x\n", path, mode);
+	node = kmem_cache_alloc(acl_node_cache, GFP_KERNEL_ACCOUNT);
+	if (!node) {
+		pr_err("failed to alloc acl node\n");
+		return -ENOMEM;
+	}
+	INIT_HLIST_NODE(&node->hash);
+	node->ino = 0;
+	node->sb = NULL;
+	/*
+	 * @path may be much shorter than VKERNEL_PATH_MAX (the default paths
+	 * are string literals), so a fixed-size memcpy() would read past its
+	 * end. strscpy() stops at the NUL and always terminates the copy.
+	 */
+	if (strscpy(node->path, path, VKERNEL_PATH_MAX) < 0) {
+		pr_err("acl path too long\n");
+		kmem_cache_free(acl_node_cache, node);
+		return -ENAMETOOLONG;
+	}
+	node->mode = mode;
+	list_add_tail(&node->link, &acl->nodes);
+
+	if (acl->active)
+		return vk_activate_acl(acl, node);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_acl);
+
+/*
+ * Remove the first rule whose path equals @path from @acl and free it.
+ * Returns -EINVAL when no such rule exists, 0 otherwise.
+ */
+int vkernel_clear_acl(struct vkernel_acl *acl, char *path)
+{
+	struct vkernel_acl_node *node;
+	bool found = false;
+
+	list_for_each_entry(node, &acl->nodes, link) {
+		if (!strncmp(node->path, path, VKERNEL_PATH_MAX)) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return -EINVAL;
+
+	if (!hlist_unhashed(&node->hash))
+		vk_deactivate_acl(acl, node);
+
+	/*
+	 * node->sb is only non-NULL while the node sits in the hash table.
+	 * If deactivation could not resolve the path (or resolved it to
+	 * another inode) the node is still hashed and must be unlinked here,
+	 * otherwise the table keeps a pointer to freed memory.
+	 */
+	if (node->sb) {
+		hlist_del(&node->hash);
+		node->sb = NULL;
+	}
+
+	list_del(&node->link);
+	kmem_cache_free(acl_node_cache, node);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_clear_acl);
+
+/*
+ * Add every descriptor of @set to @acl, stopping at the first failure and
+ * returning its error code; rules added before the failure stay in place.
+ */
+int vkernel_set_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set)
+{
+	u64 i;
+	int r;
+
+	for (i = 0; i < set->nr_descs; i++) {
+		r = vkernel_set_acl(acl, set->descs[i].path, set->descs[i].mode);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_acl_set);
+
+/*
+ * Clear the rule of every descriptor of @set from @acl, stopping at the
+ * first failure and returning its error code.
+ */
+int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set)
+{
+	u64 i;
+	int r;
+
+	for (i = 0; i < set->nr_descs; i++) {
+		r = vkernel_clear_acl(acl, set->descs[i].path);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_clear_acl_set);
+
+/*
+ * Install the built-in rules: def_path[i] with def_mode[i] for every entry
+ * of def_path. Stops at the first failure and returns its error code.
+ */
+int vkernel_set_default_acl_set(struct vkernel_acl *acl)
+{
+	u64 i;
+	int r;
+
+	for (i = 0; i < ARRAY_SIZE(def_path); i++) {
+		r = vkernel_set_acl(acl, def_path[i], def_mode[i]);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_default_acl_set);
diff --git a/drivers/vkernel/include/fs.h b/drivers/vkernel/include/fs.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce94c327482777dbe0e014f4e33e2437362dbc9c
--- /dev/null
+++ b/drivers/vkernel/include/fs.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _VKERNEL_FS_H
+#define _VKERNEL_FS_H
+
+#include
+
+int vk_acl_init(void);
+void vk_acl_uninit(void);
+
+int vk_init_acl(struct vkernel_acl *acl, unsigned int bits);
+void vk_uninit_acl(struct vkernel_acl *acl);
+int vkernel_set_default_acl_set(struct vkernel_acl *acl);
+
+int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap,
+		struct inode *inode, int mask);
+
+#endif
diff --git a/drivers/vkernel/include/mm.h b/drivers/vkernel/include/mm.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2fdf89ba8eb3505e5d00ad8f98a7a2e8c517d00
--- /dev/null
+++ b/drivers/vkernel/include/mm.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _VKERNEL_MM_H
+#define _VKERNEL_MM_H
+
+#include
+
+/* Copy from mm/shmem.c */
+
+#define SHMEM_HUGE_NEVER 0
+#define SHMEM_HUGE_ALWAYS 1
+#define SHMEM_HUGE_WITHIN_SIZE 2
+#define SHMEM_HUGE_ADVISE 3
+
+/*
+ * Special values.
+ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +int vk_init_memory_pref(struct vkernel_mem_pref *mem); +void vk_uninit_memory_pref(struct vkernel_mem_pref *mem); + +#endif diff --git a/drivers/vkernel/include/sched.h b/drivers/vkernel/include/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..c273e3dea619ebcd553d236e1be985f46e229466 --- /dev/null +++ b/drivers/vkernel/include/sched.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SCHED_H +#define _VKERNEL_SCHED_H + +#include + +int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu); +void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu); + +#endif diff --git a/drivers/vkernel/include/security.h b/drivers/vkernel/include/security.h new file mode 100644 index 0000000000000000000000000000000000000000..0eac382ffd1bd00474c5a8658081e3e3d8c7522f --- /dev/null +++ b/drivers/vkernel/include/security.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SECURITY_H +#define _VKERNEL_SECURITY_H + +#include + +int vk_cap_init(void); +void vk_cap_uninit(void); + +int vk_cap_capable(struct vkernel *vk, const struct cred *cred, + struct user_namespace *targ_ns, + int cap, unsigned int opts); + +#endif diff --git a/drivers/vkernel/include/syscall.h b/drivers/vkernel/include/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..4bb75703f43046dcbd63bebb256c1b3b13be3080 --- /dev/null +++ b/drivers/vkernel/include/syscall.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SYSCALL_H +#define _VKERNEL_SYSCALL_H + +#include + +extern sys_call_vk_t *sys_call_table_ptr; + +int vk_syscall_init(void); +void vk_syscall_uninit(void); 
+
+long vk_sys_ni_syscall(const struct pt_regs *regs);
+long vk_sys_forbid_syscall(const struct pt_regs *regs);
+long vk_sys_ni_cond_syscall(const struct pt_regs *regs);
+long vk_sys_forbid_cond_syscall(const struct pt_regs *regs);
+
+int vk_init_syscall(struct vkernel_syscall *syscall);
+void vk_uninit_syscall(struct vkernel_syscall *syscall);
+void vk_install_default_syscalls(struct vkernel_syscall *syscall);
+
+extern struct vkernel_custom_type analysis_custom;
+
+#endif
diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h
new file mode 100644
index 0000000000000000000000000000000000000000..7520ee7cbf747d8b103c960d14c8054eb861587f
--- /dev/null
+++ b/drivers/vkernel/include/sysctl.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _VKERNEL_SYSCTL_H
+#define _VKERNEL_SYSCTL_H
+
+#include
+#include
+
+#define IPC_SEM_IDS 0
+#define IPC_MSG_IDS 1
+#define IPC_SHM_IDS 2
+
+/* defined at kernel/fork.c */
+#define MIN_THREADS 20
+#define MAX_THREADS FUTEX_TID_MASK
+
+int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs);
+void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs);
+
+int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k);
+void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k);
+
+int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk);
+void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net);
+
+extern int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name);
+
+int devconf_proc(struct net *net, struct ipv4_devconf *conf,
+		int val, int i, int type);
+int devconf_forward(struct net *net, struct ipv4_devconf *conf,
+		int val, int i, int type);
+int devconf_flush(struct net *net, struct ipv4_devconf *conf,
+		int val, int i, int type);
+
+int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm);
+void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm);
+
+void vk_sync_overcommit_as(struct vkernel *vk);
+
+int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf);
+
+/* Defined at ipc/util.h, MODIFIED */
+static inline int sem_check_semmni(struct ipc_namespace *ns)
+{
+	/*
+	 * semmni is the last element of the sem_ctls[4] array.
+	 * It is accepted when it lies in [0, 1 << 15]; anything else
+	 * yields -ERANGE.
+	 */
+	if (ns->sem_ctls[3] < 0 || ns->sem_ctls[3] > (1<<15))
+		return -ERANGE;
+
+	return 0;
+}
+
+#endif
diff --git a/drivers/vkernel/include/utils.h b/drivers/vkernel/include/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bcb29e144caeea180b2532e585fa31ac2f43dc2
--- /dev/null
+++ b/drivers/vkernel/include/utils.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _VKERNEL_UTILS_H
+#define _VKERNEL_UTILS_H
+
+int vk_kallsyms_init(void);
+void vk_kallsyms_uninit(void);
+
+unsigned long lookup_name(const char *name);
+
+#endif
diff --git a/drivers/vkernel/mm/mm.c b/drivers/vkernel/mm/mm.c
new file mode 100644
index 0000000000000000000000000000000000000000..6eee69ccf25a1a37197c74db7da3a478d48b9f76
--- /dev/null
+++ b/drivers/vkernel/mm/mm.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen + */ + +#include "mm.h" + +int vk_init_memory_pref(struct vkernel_mem_pref *mem) +{ + mem->default_policy.refcnt = (atomic_t)ATOMIC_INIT(1); + mem->default_policy.mode = MPOL_LOCAL; + + mem->shmem_huge = SHMEM_HUGE_NEVER; + mem->thp_flags = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<thp_flags; + + if (desc->numa_mode >= 0 && desc->numa_mode < MPOL_MAX) { + /* TODO: Setup all fields */ + // mem->default_policy.mode = desc->numa_mode; + pr_info("set default numa policy is not supported yet\n"); + } + + if (desc->shmem_enabled >= SHMEM_HUGE_FORCE && + desc->shmem_enabled <= SHMEM_HUGE_ADVISE) + mem->shmem_huge = desc->shmem_enabled; + + if (desc->thp_enabled > -1 && + desc->thp_enabled < TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags); + if (desc->thp_enabled > TRANSPARENT_HUGEPAGE_UNSUPPORTED) + set_bit(desc->thp_enabled, flags); + } + + if (desc->thp_defrag > -1 && + desc->thp_defrag < TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags); + if (desc->thp_defrag > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) + set_bit(desc->thp_defrag, flags); + } + + if (desc->thp_use_zero_page == 0) + clear_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); + else if (desc->thp_use_zero_page == 1) + set_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); + + return 0; +} diff --git a/drivers/vkernel/sched/cpu.c b/drivers/vkernel/sched/cpu.c new file mode 100644 index 0000000000000000000000000000000000000000..16af7da434801a63fa4448bd5566c4fb34e2551f --- /dev/null +++ b/drivers/vkernel/sched/cpu.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen
+ */
+
+#include "sched.h"
+
+/*
+ * Reset @cpu to its defaults: SCHED_NORMAL policy, with the round-robin
+ * timeslice and the wakeup granularity both zero. Always returns 0.
+ */
+int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu)
+{
+	cpu->wakeup_gran_us = 0;
+	cpu->rr_timeslice_us = 0;
+	cpu->policy = SCHED_NORMAL;
+
+	return 0;
+}
+
+/* Nothing to release for a CPU preference; kept for init/uninit symmetry. */
+void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu)
+{
+}
+
+/*
+ * Copy the fields of @desc that carry a value into vk->cpu_pref: the policy
+ * when it is not negative, the two microsecond settings when they are
+ * greater than zero. Fields that do not qualify are left untouched.
+ * Always returns 0.
+ */
+int vkernel_set_cpu_pref(struct vkernel *vk, struct vkernel_cpu_desc *desc)
+{
+	struct vkernel_cpu_pref *pref = &vk->cpu_pref;
+
+	if (desc->policy >= 0)
+		pref->policy = desc->policy;
+
+	if (desc->rr_timeslice_us > 0)
+		pref->rr_timeslice_us = desc->rr_timeslice_us;
+
+	if (desc->wakeup_gran_us > 0)
+		pref->wakeup_gran_us = desc->wakeup_gran_us;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_cpu_pref);
diff --git a/drivers/vkernel/security/capability.c b/drivers/vkernel/security/capability.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b07101f260b70aeb1464e473d0b1aa41efec385
--- /dev/null
+++ b/drivers/vkernel/security/capability.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+
+#include "security.h"
+#include "utils.h"
+
+/* Capability helpers resolved by name in vk_cap_init(). */
+int (*cap_capget_ptr)(struct task_struct *target, kernel_cap_t *effective,
+		kernel_cap_t *inheritable, kernel_cap_t *permitted);
+int (*cap_capset_ptr)(struct cred *new, const struct cred *old,
+		const kernel_cap_t *effective,
+		const kernel_cap_t *inheritable,
+		const kernel_cap_t *permitted);
+int (*cap_task_prctl_ptr)(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5);
+
+/*
+ * Resolve cap_capget, cap_capset and cap_task_prctl through lookup_name().
+ * Returns 0 when all three were found, -ENOENT otherwise (a bare -1 would
+ * read as -EPERM to callers that propagate the value).
+ */
+int vk_cap_init(void)
+{
+	cap_capget_ptr = (void *)lookup_name("cap_capget");
+	cap_capset_ptr = (void *)lookup_name("cap_capset");
+	cap_task_prctl_ptr = (void *)lookup_name("cap_task_prctl");
+	if (!cap_capget_ptr || !cap_capset_ptr || !cap_task_prctl_ptr) {
+		pr_err("failed to find cap symbols, get: %p, set: %p, prctl: %p\n",
+			cap_capget_ptr, cap_capset_ptr, cap_task_prctl_ptr);
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+/* Nothing to undo: vk_cap_init() only stores function pointers. */
+void vk_cap_uninit(void) {}
+
+/*
+ * Decide whether @cred may use @cap in @ns under the limits of @vk.
+ *
+ * The vkernel limit is only applied when the task is not running with
+ * overridden credentials and @cred's effective set is not a subset of
+ * vk->linux_cap.effective. In that case the namespace chain is walked from
+ * @ns upwards and the vkernel's effective set is used in place of @cred's.
+ * Returns 0 when the capability is allowed, -EPERM otherwise. @opts is not
+ * used.
+ */
+int vk_cap_capable(struct vkernel *vk, const struct cred *cred, struct user_namespace *ns,
+		int cap, unsigned int opts)
+{
+	/* Check cred and real_cred to allow fs override_creds */
+	if (current_cred() != current_real_cred() ||
+	    cap_issubset(cred->cap_effective, vk->linux_cap.effective))
+		return 0;
+
+	pr_debug("vkernel: cap eff %llx escalated? use vk eff %llx instead\n",
+		cred->cap_effective.val, vk->linux_cap.effective.val);
+	for (;;) {
+		/* Reached the cred's own namespace: the vkernel set decides */
+		if (ns == cred->user_ns)
+			return cap_raised(vk->linux_cap.effective, cap) ? 0 : -EPERM;
+		/* @ns is not below the cred's namespace: no capability there */
+		if (ns->level <= cred->user_ns->level)
+			return -EPERM;
+		/* The owner of a child namespace holds every capability in it */
+		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
+			return 0;
+		ns = ns->parent;
+	}
+}
+
+/*
+ * Set cap for `current`, and `current` should be vk->init_process
+ *
+ * Note: this operation will take effect immediately.
+ *
+ * Steps: record @cap in vk->linux_cap, drop from the bounding set every
+ * capability missing from cap->bset (only when CAP_SETPCAP is effective),
+ * install the effective/inheritable/permitted sets through a new cred, then
+ * raise or lower each ambient capability to match cap->ambient.
+ * Returns 0 on success, -ENOMEM if no cred can be prepared, or the error
+ * reported by the capset/prctl helper that failed.
+ */
+int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap)
+{
+	kernel_cap_t effective, inheritable, permitted;
+	struct cred *cred;
+	int action;
+	int ret;
+	int i;
+
+	vk->linux_cap = *cap;
+
+	/* Get current [effective,inheritable,permitted] */
+	cap_capget_ptr(vk->init_process, &effective, &inheritable, &permitted);
+
+	/* Drop bset according to linux_cap, which affects the following capset */
+	if (cap_raised(effective, CAP_SETPCAP)) {
+		for (i = 0; i <= CAP_LAST_CAP; i++) {
+			if (!cap_raised(cap->bset, i)) {
+				ret = cap_task_prctl_ptr(PR_CAPBSET_DROP, i, 0, 0, 0);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	/* Set current [effective,inheritable,permitted], ambient is automatically updated */
+	cred = prepare_creds();
+	if (!cred)
+		return -ENOMEM;
+	ret = cap_capset_ptr(cred, current_cred(), &cap->effective, &cap->inheritable,
+			&cap->permitted);
+	if (ret) {
+		/* The prepared cred was never committed; release it */
+		abort_creds(cred);
+		return ret;
+	}
+	commit_creds(cred);
+
+	/*
+	 * Raise or lower ambient according to linux_cap. CAP_LAST_CAP is
+	 * itself a valid capability, so the bound is inclusive like the
+	 * bounding-set loop above.
+	 */
+	for (i = 0; i <= CAP_LAST_CAP; i++) {
+		if (cap_raised(cap->ambient, i))
+			action = PR_CAP_AMBIENT_RAISE;
+		else
+			action = PR_CAP_AMBIENT_LOWER;
+		ret = cap_task_prctl_ptr(PR_CAP_AMBIENT, action, i, 0, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_linux_cap);
diff --git a/drivers/vkernel/syscall.c b/drivers/vkernel/syscall.c
new file mode 100644
index 0000000000000000000000000000000000000000..0aa03db71dcc87b9ee55c2231e40f47538754ba9
--- /dev/null
+++ b/drivers/vkernel/syscall.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+#include
+#include
+
+#include "syscall.h"
+#include "utils.h"
+
+/* Native syscall table, resolved by name in vk_syscall_init(). */
+sys_call_vk_t *sys_call_table_ptr;
+
+/* Kernel helpers resolved by name in vk_syscall_init(). */
+int (*force_sig_seccomp_ptr)(int syscall, int reason, bool force_coredump);
+void (*do_exit_ptr)(long code);
+
+/* Default rule: make syscall @name fail with ENOSYS. */
+#define NOTIF_SYSCALL_RULE(name) \
+{ \
+	.nr = __NR_##name, \
+	.act = (VKERNEL_SYSCALL_ACT_ERRNO << VKERNEL_SYSCALL_ERRNO_BITS) | ENOSYS, \
+}
+
+/* Rules installed by vk_install_default_syscalls(). */
+static struct vkernel_syscall_rule_desc def_rules[] = {
+	NOTIF_SYSCALL_RULE(move_pages),
+	NOTIF_SYSCALL_RULE(fsconfig),
+	NOTIF_SYSCALL_RULE(kexec_load),
+	// NOTIF_SYSCALL_RULE(sysfs),
+	NOTIF_SYSCALL_RULE(fsopen),
+	NOTIF_SYSCALL_RULE(pkey_mprotect),
+	// NOTIF_SYSCALL_RULE(ustat),
+	NOTIF_SYSCALL_RULE(pkey_free),
+	NOTIF_SYSCALL_RULE(pkey_alloc),
+	NOTIF_SYSCALL_RULE(userfaultfd),
+	NOTIF_SYSCALL_RULE(migrate_pages),
+	NOTIF_SYSCALL_RULE(add_key),
+	NOTIF_SYSCALL_RULE(keyctl),
+	NOTIF_SYSCALL_RULE(clone3),
+	NOTIF_SYSCALL_RULE(kexec_file_load),
+	NOTIF_SYSCALL_RULE(swapoff),
+	NOTIF_SYSCALL_RULE(fsmount),
+	NOTIF_SYSCALL_RULE(open_tree),
+	// NOTIF_SYSCALL_RULE(_sysctl),
+	NOTIF_SYSCALL_RULE(move_mount),
+	NOTIF_SYSCALL_RULE(swapon),
+	NOTIF_SYSCALL_RULE(pivot_root),
+	NOTIF_SYSCALL_RULE(fspick),
+};
+
+/* Slab cache backing every struct vkernel_syscall_rule. */
+static struct kmem_cache *syscall_rule_cache;
+
+/*
+ * Resolve the native syscall table and the helpers used by the action
+ * handlers, then create the rule slab cache.
+ * Returns 0 on success, -ENOENT when a symbol cannot be found and -ENOMEM
+ * when the cache cannot be created.
+ */
+int vk_syscall_init(void)
+{
+	sys_call_table_ptr = (void *)lookup_name("sys_call_table");
+	if (!sys_call_table_ptr) {
+		pr_err("failed to find sys_call_table\n");
+		return -ENOENT;
+	}
+
+	force_sig_seccomp_ptr = (void *)lookup_name("force_sig_seccomp");
+	if (!force_sig_seccomp_ptr) {
+		pr_err("failed to find force_sig_seccomp\n");
+		return -ENOENT;
+	}
+
+	do_exit_ptr = (void *)lookup_name("do_exit");
+	/* Check the pointer just looked up, not force_sig_seccomp_ptr again */
+	if (!do_exit_ptr) {
+		pr_err("failed to find do_exit\n");
+		return -ENOENT;
+	}
+
+	syscall_rule_cache = kmem_cache_create("vkernel_syscall_rule",
+			sizeof(struct vkernel_syscall_rule), 0, SLAB_ACCOUNT, NULL);
+	if (!syscall_rule_cache) {
+		pr_err("failed to create slab for syscall rule\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Destroy the rule slab cache created by vk_syscall_init(). */
+void vk_syscall_uninit(void)
+{
+	kmem_cache_destroy(syscall_rule_cache);
+}
+
+/*
+ * Evaluate one comparison of @arg. @oprand2 is only used by the masked
+ * comparison, which tests (arg & oprand1) == oprand2. An unknown @op never
+ * matches.
+ */
+static inline bool check_cond(int op, unsigned long arg,
+		unsigned long oprand1, unsigned long oprand2)
+{
+	switch (op) {
+	case VKERNEL_SYSCALL_CMP_EQ:
+		return arg == oprand1;
+	case VKERNEL_SYSCALL_CMP_NE:
+		return arg != oprand1;
+	case VKERNEL_SYSCALL_CMP_LT:
+		return arg < oprand1;
+	case VKERNEL_SYSCALL_CMP_LE:
+		return arg <= oprand1;
+	case VKERNEL_SYSCALL_CMP_GT:
+		return arg > oprand1;
+	case VKERNEL_SYSCALL_CMP_GE:
+		return arg >= oprand1;
+	case VKRENEL_SYSCALL_CMP_ME:
+		return (arg & oprand1) == oprand2;
+	}
+
+	return false;
+}
+
+
+/*
+ * Return true when every condition of @rule, up to the
+ * VKERNEL_SYSCALL_CMP_ED terminator, holds for the syscall arguments in
+ * @regs. A NULL rule matches; a condition naming an argument index outside
+ * the six syscall arguments never matches.
+ */
+static bool check_rule(struct vkernel_syscall_rule *rule, struct pt_regs *regs)
+{
+	struct vkernel_syscall_cond *cond;
+	unsigned long args[6];
+	int i;
+
+	/* Corner case */
+	if (!rule)
+		return true;
+
+	syscall_get_arguments(current, regs, args);
+	for (i = 0; i < 6; i++) {
+		cond = &rule->conds[i];
+		if (cond->op == VKERNEL_SYSCALL_CMP_ED)
+			break;
+		/* Never read outside args[] on a malformed condition */
+		if ((unsigned int)cond->index >= ARRAY_SIZE(args))
+			return false;
+		if (!check_cond(cond->op, args[cond->index], cond->oprand1, cond->oprand2))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Table entry for syscalls that have conditional rules: the action of the
+ * first matching rule of the syscall's chain is applied, or the vkernel's
+ * default action when no rule matches.
+ */
+asmlinkage long vk_sys_act_cond(const struct pt_regs *regs)
+{
+	struct vkernel *vk;
+	struct vkernel_syscall_rule *rule;
+	struct pt_regs *curr_regs;
+	int nr;
+	unsigned int act;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	/* Use the per-CPU record when it belongs to us, else look it up */
+	if (likely(current_vk_task == current))
+		vk = current_vk;
+	else
+		vk = vkernel_find_vk_by_task(current);
+
+	act = vk->syscall.def_act;
+	list_for_each_entry(rule, &vk->syscall.rule_chains[nr], link) {
+		if (check_rule(rule, curr_regs)) {
+			act = rule->act;
+			break;
+		}
+	}
+
+	switch (act >> VKERNEL_SYSCALL_ERRNO_BITS) {
+	case VKERNEL_SYSCALL_ACT_TRAP:
+		pr_info("vkernel: cond trap for syscall %d\n", nr);
+		syscall_rollback(current, curr_regs);
+		force_sig_seccomp_ptr(nr, -EPERM, false);
+		fallthrough;
+	case VKERNEL_SYSCALL_ACT_ERRNO:
+		pr_info("vkernel: cond err for syscall %d\n", nr);
+		/*
+		 * Widen before negating: negating the unsigned value first
+		 * would yield a large positive number once converted to long.
+		 */
+		return -(long)(act & VKERNEL_SYSCALL_ERRNO_MASK);
+
+	case VKERNEL_SYSCALL_ACT_USER_NOTIF:
+		pr_info("vkernel: cond user notif (nosys) for syscall %d\n", nr);
+		return -ENOSYS;
+
+	case VKERNEL_SYSCALL_ACT_TRACE:
+		pr_info("vkernel: cond trace (nosys) for syscall %d\n", nr);
+		return -ENOSYS;
+
+	case VKERNEL_SYSCALL_ACT_LOG:
+		pr_info("vkernel: cond log for syscall %d\n", nr);
+		fallthrough;
+	case VKERNEL_SYSCALL_ACT_ALLOW:
+		return sys_call_table_ptr[nr](regs);
+
+	case VKERNEL_SYSCALL_ACT_KILL_PROCESS:
+	case VKERNEL_SYSCALL_ACT_KILL_THREAD:
+	default:
+		pr_info("vkernel: cond kill process/thread for syscall %d\n", nr);
+		/*
+		 * Compare against the vkernel action code; the seccomp
+		 * constant lives in a different value space and made every
+		 * kill-thread request take the kill-process path.
+		 */
+		if ((act >> VKERNEL_SYSCALL_ERRNO_BITS) != VKERNEL_SYSCALL_ACT_KILL_THREAD ||
+		    (atomic_read(&current->signal->live) == 1)) {
+			/* Show the original registers in the dump. */
+			syscall_rollback(current, curr_regs);
+			/* Trigger a coredump with SIGSYS */
+			force_sig_seccomp_ptr(nr, -EPERM, true);
+		} else {
+			/* Call do_exit since there is missing unified pt_reg api */
+			do_exit_ptr(SIGSYS);
+		}
+		return -1;
+	}
+
+	/* We never get here */
+	unreachable();
+
+	return -1;
+}
+
+/* Placeholder for the invalid action; logs and returns -ENOSYS. */
+asmlinkage long vk_sys_act_invalid(const struct pt_regs *regs)
+{
+	pr_info("invalid syscall, never get here\n");
+	return -ENOSYS;
+}
+
+/* Unconditional kill-process action: roll back and raise SIGSYS with a coredump. */
+asmlinkage long vk_sys_act_kill_process(const struct pt_regs *regs)
+{
+	struct pt_regs *curr_regs;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	pr_info("vkernel: kill process for syscall %d\n", nr);
+	syscall_rollback(current, curr_regs);
+	force_sig_seccomp_ptr(nr, -EPERM, true);
+
+	return -1;
+}
+
+/*
+ * Unconditional kill-thread action: the last live thread of the process is
+ * handled like kill-process, any other thread exits through do_exit().
+ */
+asmlinkage long vk_sys_act_kill_thread(const struct pt_regs *regs)
+{
+	struct pt_regs *curr_regs;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	pr_info("vkernel: kill thread for syscall %d\n", nr);
+	if ((atomic_read(&current->signal->live) == 1)) {
+		syscall_rollback(current, curr_regs);
+		force_sig_seccomp_ptr(nr, -EPERM, true);
+	} else {
+		/* Call do_exit since there is missing unified pt_reg api */
+		do_exit_ptr(SIGSYS);
+	}
+
+	return -1;
+}
+
+/* Unconditional trap action: roll back and raise SIGSYS without a coredump. */
+asmlinkage long vk_sys_act_trap(const struct pt_regs *regs)
+{
+	struct pt_regs *curr_regs;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	pr_info("vkernel: trap for syscall %d\n", nr);
+	syscall_rollback(current, curr_regs);
+	force_sig_seccomp_ptr(nr, -EPERM, false);
+
+	return -1;
+}
+
+/* User-notification action; only logs and returns -ENOSYS. */
+asmlinkage long vk_sys_act_user_notif(const struct pt_regs *regs)
+{
+	pr_err("vkernel: user notif for syscall nr %d\n",
+		syscall_get_nr(current, current_pt_regs()));
+	return -ENOSYS;
+}
+
+/* Trace action; only logs and returns -ENOSYS. */
+asmlinkage long vk_sys_act_trace(const struct pt_regs *regs)
+{
+	pr_err("vkernel: trace for syscall nr %d\n",
+		syscall_get_nr(current, current_pt_regs()));
+	return -ENOSYS;
+}
+
+/*
+ * Unconditional errno action: return the negated errno carried by the first
+ * rule of the syscall's chain, or by the default action when the chain is
+ * empty.
+ */
+asmlinkage long vk_sys_act_errno(const struct pt_regs *regs)
+{
+	struct vkernel *vk;
+	struct vkernel_syscall_rule *rule;
+	struct pt_regs *curr_regs;
+	int nr;
+	int err;
+
+	if (likely(current_vk_task == current))
+		vk = current_vk;
+	else
+		vk = vkernel_find_vk_by_task(current);
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	if (list_empty(&vk->syscall.rule_chains[nr]))
+		err = vk->syscall.def_act & 0xffff;
+	else {
+		rule = list_first_entry(&vk->syscall.rule_chains[nr],
+				struct vkernel_syscall_rule, link);
+		err = rule->act & VKERNEL_SYSCALL_ERRNO_MASK;
+	}
+
+	pr_err("vkernel: err for syscall nr %d errno -%d\n", nr, err);
+	return -err;
+}
+
+/* Log action: log the syscall number, then run the native handler. */
+asmlinkage long vk_sys_act_log(const struct pt_regs *regs)
+{
+	int nr;
+
+	nr = syscall_get_nr(current, current_pt_regs());
+	pr_info("vkernel: log for syscall %d\n", nr);
+
+	return sys_call_table_ptr[nr](regs);
+}
+
+/* Free every rule on @chain and leave it as an empty list. */
+static void clear_syscall_rule_chain(struct list_head *chain)
+{
+	struct vkernel_syscall_rule *rule;
+	struct vkernel_syscall_rule *tmp;
+
+	list_for_each_entry_safe(rule, tmp, chain, link) {
+		list_del(&rule->link);
+		kmem_cache_free(syscall_rule_cache, rule);
+	}
+	INIT_LIST_HEAD(chain);
+}
+
+/*
+ * Start @syscall as a copy of the native table with empty rule chains and
+ * an "allow" default action. Always returns 0.
+ */
+int vk_init_syscall(struct vkernel_syscall *syscall)
+{
+	int i;
+
+	for (i = 0; i < NR_syscalls; i++) {
+		syscall->table[i] = sys_call_table_ptr[i];
+		INIT_LIST_HEAD(&syscall->rule_chains[i]);
+	}
+	syscall->def_act = VKERNEL_SYSCALL_ACT_ALLOW << VKERNEL_SYSCALL_ERRNO_BITS;
+
+	return 0;
+}
+
+/* Free the rules of every chain of @syscall. */
+void vk_uninit_syscall(struct vkernel_syscall *syscall)
+{
+	int i;
+
+	for (i = 0; i < NR_syscalls; i++)
+		clear_syscall_rule_chain(&syscall->rule_chains[i]);
+}
+
+/*
+ * Install @call as the handler of syscall @nr, discarding any rules queued
+ * for it. Returns -EINVAL when @nr is out of range, 0 otherwise.
+ */
+int vkernel_set_syscall(struct vkernel_syscall *syscall, unsigned int nr,
+		sys_call_vk_t call)
+{
+	if (unlikely(nr >= NR_syscalls))
+		return -EINVAL;
+
+	clear_syscall_rule_chain(&syscall->rule_chains[nr]);
+	syscall->table[nr] = call;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_syscall);
+
+/* Handler for each unconditional action, indexed by action code. */
+static sys_call_vk_t uncond_table[] = {
+	[VKERNEL_SYSCALL_ACT_INVALID] = vk_sys_act_invalid,
+	[VKERNEL_SYSCALL_ACT_KILL_PROCESS] = vk_sys_act_kill_process,
+	[VKERNEL_SYSCALL_ACT_KILL_THREAD] = vk_sys_act_kill_thread,
+	[VKERNEL_SYSCALL_ACT_TRAP] = vk_sys_act_trap,
+	[VKERNEL_SYSCALL_ACT_ERRNO] = vk_sys_act_errno,
+	[VKERNEL_SYSCALL_ACT_USER_NOTIF] = vk_sys_act_user_notif,
+	[VKERNEL_SYSCALL_ACT_TRACE] = vk_sys_act_trace,
+	[VKERNEL_SYSCALL_ACT_LOG] = vk_sys_act_log,
+};
+
+/*
+ * Call before adding rules
+ *
+ * Replace the default action of @syscall with @act: every rule chain is
+ * emptied and every table entry is pointed at the handler of the new
+ * action (the native handler for "allow").
+ * Returns -EINVAL when the action code is invalid or @act equals the
+ * current default, 0 otherwise.
+ */
+int vkernel_set_default_syscall_rule(struct vkernel_syscall *syscall, u32 act)
+{
+	unsigned int action;
+	int i;
+
+	action = act >> VKERNEL_SYSCALL_ERRNO_BITS;
+	if (action == VKERNEL_SYSCALL_ACT_INVALID ||
+	    action > VKERNEL_SYSCALL_ACT_ALLOW ||
+	    act == syscall->def_act) {
+		pr_err("invalid default rule, act 0x%x, old 0x%x\n", act, syscall->def_act);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < NR_syscalls; i++) {
+		clear_syscall_rule_chain(&syscall->rule_chains[i]);
+		if (action < VKERNEL_SYSCALL_ACT_ALLOW)
+			syscall->table[i] = uncond_table[action];
+		else
+			syscall->table[i] = sys_call_table_ptr[i];
+	}
+	syscall->def_act = act;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_default_syscall_rule);
+
+/*
+ * Attach the rule described by @desc to its syscall and repoint the table
+ * slot: at vk_sys_act_cond when the rule carries at least one condition,
+ * otherwise at the unconditional handler for the action (or the real
+ * syscall for ALLOW).
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range syscall number, an
+ * INVALID or too large action, or a rule equal to the default action on an
+ * empty chain, and -ENOMEM when the rule cannot be allocated.
+ */
+int vkernel_add_syscall_rule(struct vkernel_syscall *syscall,
+			     struct vkernel_syscall_rule_desc *desc)
+{
+	struct vkernel_syscall_rule *rule;
+	unsigned int nr;
+	unsigned int action;
+	int index;
+
+	pr_debug("set syscall rule, nr %u act 0x%x has_cond %d\n",
+		 desc->nr, desc->act, desc->conds[0].op != VKERNEL_SYSCALL_CMP_ED);
+
+	nr = desc->nr;
+	action = (desc->act >> VKERNEL_SYSCALL_ERRNO_BITS);
+	/* nr is range-checked first, so rule_chains[nr] below is in bounds. */
+	if (nr >= NR_syscalls ||
+	    action == VKERNEL_SYSCALL_ACT_INVALID ||
+	    action > VKERNEL_SYSCALL_ACT_ALLOW ||
+	    (desc->act == syscall->def_act && list_empty(&syscall->rule_chains[nr]))) {
+		pr_err("invalid rule, nr %u act 0x%x def_act 0x%x\n",
+		       desc->nr, desc->act, syscall->def_act);
+		return -EINVAL;
+	}
+
+	/* Update syscall rule chain */
+	rule = kmem_cache_alloc(syscall_rule_cache, GFP_KERNEL_ACCOUNT);
+	if (!rule) {
+		pr_err("failed to alloc syscall rule\n");
+		return -ENOMEM;
+	}
+
+	rule->act = desc->act;
+	/*
+	 * Copy conditions up to and including the CMP_ED terminator; index
+	 * ends up as the number of real conditions.
+	 * NOTE(review): 6 is presumably the size of conds[] - confirm.
+	 */
+	for (index = 0; index < 6; index++) {
+		rule->conds[index] = desc->conds[index];
+		if (desc->conds[index].op == VKERNEL_SYSCALL_CMP_ED)
+			break;
+	}
+	list_add(&rule->link, &syscall->rule_chains[nr]);
+
+	/* Update syscall table */
+	if (index > 0)
+		syscall->table[nr] = vk_sys_act_cond;
+	else if (action < VKERNEL_SYSCALL_ACT_ALLOW)
+		syscall->table[nr] = uncond_table[action];
+	else
+		syscall->table[nr] = sys_call_table_ptr[nr];
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_add_syscall_rule);
+
+/*
+ * Add every entry of def_rules to @syscall. Failures are not propagated;
+ * vkernel_add_syscall_rule() logs them itself.
+ */
+void vk_install_default_syscalls(struct vkernel_syscall *syscall)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(def_rules); i++)
+		vkernel_add_syscall_rule(syscall, &def_rules[i]);
+}
+EXPORT_SYMBOL(vk_install_default_syscalls);
+
+
+/* Number of execs[] slots allocated up front by analysis_post_create(). */
+#define VK_ANALYSIS_INIT_EXEC_CAPACITY	64
+
+/*
+ * Per-vkernel data of the "analysis" custom type.
+ * syscalls[]     - saturating hit counter per syscall number
+ * exec_count     - used entries of execs[]
+ * exec_capacity  - allocated entries of execs[]
+ * execs[]        - __getname() buffers with the paths passed to execve(at)
+ */
+struct vkernel_analysis {
+	unsigned int syscalls[NR_syscalls + 1];
+	unsigned int exec_count;
+	unsigned int exec_capacity;
+	char *execs[];
+};
+
+/*
+ * Syscall handler of the analysis type: counts the syscall, records the
+ * path argument of execve/execveat, then runs the real syscall.
+ *
+ * Returns the result of the real syscall, or -ENOMEM, -EFAULT or
+ * -ENAMETOOLONG when the path cannot be recorded (the syscall is then not
+ * executed).
+ */
+asmlinkage long vk_sys_act_analysis(const struct pt_regs *regs)
+{
+	struct vkernel *vk;
+	struct vkernel_analysis *data;
+	struct vkernel_analysis *newdata;
+	char __user *uname;
+	char *kname;
+	struct pt_regs *curr_regs;
+	unsigned int new_capacity;
+	long len;
+	int nr;
+
+	if (likely(current_vk_task == current))
+		vk = current_vk;
+	else
+		vk = vkernel_find_vk_by_task(current);
+	data = (struct vkernel_analysis *)vk->private;
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	if (data->syscalls[nr] < UINT_MAX)
+		data->syscalls[nr]++;
+	if (nr == __NR_execve || nr == __NR_execveat) {
+		kname = __getname();
+		if (unlikely(!kname)) {
+			pr_err("failed to alloc name\n");
+			return -ENOMEM;
+		}
+		/* The path is argument 0 of execve and argument 1 of execveat. */
+		if (nr == __NR_execve)
+			uname = (char __user *)regs_get_kernel_argument(curr_regs, 0);
+		else
+			uname = (char __user *)regs_get_kernel_argument(curr_regs, 1);
+		len = strncpy_from_user(kname, uname, PATH_MAX);
+		if (len < 0) {
+			pr_err("failed to copy user filename\n");
+			__putname(kname);
+			return -EFAULT;
+		}
+		/*
+		 * A result equal to the limit means no NUL was copied; the
+		 * buffer must not be stored, it is printed with %s later.
+		 */
+		if (len == PATH_MAX) {
+			__putname(kname);
+			return -ENAMETOOLONG;
+		}
+		if (data->exec_count >= data->exec_capacity) {
+			/* Doubling zero would never grow the array. */
+			new_capacity = data->exec_capacity ?
+				data->exec_capacity << 1 :
+				VK_ANALYSIS_INIT_EXEC_CAPACITY;
+			newdata = kzalloc(sizeof(*data) +
+					  sizeof(char *) * new_capacity, GFP_KERNEL);
+			if (!newdata) {
+				/* Do not leak the name buffer on this path. */
+				__putname(kname);
+				return -ENOMEM;
+			}
+			memcpy(newdata, data, sizeof(*data) + sizeof(char *) * data->exec_capacity);
+			newdata->exec_capacity = new_capacity;
+
+			vk->private = newdata;
+			/* TODO: fix race window */
+			while (refcount_read(&vk->users_count) > 1)
+				;
+			kfree(data);
+			data = newdata;
+		}
+		data->execs[data->exec_count++] = kname;
+	}
+
+	return sys_call_table_ptr[nr](regs);
+}
+
+/*
+ * seq_file show callback: prints the collected data as a JSON-like object
+ * with the list of seen syscall numbers, the recorded exec paths and the
+ * per-syscall counters. Paths are printed unescaped.
+ */
+static int analysis_show(struct seq_file *m, void *v)
+{
+	struct vkernel *vk = m->private;
+	struct vkernel_analysis *data = vk->private;
+	unsigned int i;
+	bool first;
+
+	seq_puts(m, "{\n");
+	seq_puts(m, " \"syscalls\": [");
+	first = true;
+	for (i = 0; i < NR_syscalls; i++) {
+		if (!data->syscalls[i])
+			continue;
+		if (first) {
+			seq_printf(m, "%u", i);
+			first = false;
+		} else
+			seq_printf(m, ", %u", i);
+	}
+	seq_puts(m, "],\n");
+	seq_puts(m, " \"execs\": [\n");
+	first = true;
+	for (i = 0; i < data->exec_count; i++) {
+		if (unlikely(!data->execs[i])) {
+			pr_warn("encounter nil exec path in vkernel_analysis\n");
+			continue;
+		}
+		if (first) {
+			seq_printf(m, " \"%s\"", data->execs[i]);
+			first = false;
+		} else
+			seq_printf(m, ",\n \"%s\"", data->execs[i]);
+	}
+	seq_puts(m, "\n ],\n");
+	seq_puts(m, " \"syscall_details\": [\n");
+	first = true;
+	for (i = 0; i < NR_syscalls; i++) {
+		if (!data->syscalls[i])
+			continue;
+		if (first) {
+			seq_printf(m, " {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]);
+			first = false;
+		} else
+			seq_printf(m, ",\n {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]);
+	}
+	seq_puts(m, "\n ]\n");
+	seq_puts(m, "}\n");
+
+	return 0;
+}
+
+/*
+ * Open callback: takes a reference on the vkernel stored in the inode and
+ * sets up the single-record seq_file. The reference is dropped again when
+ * single_open() fails. Returns -ENOENT when no reference can be taken.
+ */
+static int analysis_open(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+	int r;
+
+	if (!vkernel_get_vk_safe(vk))
+		return -ENOENT;
+
+	r = single_open(file, analysis_show, inode->i_private);
+	if (r < 0)
+		vkernel_put_vk(vk);
+
+	return r;
+}
+
+/* Release callback: drops the reference taken by analysis_open(). */
+static int analysis_release(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+
+	vkernel_put_vk(vk);
+
+	return single_release(inode, file);
+}
+
+static const struct file_operations analysis_fops = {
+	.open = analysis_open,
+	.release = analysis_release,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+/*
+ * post_create hook: allocates the analysis data, routes every syscall of
+ * @vk through vk_sys_act_analysis and creates the "analysis" debugfs file.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int analysis_post_create(struct vkernel *vk)
+{
+	struct vkernel_analysis *data;
+	struct vkernel_syscall *syscall;
+	int i;
+
+	data = kzalloc(sizeof(*data) +
+		       sizeof(char *) * VK_ANALYSIS_INIT_EXEC_CAPACITY, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	/* Record the capacity that was really allocated. */
+	data->exec_capacity = VK_ANALYSIS_INIT_EXEC_CAPACITY;
+	vk->private = data;
+
+	syscall = &vk->syscall;
+	for (i = 0; i < NR_syscalls; i++)
+		syscall->table[i] = vk_sys_act_analysis;
+
+	debugfs_create_file("analysis", 0444, vk->debugfs_dentry, vk, &analysis_fops);
+
+	return 0;
+}
+
+/*
+ * pre_destroy hook: releases every recorded exec path and the analysis
+ * data itself, then clears vk->private.
+ */
+static void analysis_pre_destroy(struct vkernel *vk)
+{
+	struct vkernel_analysis *data = (struct vkernel_analysis *)vk->private;
+	unsigned int i;
+
+	if (unlikely(!data)) {
+		pr_warn("detroy an analysis vk without vkernel_analysis data\n");
+		return;
+	}
+
+	/* The paths were obtained with __getname() and are owned by data. */
+	for (i = 0; i < data->exec_count; i++) {
+		if (data->execs[i])
+			__putname(data->execs[i]);
+	}
+	kfree(data);
+	vk->private = NULL;
+}
+
+/* Descriptor of the "analysis" custom vkernel type. */
+struct vkernel_custom_type analysis_custom = {
+	.owner = THIS_MODULE,
+	.name = "analysis",
+	.post_create = analysis_post_create,
+	.pre_destroy = analysis_pre_destroy,
+};
diff --git a/drivers/vkernel/sysctl/fs.c b/drivers/vkernel/sysctl/fs.c
new file mode 100644
index 0000000000000000000000000000000000000000..d57ebae8cfb3defb1235704da1cfa18b6604c13b
--- /dev/null
+++ b/drivers/vkernel/sysctl/fs.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+
+#include "sysctl.h"
+
+/*
+ * Fill @fs with defaults: a max_files value derived from the page counts
+ * (never below NR_FILE), nr_open, the nr_files percpu counter, the
+ * nr_inodes/nr_unused percpu variables, lease settings and mount_max.
+ *
+ * Returns 0 on success or -ENOMEM. On failure everything allocated here has
+ * been released again and the percpu pointers are NULL, so a later
+ * vk_uninit_sysctl_fs() is harmless.
+ */
+int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs)
+{
+	unsigned long n;
+	unsigned long nr_pages = totalram_pages();
+	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
+
+	fs->nr_inodes = NULL;
+	fs->nr_unused = NULL;
+
+	memreserve = min(memreserve, nr_pages - 1);
+	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
+	fs->files_stat.max_files = max_t(unsigned long, n, NR_FILE);
+	fs->nr_open = 1024 * 1024;
+	if (percpu_counter_init(&fs->nr_files, 0, GFP_KERNEL)) {
+		pr_err("vkernel: failed to init sysctl_fs nr_files\n");
+		return -ENOMEM;
+	}
+
+	fs->nr_inodes = alloc_percpu_gfp(unsigned long, GFP_KERNEL);
+	if (!fs->nr_inodes) {
+		pr_err("vkernel: failed to alloc sysctl_fs nr_inodes\n");
+		goto err_destroy_counter;
+	}
+	fs->nr_unused = alloc_percpu_gfp(unsigned long, GFP_KERNEL);
+	if (!fs->nr_unused) {
+		pr_err("vkernel: failed to alloc sysctl_fs nr_unused\n");
+		goto err_free_inodes;
+	}
+
+	fs->leases_enable = 1;
+	fs->lease_break_time = 45;
+
+	fs->mount_max = 100000;
+
+	return 0;
+
+err_free_inodes:
+	free_percpu(fs->nr_inodes);
+	fs->nr_inodes = NULL;
+err_destroy_counter:
+	percpu_counter_destroy(&fs->nr_files);
+	return -ENOMEM;
+}
+
+/* Release the percpu resources of @fs; free_percpu() accepts NULL. */
+void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs)
+{
+	free_percpu(fs->nr_inodes);
+	fs->nr_inodes = NULL;
+	free_percpu(fs->nr_unused);
+	fs->nr_unused = NULL;
+
+	percpu_counter_destroy(&fs->nr_files);
+}
+
+/*
+ * Apply the fields of @desc that hold a usable value to @fs; zero (or, for
+ * leases_enable, anything but 0/1) leaves the current setting untouched.
+ * Always returns 0.
+ */
+int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc)
+{
+	if (desc->file_max)
+		fs->files_stat.max_files = desc->file_max;
+	if (desc->nr_open)
+		fs->nr_open = desc->nr_open;
+
+	if (desc->leases_enable == 0 || desc->leases_enable == 1)
+		fs->leases_enable = desc->leases_enable;
+	if (desc->lease_break_time > 0)
+		fs->lease_break_time = desc->lease_break_time;
+
+	if (desc->mount_max)
+		fs->mount_max = desc->mount_max;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_sysctl_fs);
diff --git a/drivers/vkernel/sysctl/kernel.c b/drivers/vkernel/sysctl/kernel.c
new file mode 100644
index 0000000000000000000000000000000000000000..5690e565a5b736eabfc05d9703b4c119f45f0faf
--- /dev/null
+++ b/drivers/vkernel/sysctl/kernel.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+#include
+
+#include "sysctl.h"
+
+/*
+ * Fill @k with the default scheduler, thread-limit, key and pty settings.
+ * max_threads is derived from the amount of RAM and clamped to
+ * [MIN_THREADS, MAX_THREADS]. Always returns 0.
+ */
+int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k)
+{
+	u64 threads;
+	unsigned long nr_pages = totalram_pages();
+
+	k->nb_mode = NUMA_BALANCING_DISABLED;
+	k->nb_promote_rate_limit = 65536;
+
+	k->sched_cfs_bandwidth_slice = 5000UL;
+	k->sched_child_runs_first = 0;
+
+	k->sched_dl_period_max = 1 << 22; /* ~4 seconds */
+	k->sched_dl_period_min = 100; /* 100 us */
+
+	k->sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+	k->sched_rt_period = 1000000;
+	k->sched_rt_runtime = 950000;
+
+	/*
+	 * The number of threads shall be limited such that the thread
+	 * structures may only consume a small part of the available memory.
+	 */
+	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
+		threads = MAX_THREADS;
+	else
+		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
+				    (u64) THREAD_SIZE * 8UL);
+	if (threads > MAX_THREADS)
+		threads = MAX_THREADS;
+	k->nr_threads = 0;
+	k->max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
+
+	k->key_gc_delay = 5 * 60;
+	k->persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
+	k->key_quota_root_maxbytes = 25000000;
+	k->key_quota_root_maxkeys = 1000000;
+	k->key_quota_maxbytes = 20000;
+	k->key_quota_maxkeys = 200;
+
+	k->pty_limit = NR_UNIX98_PTY_DEFAULT;
+	k->pty_reserve = NR_UNIX98_PTY_RESERVE;
+	k->pty_count = (atomic_t)ATOMIC_INIT(0);
+
+	return 0;
+}
+
+/* Nothing to release: vk_init_sysctl_kernel() allocates no resources. */
+void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k)
+{
+
+}
+
+/*
+ * Apply the fields of @desc that pass their per-field check to @k; all
+ * other settings keep their current value. max_threads is clamped to
+ * [MIN_THREADS, MAX_THREADS]. Always returns 0.
+ */
+int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k,
+			      struct vkernel_sysctl_kernel_desc *desc)
+{
+	if (desc->numa_balancing >= 0)
+		k->nb_mode = desc->numa_balancing;
+	if (desc->numa_balancing_promote_rate_limit > 0)
+		k->nb_promote_rate_limit = desc->numa_balancing_promote_rate_limit;
+
+	if (desc->sched_cfs_bandwidth_slice)
+		k->sched_cfs_bandwidth_slice = desc->sched_cfs_bandwidth_slice;
+	if (desc->sched_child_runs_first == 0 || desc->sched_child_runs_first == 1)
+		k->sched_child_runs_first = desc->sched_child_runs_first;
+
+	if (desc->sched_dl_period_max)
+		k->sched_dl_period_max = desc->sched_dl_period_max;
+	if (desc->sched_dl_period_min)
+		k->sched_dl_period_min = desc->sched_dl_period_min;
+
+	if (desc->sched_rr_timeslice > 0)
+		k->sched_rr_timeslice = desc->sched_rr_timeslice;
+	if (desc->sched_rt_period > 0)
+		k->sched_rt_period = desc->sched_rt_period;
+	if (desc->sched_rt_runtime > 0)
+		k->sched_rt_runtime = desc->sched_rt_runtime;
+
+	if (desc->max_threads > 0)
+		k->max_threads = clamp_t(u64, desc->max_threads, MIN_THREADS, MAX_THREADS);
+
+	if (desc->key_gc_delay)
+		k->key_gc_delay = desc->key_gc_delay;
+	if (desc->key_persistent_keyring_expiry)
+		k->persistent_keyring_expiry = desc->key_persistent_keyring_expiry;
+	if (desc->key_quota_root_maxbytes)
+		k->key_quota_root_maxbytes = desc->key_quota_root_maxbytes;
+	if (desc->key_quota_root_maxkeys)
+		k->key_quota_root_maxkeys = desc->key_quota_root_maxkeys;
+	if (desc->key_quota_maxbytes)
+		k->key_quota_maxbytes = desc->key_quota_maxbytes;
+	if (desc->key_quota_maxkeys)
+		k->key_quota_maxkeys = desc->key_quota_maxkeys;
+
+	if (desc->pty_limit > 0)
+		k->pty_limit = desc->pty_limit;
+	if (desc->pty_reserve > 0)
+		k->pty_reserve = desc->pty_reserve;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_sysctl_kernel);
diff --git a/drivers/vkernel/sysctl/net.c b/drivers/vkernel/sysctl/net.c
new file mode 100644
index 0000000000000000000000000000000000000000..deaff3d13d16a6afd61c3cb40c1875b5543eccd1
--- /dev/null
+++ b/drivers/vkernel/sysctl/net.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+
+#include "sysctl.h"
+#include "utils.h"
+
+/* Kernel symbols resolved by name in vk_init_sysctl_net(). */
+int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name);
+void (*rt_cache_flush_ptr)(struct net *net);
+void (*inet_netconf_notify_devconf_ptr)(struct net *net, int event, int type,
+					int ifindex, struct ipv4_devconf *devconf);
+
+// extern unsigned int nf_conntrack_max;
+
+/*
+ * Resolve the helper symbols, fill @net with default values and take a
+ * reference on the network namespace of @tsk.
+ *
+ * Returns 0 on success and -1 when a required symbol is missing, @tsk is
+ * NULL or @tsk has no nsproxy. On failure net->net holds an ERR_PTR, which
+ * vk_uninit_sysctl_net() skips.
+ */
+int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk)
+{
+	/* Mark "no namespace held" before any early return can happen. */
+	net->net = ERR_PTR(-ESRCH);
+
+	tcp_set_default_congestion_control_ptr =
+		(void *)lookup_name("tcp_set_default_congestion_control");
+	rt_cache_flush_ptr = (void *)lookup_name("rt_cache_flush");
+	inet_netconf_notify_devconf_ptr =
+		(void *)lookup_name("inet_netconf_notify_devconf");
+
+	/* congestion_control can be null */
+	if (!rt_cache_flush_ptr || !inet_netconf_notify_devconf_ptr) {
+		pr_err("failed to find net symbols, flush: %p, notify: %p\n",
+		       rt_cache_flush_ptr, inet_netconf_notify_devconf_ptr);
+		return -1;
+	}
+
+	if (!tsk) {
+		pr_err("failed to init sysctl net with invalid task\n");
+		return -1;
+	}
+
+	// net->nf_conntrack_max = nf_conntrack_max;
+	net->nf_conntrack_max = 1572864;
+
+	net->net_busy_poll = 0;
+	net->net_busy_read = 0;
+
+	net->weight_p = 64;
+	net->dev_weight_rx_bias = 1;
+	net->dev_weight_tx_bias = 1;
+	net->dev_rx_weight = 64;
+	net->dev_tx_weight = 64;
+
+	net->netdev_budget = 300;
+	net->netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
+	net->netdev_max_backlog = 1000;
+
+	net->optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+	net->wmem_max = SK_WMEM_MAX;
+	net->rmem_max = SK_RMEM_MAX;
+	net->wmem_default = SK_WMEM_MAX;
+	net->rmem_default = SK_RMEM_MAX;
+
+	rcu_read_lock();
+	task_lock(tsk);
+	if (tsk->nsproxy)
+		net->net = get_net(tsk->nsproxy->net_ns);
+	task_unlock(tsk);
+	rcu_read_unlock();
+	if (IS_ERR(net->net)) {
+		pr_err("failed to get net ns, error %ld\n", PTR_ERR(net->net));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Drop the namespace reference taken by vk_init_sysctl_net(), if any. The
+ * pointer is reset afterwards so a second call does not put it again.
+ */
+void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net)
+{
+	if (!IS_ERR_OR_NULL(net->net))
+		put_net(net->net);
+	net->net = ERR_PTR(-ESRCH);
+}
+
+/* Which ipv4_devconf instance a devconf_*() helper is operating on. */
+enum {
+	DEVCONF_ALL,
+	DEVCONF_DFLT,
+	DEVCONF_OTHER
+};
+
+#define IPV4_DEVCONF_DFLT(net, attr) \
+	IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+/*
+ * Copy entry @i of the default devconf to every device of @net whose
+ * state bit @i is not set.
+ */
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+	struct net_device *dev;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct in_device *in_dev;
+
+		in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !test_bit(i, in_dev->cnf.state))
+			in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Propagate the "all" FORWARDING value of @net to the default devconf and
+ * to every device, sending a netconf notification for each. Uses
+ * __in_dev_get_rtnl(), so the caller is expected to hold RTNL.
+ */
+static void inet_forward_change(struct net *net)
+{
+	struct net_device *dev;
+	int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+	IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+	inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+					NETCONFA_FORWARDING,
+					NETCONFA_IFINDEX_ALL,
+					net->ipv4.devconf_all);
+	inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+					NETCONFA_FORWARDING,
+					NETCONFA_IFINDEX_DEFAULT,
+					net->ipv4.devconf_dflt);
+
+	for_each_netdev(net, dev) {
+		struct in_device *in_dev;
+
+		if (on)
+			dev_disable_lro(dev);
+
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev) {
+			IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+			inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+							NETCONFA_FORWARDING,
+							dev->ifindex, &in_dev->cnf);
+		}
+	}
+}
+
+/*
+ * Map @cnf to the ifindex used in netconf notifications: the DEFAULT or
+ * ALL pseudo index for the namespace-wide instances, otherwise the index
+ * of the device embedding @cnf.
+ */
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+	struct in_device *idev;
+
+	if (cnf == net->ipv4.devconf_dflt)
+		return NETCONFA_IFINDEX_DEFAULT;
+	else if (cnf == net->ipv4.devconf_all)
+		return NETCONFA_IFINDEX_ALL;
+
+	idev = container_of(cnf, struct in_device, cnf);
+	return idev->dev->ifindex;
+}
+
+/*
+ * Generic devconf setter. Stores @val in entry @i (1-based) of @conf and
+ * marks it as explicitly set, then performs the follow-up work for that
+ * entry: copy to devices for the default instance, route cache flush or
+ * netconf notification. Always returns 0.
+ */
+int devconf_proc(struct net *net, struct ipv4_devconf *conf,
+		 int val, int i, int type)
+{
+	int old_val;
+	int ifindex;
+
+	old_val = conf->data[i - 1];
+	conf->data[i - 1] = val;
+
+	set_bit(i - 1, conf->state);
+
+	if (type == DEVCONF_DFLT)
+		devinet_copy_dflt_conf(net, i - 1); // inline
+	if (i == IPV4_DEVCONF_ACCEPT_LOCAL || i == IPV4_DEVCONF_ROUTE_LOCALNET)
+		if (conf->data[i - 1] == 0 && old_val != 0)
+			rt_cache_flush_ptr(net);
+
+	if (i == IPV4_DEVCONF_BC_FORWARDING && conf->data[i - 1] != old_val)
+		rt_cache_flush_ptr(net);
+
+	if (i == IPV4_DEVCONF_RP_FILTER && conf->data[i - 1] != old_val) {
+		ifindex = devinet_conf_ifindex(net, conf); // inline
+		inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+						NETCONFA_RP_FILTER,
+						ifindex, conf);
+	}
+	if (i == IPV4_DEVCONF_PROXY_ARP && conf->data[i - 1] != old_val) {
+		ifindex = devinet_conf_ifindex(net, conf);
+		inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+						NETCONFA_PROXY_NEIGH,
+						ifindex, conf);
+	}
+	if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN && conf->data[i - 1] != old_val) {
+		ifindex = devinet_conf_ifindex(net, conf);
+		inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+						NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						ifindex, conf);
+	}
+
+	return 0;
+}
+
+/*
+ * Setter for the FORWARDING entry. Stores @val in entry @i (1-based) of
+ * @conf; when the value changed, the change is propagated:
+ *  - DEVCONF_DFLT: only a netconf notification is sent;
+ *  - otherwise RTNL is taken, the change is applied to all devices
+ *    (DEVCONF_ALL) or to the device embedding @conf, RTNL is released and
+ *    the route cache is flushed.
+ *
+ * Returns 0 on success, or -EBUSY with the old value restored when RTNL
+ * cannot be taken without blocking.
+ */
+int devconf_forward(struct net *net, struct ipv4_devconf *conf,
+		    int val, int i, int type)
+{
+	int old_val;
+
+	old_val = conf->data[i - 1];
+	conf->data[i - 1] = val;
+	if (conf->data[i - 1] == old_val)
+		return 0;
+
+	if (type == DEVCONF_DFLT) {
+		inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+						NETCONFA_FORWARDING,
+						NETCONFA_IFINDEX_DEFAULT,
+						conf);
+		return 0;
+	}
+
+	if (!rtnl_trylock()) {
+		conf->data[i - 1] = old_val;
+		return -EBUSY;
+	}
+	if (type == DEVCONF_ALL) {
+		inet_forward_change(net); // inline
+	} else {
+		struct in_device *idev =
+			container_of(conf, struct in_device, cnf);
+
+		dev_disable_lro(idev->dev);
+		inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF,
+						NETCONFA_FORWARDING,
+						idev->dev->ifindex,
+						conf);
+	}
+	/* Every successful rtnl_trylock() must be paired with an unlock. */
+	rtnl_unlock();
+	/* A forwarding change invalidates cached routes. */
+	rt_cache_flush_ptr(net);
+
+	return 0;
+}
+
+/*
+ * Setter for entries that only need a route cache flush: stores @val in
+ * entry @i (1-based) of @conf and flushes when the value changed. Always
+ * returns 0.
+ */
+int devconf_flush(struct net *net, struct ipv4_devconf *conf,
+		  int val, int i, int type)
+{
+	int old_val;
+
+	old_val = conf->data[i - 1];
+	conf->data[i - 1] = val;
+	if (conf->data[i - 1] != old_val)
+		rt_cache_flush_ptr(net);
+
+	return 0;
+}
+
+/*
+ * Apply @desc to @net. The vkernel-local settings are stored in @net
+ * itself; the remaining ones are written to the network namespace held in
+ * net->net. A field is applied only when it passes its per-field check.
+ *
+ * Returns 0 on success, -EINVAL when @net holds no valid namespace (the
+ * vkernel-local settings have been applied by then).
+ */
+int vkernel_set_sysctl_net(struct vkernel_sysctl_net *net, struct vkernel_sysctl_net_desc *desc)
+{
+	struct net *n = net->net;
+	int weight;
+	int val;
+	int i;
+
+	/* netns specific */
+	if (desc->nf_conntrack_max)
+		net->nf_conntrack_max = desc->nf_conntrack_max;
+
+	/* core, poll/select specific */
+	net->net_busy_poll = desc->core_busy_poll;
+	net->net_busy_read = desc->core_busy_read;
+
+	/* napi_struct specific */
+	if (desc->core_dev_weight > 0) {
+		net->weight_p = desc->core_dev_weight;
+		weight = READ_ONCE(net->weight_p);
+		WRITE_ONCE(net->dev_rx_weight, weight * net->dev_weight_rx_bias);
+		WRITE_ONCE(net->dev_tx_weight, weight * net->dev_weight_tx_bias);
+	}
+
+	/* softnet_data specific */
+	if (desc->core_netdev_budget > 0)
+		net->netdev_budget = desc->core_netdev_budget;
+	if (desc->core_netdev_budget_us > 0)
+		net->netdev_budget_usecs = desc->core_netdev_budget_us;
+	if (desc->core_netdev_max_backlog > 0)
+		net->netdev_max_backlog = desc->core_netdev_max_backlog;
+
+	/* sock specific (netns specific) */
+	if (desc->core_optmem_max > 0)
+		net->optmem_max = desc->core_optmem_max;
+	if (desc->core_wmem_max)
+		net->wmem_max = desc->core_wmem_max;
+	if (desc->core_rmem_max)
+		net->rmem_max = desc->core_rmem_max;
+	if (desc->core_wmem_default)
+		net->wmem_default = desc->core_wmem_default;
+	if (desc->core_rmem_default)
+		net->rmem_default = desc->core_rmem_default;
+
+	/* net ns specific */
+
+	/* Everything below dereferences the namespace. */
+	if (IS_ERR_OR_NULL(n)) {
+		pr_err("vkernel: sysctl net has no valid net namespace\n");
+		return -EINVAL;
+	}
+
+	/* core */
+	if (desc->core_somaxconn)
+		n->core.sysctl_somaxconn = desc->core_somaxconn;
+
+	/* ipv4 */
+	if (desc->ipv4_icmp_echo_ignore_broadcasts == 0 ||
+	    desc->ipv4_icmp_echo_ignore_broadcasts == 1)
+		n->ipv4.sysctl_icmp_echo_ignore_broadcasts = desc->ipv4_icmp_echo_ignore_broadcasts;
+	/*
+	 * An inverted range is rejected.
+	 * NOTE(review): the range is written without taking any lock -
+	 * confirm whether readers of ip_local_ports need one here.
+	 */
+	if (desc->ipv4_ip_local_port_range[0] > 0 && desc->ipv4_ip_local_port_range[1] > 0 &&
+	    desc->ipv4_ip_local_port_range[0] <= desc->ipv4_ip_local_port_range[1]) {
+		n->ipv4.ip_local_ports.range[0] = desc->ipv4_ip_local_port_range[0];
+		n->ipv4.ip_local_ports.range[1] = desc->ipv4_ip_local_port_range[1];
+	}
+	if (desc->ipv4_max_tw_buckets > 0)
+		n->ipv4.tcp_death_row.sysctl_max_tw_buckets = desc->ipv4_max_tw_buckets;
+	if (desc->ipv4_tcp_ecn <= 2)
+		n->ipv4.sysctl_tcp_ecn = desc->ipv4_tcp_ecn;
+	if (desc->ipv4_ip_default_ttl >= 1 && desc->ipv4_ip_default_ttl <= 255)
+		n->ipv4.sysctl_ip_default_ttl = desc->ipv4_ip_default_ttl;
+	if (desc->ipv4_ip_no_pmtu_disc == 0 || desc->ipv4_ip_no_pmtu_disc == 1)
+		n->ipv4.sysctl_ip_no_pmtu_disc = desc->ipv4_ip_no_pmtu_disc;
+	/* Time values arrive in seconds and are stored in jiffies. */
+	if (desc->ipv4_tcp_keepalive_time > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, desc->ipv4_tcp_keepalive_time * HZ);
+	if (desc->ipv4_tcp_keepalive_intvl > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, desc->ipv4_tcp_keepalive_intvl * HZ);
+	if (desc->ipv4_tcp_keepalive_probes)
+		n->ipv4.sysctl_tcp_keepalive_probes = desc->ipv4_tcp_keepalive_probes;
+	if (desc->ipv4_tcp_syn_retries >= 1 && desc->ipv4_tcp_syn_retries <= MAX_TCP_SYNCNT)
+		n->ipv4.sysctl_tcp_syn_retries = desc->ipv4_tcp_syn_retries;
+	if (desc->ipv4_tcp_synack_retries)
+		n->ipv4.sysctl_tcp_synack_retries = desc->ipv4_tcp_synack_retries;
+	if (desc->ipv4_tcp_syncookies >= 0 && desc->ipv4_tcp_syncookies <= 2)
+		n->ipv4.sysctl_tcp_syncookies = desc->ipv4_tcp_syncookies;
+	if (desc->ipv4_tcp_reordering > 0)
+		n->ipv4.sysctl_tcp_reordering = desc->ipv4_tcp_reordering;
+	if (desc->ipv4_tcp_retries1 && desc->ipv4_tcp_retries1 <= 255)
+		n->ipv4.sysctl_tcp_retries1 = desc->ipv4_tcp_retries1;
+	if (desc->ipv4_tcp_retries2)
+		n->ipv4.sysctl_tcp_retries2 = desc->ipv4_tcp_retries2;
+	if (desc->ipv4_tcp_orphan_retries)
+		n->ipv4.sysctl_tcp_orphan_retries = desc->ipv4_tcp_orphan_retries;
+	if (desc->ipv4_tcp_tw_reuse >= 0 && desc->ipv4_tcp_tw_reuse <= 2)
+		n->ipv4.sysctl_tcp_tw_reuse = desc->ipv4_tcp_tw_reuse;
+	if (desc->ipv4_tcp_fin_timeout > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, desc->ipv4_tcp_fin_timeout * HZ);
+	if (desc->ipv4_tcp_sack == 0 || desc->ipv4_tcp_sack == 1)
+		n->ipv4.sysctl_tcp_sack = desc->ipv4_tcp_sack;
+	if (desc->ipv4_tcp_window_scaling == 0 ||
+	    desc->ipv4_tcp_window_scaling == 1)
+		n->ipv4.sysctl_tcp_window_scaling = desc->ipv4_tcp_window_scaling;
+	if (desc->ipv4_tcp_timestamps == 0 || desc->ipv4_tcp_timestamps == 1)
+		n->ipv4.sysctl_tcp_timestamps = desc->ipv4_tcp_timestamps;
+	if (desc->ipv4_tcp_thin_linear_timeouts == 0 ||
+	    desc->ipv4_tcp_thin_linear_timeouts == 1)
+		n->ipv4.sysctl_tcp_thin_linear_timeouts = desc->ipv4_tcp_thin_linear_timeouts;
+	if (desc->ipv4_tcp_retrans_collapse == 0 ||
+	    desc->ipv4_tcp_retrans_collapse == 1)
+		n->ipv4.sysctl_tcp_retrans_collapse = desc->ipv4_tcp_retrans_collapse;
+	if (desc->ipv4_tcp_fack == 0 || desc->ipv4_tcp_fack == 1)
+		n->ipv4.sysctl_tcp_fack = desc->ipv4_tcp_fack;
+	if (desc->ipv4_tcp_adv_win_scale >= 0 && desc->ipv4_tcp_adv_win_scale <= 4)
+		n->ipv4.sysctl_tcp_adv_win_scale = desc->ipv4_tcp_adv_win_scale;
+	if (desc->ipv4_tcp_dsack == 0 || desc->ipv4_tcp_dsack == 1)
+		n->ipv4.sysctl_tcp_dsack = desc->ipv4_tcp_dsack;
+	if (desc->ipv4_tcp_nometrics_save == 0 || desc->ipv4_tcp_nometrics_save == 1)
+		n->ipv4.sysctl_tcp_nometrics_save = desc->ipv4_tcp_nometrics_save;
+	if (desc->ipv4_tcp_moderate_rcvbuf == 0 || desc->ipv4_tcp_moderate_rcvbuf == 1)
+		n->ipv4.sysctl_tcp_moderate_rcvbuf = desc->ipv4_tcp_moderate_rcvbuf;
+	if (desc->ipv4_tcp_min_tso_segs)
+		n->ipv4.sysctl_tcp_min_tso_segs = desc->ipv4_tcp_min_tso_segs;
+	if (desc->ipv4_tcp_wmem[0] > 0 && desc->ipv4_tcp_wmem[1] > 0 &&
+	    desc->ipv4_tcp_wmem[2] > 0) {
+		n->ipv4.sysctl_tcp_wmem[0] = desc->ipv4_tcp_wmem[0];
+		n->ipv4.sysctl_tcp_wmem[1] = desc->ipv4_tcp_wmem[1];
+		n->ipv4.sysctl_tcp_wmem[2] = desc->ipv4_tcp_wmem[2];
+	}
+	if (desc->ipv4_tcp_rmem[0] > 0 && desc->ipv4_tcp_rmem[1] > 0 &&
+	    desc->ipv4_tcp_rmem[2] > 0) {
+		n->ipv4.sysctl_tcp_rmem[0] = desc->ipv4_tcp_rmem[0];
+		n->ipv4.sysctl_tcp_rmem[1] = desc->ipv4_tcp_rmem[1];
+		n->ipv4.sysctl_tcp_rmem[2] = desc->ipv4_tcp_rmem[2];
+	}
+	if (desc->ipv4_max_syn_backlog > 0)
+		n->ipv4.sysctl_max_syn_backlog = desc->ipv4_max_syn_backlog;
+	if (desc->ipv4_tcp_fastopen == 1 || desc->ipv4_tcp_fastopen == 2 ||
+	    desc->ipv4_tcp_fastopen == 4)
+		n->ipv4.sysctl_tcp_fastopen = desc->ipv4_tcp_fastopen;
+	if (tcp_set_default_congestion_control_ptr && strlen(desc->ipv4_tcp_congestion_control) > 1)
+		tcp_set_default_congestion_control_ptr(n, desc->ipv4_tcp_congestion_control);
+
+	/* ipv4 conf */
+	for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) {
+		val = desc->ipv4_conf_all[i - 1];
+		if (val < 0)
+			continue;
+
+		if (i == IPV4_DEVCONF_FORWARDING) {
+			/* The only setter that can fail; do not drop it silently. */
+			if (devconf_forward(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL))
+				pr_warn("vkernel: failed to set ipv4 conf all forwarding\n");
+		} else if (i == IPV4_DEVCONF_NOXFRM ||
+			   i == IPV4_DEVCONF_NOPOLICY ||
+			   i == IPV4_DEVCONF_PROMOTE_SECONDARIES ||
+			   i == IPV4_DEVCONF_ROUTE_LOCALNET ||
+			   i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
+			devconf_flush(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL);
+		else
+			devconf_proc(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL);
+	}
+	/* ipv4 conf default */
+	for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) {
+		val = desc->ipv4_conf_default[i - 1];
+		if (val != 0 && val != 1)
+			continue;
+
+		if (i == IPV4_DEVCONF_FORWARDING)
+			devconf_forward(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+		else if (i == IPV4_DEVCONF_NOXFRM ||
+			 i == IPV4_DEVCONF_NOPOLICY ||
+			 i == IPV4_DEVCONF_PROMOTE_SECONDARIES ||
+			 i == IPV4_DEVCONF_ROUTE_LOCALNET ||
+			 i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
+			devconf_flush(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+		else
+			devconf_proc(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+	}
+
+	/* unix */
+	if (desc->unix_max_dgram_qlen > 0)
+		n->unx.sysctl_max_dgram_qlen = desc->unix_max_dgram_qlen;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_sysctl_net);
diff --git a/drivers/vkernel/sysctl/raw.c b/drivers/vkernel/sysctl/raw.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b67fa89370eb2f317be0f0fb0542f35c6507906
--- /dev/null
+++ b/drivers/vkernel/sysctl/raw.c
@@ -0,0 +1,689 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include "sysctl.h"
+#include "utils.h"
+
+enum {
+	DEVCONF_ALL,
+	DEVCONF_DFLT,
+	DEVCONF_OTHER
+};
+
+int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf) +{ + struct ipc_namespace *ipc_ns = NULL; + struct net *n; + char *name; + char *val; + char *p; + u64 uval; + s64 sval, old_sval, third_sval; + bool has_uval = false, has_sval = false; + + val = strchr(buf, '='); + if (!val) + return -EINVAL; + *val++ = 0; + name = strstrip(buf); + val = strstrip(val); + + if (!kstrtou64(val, 10, &uval)) + has_uval = true; + else + pr_warn("failed to parse raw sysctl val %s to u64\n", val); + if (!kstrtos64(val, 10, &sval)) + has_sval = true; + else + pr_warn("failed to parse raw sysctl val %s to s64\n", val); + + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + + n = vk->sysctl_net.net; + + if (!strcmp(name, "fs.file-max")) { + if (has_uval && uval) + vk->sysctl_fs.files_stat.max_files = uval; + } else if 
(!strcmp(name, "fs.nr_open")) { + if (has_uval && uval) + vk->sysctl_fs.nr_open = uval; + } else if (!strcmp(name, "fs.lease-break-time")) { + if (has_uval && sval > 0) + vk->sysctl_fs.leases_enable = sval; + } else if (!strcmp(name, "fs.leases-enable")) { + if (has_sval && (sval == 0 || sval == 1)) + vk->sysctl_fs.lease_break_time = sval; + } else if (!strcmp(name, "fs.mount-max")) { + if (has_uval && uval) + vk->sysctl_fs.mount_max = uval; + } else if (!strcmp(name, "kernel.msgmax")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmax = uval; + } else if (!strcmp(name, "kernel.msgmnb")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmnb = uval; + } else if (!strcmp(name, "kernel.msgmni")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmni = uval; + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.msg_next_id")) { + if (has_sval && ipc_ns && sval >= -1) + ipc_ns->ids[IPC_MSG_IDS].next_id = sval; + } +#endif + else if (!strcmp(name, "kernel.sem")) { + if (ipc_ns) { + old_sval = ipc_ns->sem_ctls[3]; + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 4) { + if (!*p) + continue; + if (!kstrtos64(p, 10, &sval) && sval > 0) + ipc_ns->sem_ctls[uval] = sval; + uval++; + } + if (sem_check_semmni(ipc_ns)) + ipc_ns->sem_ctls[3] = old_sval; + } + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.sem_next_id")) { + if (has_sval && ipc_ns && sval >= -1) + ipc_ns->ids[IPC_SEM_IDS].next_id = sval; + } +#endif + else if (!strcmp(name, "kernel.shmall")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlall = uval; + } else if (!strcmp(name, "kernel.shmmax")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlmax = uval; + } else if (!strcmp(name, "kernel.shmmni")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlmni = uval; + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.shm_next_id")) { + if (has_uval && ipc_ns && uval) + ipc_ns->ids[IPC_SHM_IDS].next_id = uval; + } +#endif + else 
if (!strcmp(name, "kernel.shm_rmid_forced")) { + if (has_sval && ipc_ns && (sval == 0 || sval == 1)) + ipc_ns->shm_rmid_forced = sval; + } else if (!strcmp(name, "kernel.numa_balancing")) { + /* inactive */ + if (has_sval && sval >= 0) + vk->sysctl_kernel.nb_mode = sval; + } else if (!strcmp(name, "kernel.numa_balancing_promote_rate_limit_MBps")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.nb_promote_rate_limit = sval; + } else if (!strcmp(name, "kernel.sched_cfs_bandwidth_slice_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_cfs_bandwidth_slice = uval; + } else if (!strcmp(name, "kernel.sched_child_runs_first")) { + if (has_uval && (uval == 0 || uval == 1)) + vk->sysctl_kernel.sched_child_runs_first = uval; + } else if (!strcmp(name, "kernel.sched_deadline_period_max_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_dl_period_max = uval; + } else if (!strcmp(name, "kernel.sched_deadline_period_min_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_dl_period_min = uval; + } else if (!strcmp(name, "kernel.sched_rr_timeslice_ms")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rr_timeslice = sval; + } else if (!strcmp(name, "kernel.sched_rt_period_us")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rt_period = sval; + } else if (!strcmp(name, "kernel.sched_rt_runtime_us")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rt_runtime = sval; + } else if (!strcmp(name, "kernel.threads-max")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.max_threads = clamp_t(u64, sval, + MIN_THREADS, MAX_THREADS); + } else if (!strcmp(name, "kernel.keys.gc_delay")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_gc_delay = uval; + } else if (!strcmp(name, "kernel.keys.maxbytes")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_maxbytes = uval; + } else if (!strcmp(name, "kernel.keys.maxkeys")) { + if (has_uval && uval > 0) + 
vk->sysctl_kernel.key_quota_maxkeys = uval; + } else if (!strcmp(name, "kernel.keys.persistent_keyring_expiry")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.persistent_keyring_expiry = uval; + } else if (!strcmp(name, "kernel.keys.root_maxbytes")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_root_maxbytes = uval; + } else if (!strcmp(name, "kernel.keys.root_maxkeys")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_root_maxkeys = uval; + } else if (!strcmp(name, "kernel.pty.max")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.pty_limit = sval; + } else if (!strcmp(name, "kernel.pty.reserve")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.pty_reserve = sval; + } else if (!strcmp(name, "net.nf_conntrack_max")) { + if (has_uval && uval > 0) + vk->sysctl_net.nf_conntrack_max = uval; + } else if (!strcmp(name, "net.core.busy_poll")) { + if (has_uval) + vk->sysctl_net.net_busy_poll = uval; + } else if (!strcmp(name, "net.core.busy_read")) { + if (has_uval) + vk->sysctl_net.net_busy_read = uval; + } else if (!strcmp(name, "net.core.optmem_max")) { + if (has_sval && sval > 0) + vk->sysctl_net.optmem_max = sval; + } else if (!strcmp(name, "net.core.wmem_max")) { + if (has_uval && uval) + vk->sysctl_net.wmem_max = uval; + } else if (!strcmp(name, "net.core.rmem_max")) { + if (has_uval && uval) + vk->sysctl_net.rmem_max = uval; + } else if (!strcmp(name, "net.core.wmem_default")) { + if (has_uval && uval) + vk->sysctl_net.wmem_default = uval; + } else if (!strcmp(name, "net.core.rmem_default")) { + if (has_uval && uval) + vk->sysctl_net.rmem_default = uval; + } else if (!strcmp(name, "net.core.somaxconn")) { + if (has_uval && uval) + n->core.sysctl_somaxconn = uval; + } else if (!strcmp(name, "net.ipv4.icmp_echo_ignore_broadcasts")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_icmp_echo_ignore_broadcasts = uval; + } else if (!strcmp(name, "net.ipv4.ip_local_port_range")) { + uval = 0; + while ((p = strsep(&val, " 
\t")) != NULL && uval < 2) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else { + if (kstrtos64(p, 10, &old_sval)) + old_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0) { + n->ipv4.ip_local_ports.range[0] = sval; + n->ipv4.ip_local_ports.range[1] = old_sval; + } + } else if (!strcmp(name, "net.ipv4.tcp_max_tw_buckets")) { + if (has_sval && sval > 0) + n->ipv4.tcp_death_row.sysctl_max_tw_buckets = sval; + } else if (!strcmp(name, "net.ipv4.tcp_ecn")) { + if (has_uval && uval <= 2) + n->ipv4.sysctl_tcp_ecn = uval; + } else if (!strcmp(name, "net.ipv4.ip_default_ttl")) { + if (has_uval && (uval >= 1 && uval <= 255)) + n->ipv4.sysctl_ip_default_ttl = uval; + } else if (!strcmp(name, "net.ipv4.ip_no_pmtu_disc")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_ip_no_pmtu_disc = uval; + } else if (!strcmp(name, "net.ipv4.tcp_keepalive_time")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_keepalive_intvl")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_keepalive_probes")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_keepalive_probes = uval; + } else if (!strcmp(name, "net.ipv4.tcp_syn_retries")) { + if (has_uval && uval >= 1 && uval <= MAX_TCP_SYNCNT) + n->ipv4.sysctl_tcp_syn_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_synack_retries")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_synack_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_syncookies")) { + if (has_uval && uval >= 0 && uval <= 2) + n->ipv4.sysctl_tcp_syncookies = uval; + } else if (!strcmp(name, "net.ipv4.tcp_reordering")) { + if (has_sval && sval > 0) + n->ipv4.sysctl_tcp_reordering = sval; + } else if (!strcmp(name, "net.ipv4.tcp_retries1")) { + if (has_uval && uval && uval <= 255) + n->ipv4.sysctl_tcp_retries1 = uval; + } else if (!strcmp(name, 
"net.ipv4.tcp_retries2")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_retries2 = uval; + } else if (!strcmp(name, "net.ipv4.tcp_orphan_retries")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_orphan_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_tw_reuse")) { + if (has_uval && uval >= 0 && uval <= 2) + n->ipv4.sysctl_tcp_tw_reuse = uval; + } else if (!strcmp(name, "net.ipv4.tcp_fin_timeout")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_sack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_sack = uval; + } else if (!strcmp(name, "net.ipv4.tcp_window_scaling")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_window_scaling = uval; + } else if (!strcmp(name, "net.ipv4.tcp_timestamps")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_timestamps = uval; + } else if (!strcmp(name, "net.ipv4.tcp_thin_linear_timeouts")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_thin_linear_timeouts = uval; + } else if (!strcmp(name, "net.ipv4.tcp_retrans_collapse")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_retrans_collapse = uval; + } else if (!strcmp(name, "net.ipv4.tcp_fack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_fack = uval; + } else if (!strcmp(name, "net.ipv4.tcp_adv_win_scale")) { + if (has_sval && sval >= 0 && sval <= 4) + n->ipv4.sysctl_tcp_adv_win_scale = sval; + } else if (!strcmp(name, "net.ipv4.tcp_dsack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_dsack = uval; // ? 
+ } else if (!strcmp(name, "net.ipv4.tcp_nometrics_save")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_nometrics_save = uval; + } else if (!strcmp(name, "net.ipv4.tcp_moderate_rcvbuf")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_moderate_rcvbuf = uval; + } else if (!strcmp(name, "net.ipv4.tcp_min_tso_segs")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_min_tso_segs = uval; + } else if (!strcmp(name, "net.ipv4.tcp_wmem")) { + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 3) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else if (uval == 1) { + if (kstrtos64(p, 10, &old_sval)) + old_sval = 0; + } else { + if (kstrtos64(p, 10, &third_sval)) + third_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0 && third_sval > 0) { + n->ipv4.sysctl_tcp_wmem[0] = sval; + n->ipv4.sysctl_tcp_wmem[1] = old_sval; + n->ipv4.sysctl_tcp_wmem[2] = third_sval; + } + } else if (!strcmp(name, "net.ipv4.tcp_rmem")) { + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 3) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else if (uval == 1) { + if (kstrtos64(p, 10, &old_sval)) + old_sval = 0; + } else { + if (kstrtos64(p, 10, &third_sval)) + third_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0 && third_sval > 0) { + n->ipv4.sysctl_tcp_rmem[0] = sval; + n->ipv4.sysctl_tcp_rmem[1] = old_sval; + n->ipv4.sysctl_tcp_rmem[2] = third_sval; + } + } else if (!strcmp(name, "net.ipv4.max_syn_backlog")) { + if (has_sval && sval > 0) + n->ipv4.sysctl_max_syn_backlog = sval; + } else if (!strcmp(name, "net.ipv4.tcp_fastopen")) { + if (has_sval && (sval == 1 || sval == 2 || sval == 4)) + n->ipv4.sysctl_tcp_fastopen = sval; + } else if (!strcmp(name, "net.ipv4.tcp_congestion_control")) { + if (strlen(val) > 1) + tcp_set_default_congestion_control_ptr(n, val); + } else if (!strcmp(name, "net.ipv4.conf.all.forwarding")) { + if 
(has_sval && sval >= 0) + devconf_forward(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.mc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_MC_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.proxy_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROXY_ARP, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.secure_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.send_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.shared_media")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.rp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_RP_FILTER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_source_route")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.bootp_relay")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.log_martians")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.tag")) { 
+ if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_TAG, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARPFILTER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.medium_id")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_MEDIUM_ID, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.disable_xfrm")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_NOXFRM, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.disable_policy")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_NOPOLICY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.force_igmp_version")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_announce")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_ignore")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_IGNORE, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.promote_secondaries")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_accept")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_notify")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_local")) { + if (has_sval && 
sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.src_valid_mark")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SRC_VMARK, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.proxy_arp_pvlan")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.route_localnet")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.ignore_routes_with_linkdown")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.drop_unicast_in_l2_multicast")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.drop_gratuitous_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.bc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_BC_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_evict_nocarrier")) { + if (has_sval 
&& sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.default.forwarding")) { + if (has_sval && sval >= 0) + devconf_forward(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.mc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_MC_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROXY_ARP, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.secure_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.send_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.shared_media")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.rp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_RP_FILTER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_source_route")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.bootp_relay")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_DFLT); 
+ } else if (!strcmp(name, "net.ipv4.conf.default.log_martians")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.tag")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_TAG, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARPFILTER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.medium_id")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_MEDIUM_ID, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.disable_xfrm")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_NOXFRM, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.disable_policy")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_NOPOLICY, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.force_igmp_version")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_announce")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_ignore")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_IGNORE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.promote_secondaries")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_accept")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + 
IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_notify")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_local")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.src_valid_mark")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SRC_VMARK, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp_pvlan")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.route_localnet")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.igmpv3_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.ignore_routes_with_linkdown")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.drop_unicast_in_l2_multicast")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.drop_gratuitous_arp")) { + if (has_sval && sval >= 0) + 
devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.bc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_BC_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_evict_nocarrier")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.unix_max_dgram_qlen")) { + if (has_sval && sval > 0) + n->unx.sysctl_max_dgram_qlen = sval; + } else if (!strcmp(name, "vm.max_map_count")) { + if (has_sval && sval > 0) + vk->sysctl_vm.max_map_count = sval; + } else if (!strcmp(name, "vm.mmap_min_addr")) { + if (!has_uval && kstrtou64(val, 16, &uval)) { + pr_warn("failed to parse raw sysctl val %s to u64\n", val); + return -EINVAL; + } + if (uval) { + vk->sysctl_vm.dac_mmap_min_addr = uval; +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (vk->sysctl_vm.dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr; + else + vk->sysctl_vm.mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr; +#endif + } + } else if (!strcmp(name, "vm.overcommit_kbytes")) { + if (has_uval && uval) { + vk->sysctl_vm.overcommit_kbytes = uval; + vk->sysctl_vm.overcommit_ratio = 0; + } + } else if (!strcmp(name, "vm.overcommit_memory")) { + if (has_sval && sval > 0) { + if (sval == OVERCOMMIT_NEVER) + vk_sync_overcommit_as(vk); + vk->sysctl_vm.overcommit_memory = sval; + } + } else if (!strcmp(name, "vm.overcommit_ratio")) { + if (has_sval && sval) { + vk->sysctl_vm.overcommit_ratio = sval; + vk->sysctl_vm.overcommit_kbytes = 0; + } + } else { + pr_err("vkernel: unsupported sysctl %s\n", name); + return -EINVAL; + } + + pr_debug("handled sysctl %s\n", name); + return 0; +} diff --git a/drivers/vkernel/sysctl/vm.c 
b/drivers/vkernel/sysctl/vm.c new file mode 100644 index 0000000000000000000000000000000000000000..4b322b455da2eee05e584bd3fb49bf665d5bf49a --- /dev/null +++ b/drivers/vkernel/sysctl/vm.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include +#include + +#include "sysctl.h" + +static s32 vk_mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr * 2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages() / nr) / 256, INT_MAX); + + return max_t(s32, memsized_batch, batch); +} + +void vk_sync_overcommit_as(struct vkernel *vk) +{ + struct percpu_counter *fbc = &vk->sysctl_vm.vm_committed_as; + unsigned long flags; + int cpu; + s32 *pcount; + s32 count; + + raw_spin_lock_irqsave(&fbc->lock, flags); + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + pcount = per_cpu_ptr(fbc->counters, cpu); + count = *pcount; + fbc->count += count; + *pcount -= count; + } + raw_spin_unlock_irqrestore(&fbc->lock, flags); +} + +int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm) +{ + vm->max_map_count = DEFAULT_MAX_MAP_COUNT; + vm->dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (vm->dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + vm->mmap_min_addr = vm->dac_mmap_min_addr; + else + vm->mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + vm->mmap_min_addr = vm->dac_mmap_min_addr; +#endif + + vm->overcommit_memory = 0; + vm->overcommit_ratio = 50; + vm->overcommit_kbytes = 0; + vm->as_batch = vk_mm_compute_batch(); + if (percpu_counter_init(&vm->vm_committed_as, 0, GFP_KERNEL)) { + pr_err("vkernel: failed to init sysctl_vm vm_committed_as\n"); + return -ENOMEM; + } + + return 0; +} + +void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm) +{ + percpu_counter_destroy(&vm->vm_committed_as); +} + +int vkernel_set_sysctl_vm(struct vkernel_sysctl_vm *vm, 
struct vkernel_sysctl_vm_desc *desc) +{ + if (desc->max_map_count > 0) + vm->max_map_count = desc->max_map_count; + + if (desc->mmap_min_addr) { + vm->dac_mmap_min_addr = desc->mmap_min_addr; +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (vm->dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + vm->mmap_min_addr = vm->dac_mmap_min_addr; + else + vm->mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + vm->mmap_min_addr = vm->dac_mmap_min_addr; +#endif + } + + if (desc->overcommit_memory > 0) { + vm->overcommit_memory = desc->overcommit_memory; + if (desc->overcommit_ratio > 0) { + vm->overcommit_ratio = desc->overcommit_ratio; + vm->overcommit_kbytes = 0; + } else if (desc->overcommit_kbytes) { + vm->overcommit_ratio = 0; + vm->overcommit_kbytes = desc->overcommit_kbytes; + } + } + + return 0; +} +EXPORT_SYMBOL(vkernel_set_sysctl_vm); diff --git a/drivers/vkernel/utils/kallsyms.c b/drivers/vkernel/utils/kallsyms.c new file mode 100644 index 0000000000000000000000000000000000000000..613d1f0b28dc426c580fcf3c33b31d9f882ec66c --- /dev/null +++ b/drivers/vkernel/utils/kallsyms.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Wrapper of lookup_name + * Define the wrapper, so other components can include a function not a symbol + * + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include +#include + +#include "utils.h" + +/* + * There are two ways of preventing vicious recursive loops when hooking: + * - detect recusion using function return address (USE_FENTRY_OFFSET = 0) + * - avoid recusion by jumping over the ftrace call (USE_FENTRY_OFFSET = 1) + */ +#define USE_FENTRY_OFFSET 0 + +/* + * Tail call optimization can interfere with recursion detection based on + * return address on the stack. Disable it to avoid machine hangups. 
+ */ +#if !USE_FENTRY_OFFSET +#pragma GCC optimize("-fno-optimize-sibling-calls") +#endif + +unsigned long vk_lookup_name(const char *name) +{ + struct kprobe kp = { .symbol_name = name }; + unsigned long retval; + + if (register_kprobe(&kp) < 0) + return 0; + + retval = (unsigned long)kp.addr; + unregister_kprobe(&kp); + + return retval; +} + +static unsigned long (*kallsyms_lookup_name_ptr)(const char *name); + +int vk_kallsyms_init(void) +{ + kallsyms_lookup_name_ptr = (void *)vk_lookup_name("kallsyms_lookup_name"); + if (!kallsyms_lookup_name_ptr) { + pr_err("cannot resolve symbol: kallsyms_lookup_name\n"); + return -ENOENT; + } + + return 0; +} + +void vk_kallsyms_uninit(void) {} + +unsigned long lookup_name(const char *name) +{ + return kallsyms_lookup_name_ptr(name); +} +EXPORT_SYMBOL(lookup_name); diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c new file mode 100644 index 0000000000000000000000000000000000000000..32e6c0a428bb7622772692d86cd186b5216a0207 --- /dev/null +++ b/drivers/vkernel/vkernel_main.c @@ -0,0 +1,1444 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * vkernel core + * + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + **/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fs.h" +#include "mm.h" +#include "sched.h" +#include "security.h" +#include "syscall.h" +#include "sysctl.h" +#include "utils.h" + +MODULE_AUTHOR("JYH Lab"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("vkernel core module"); + +/* Worst case buffer size needed for holding an integer. 
*/ +#define ITOA_MAX_LEN 12 + +static DEFINE_MUTEX(vk_lock); +static LIST_HEAD(vk_list); + +static DEFINE_MUTEX(custom_lock); +static DEFINE_HASHTABLE(custom_ht, 6); + +struct dentry *vkernel_debugfs_dir; +EXPORT_SYMBOL_GPL(vkernel_debugfs_dir); + +static const struct file_operations vkernel_chardev_ops; + +#define CONFIG_VKERNEL_COMPAT + +#ifdef CONFIG_VKERNEL_COMPAT +#define VKERNEL_COMPAT(c) .compat_ioctl = (c) +#else +/* + * For architectures that don't implement a compat infrastructure, + * adopt a double line of defense: + * - Prevent a compat task from opening /dev/vkernel + * - If the open has been done by a 64bit task, and the vkernel fd + * passed to a compat task, let the ioctls fail. + */ +static long vkernel_no_compat_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg) +{ + return -EINVAL; +} + +static int vkernel_no_compat_open(struct inode *inode, struct file *file) +{ + return is_compat_task() ? -ENODEV : 0; +} +#define VKERNEL_COMPAT(c) .compat_ioctl = vkernel_no_compat_ioctl, \ + .open = vkernel_no_compat_open +#endif + +#define VKERNEL_EVENT_CREATE_VK 0 +#define VKERNEL_EVENT_DESTROY_VK 1 + +#define VKERNEL_CAP_MASK ((1 << VKERNEL_CAP_ISOLATE_ANON) |\ + (1 << VKERNEL_CAP_ISOLATE_ANON_PIPE) | \ + (1 << VKERNEL_CAP_ISOLATE_RAMFS)) + +static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk); +static DEFINE_MUTEX(event_lock); +static unsigned long long vkernel_createvk_count; +static unsigned long long vkernel_active_vks; + + +static int default_post_create(struct vkernel *vk) +{ + /* Set default syscall and acl rules */ + vk_install_default_syscalls(&vk->syscall); + return vkernel_set_default_acl_set(&vk->acl); +} + +static struct vkernel_custom_type default_custom = { + .owner = THIS_MODULE, + .name = "default", + .post_create = default_post_create, + .pre_destroy = NULL, +}; + +struct vkernel_custom_type *vkernel_find_custom(const char *name) +{ + struct vkernel_custom_type *custom; + unsigned int key; + + key 
= full_name_hash(NULL, name, strlen(name)); + + hash_for_each_possible(custom_ht, custom, hash, key) { + if (!strcmp(name, custom->name)) + return custom; + } + + return NULL; +} +EXPORT_SYMBOL(vkernel_find_custom); + +int vkernel_register_custom(struct vkernel_custom_type *custom) +{ + unsigned int key; + + if (!custom->owner) { + pr_err("custom type %s has no owner\n", custom->name); + return -EINVAL; + } + + if (vkernel_find_custom(custom->name)) { + pr_err("custom type %s already existed\n", custom->name); + return -EEXIST; + } + + key = full_name_hash(NULL, custom->name, strlen(custom->name)); + mutex_lock(&custom_lock); + hash_add(custom_ht, &custom->hash, key); + mutex_unlock(&custom_lock); + + pr_info("register cutom type %s\n", custom->name); + + return 0; +} +EXPORT_SYMBOL(vkernel_register_custom); + +int vkernel_unregister_custom(struct vkernel_custom_type *custom) +{ + pr_info("unregister cutom type %s\n", custom->name); + + mutex_lock(&custom_lock); + /* It is also ok to remove an unhashed custom */ + hash_del(&custom->hash); + mutex_unlock(&custom_lock); + + return 0; +} +EXPORT_SYMBOL(vkernel_unregister_custom); + + +__weak int vkernel_arch_vk_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return 0; +} + +__weak int vkernel_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return 0; +} + +static int vkernel_vk_ioctl_set_def_syscall(struct vkernel *vk, unsigned long arg) +{ + return vkernel_set_default_syscall_rule(&vk->syscall, arg); +} + +static int vkernel_vk_ioctl_restrict_syscall(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_syscall_rule_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_add_syscall_rule(&vk->syscall, &desc); +} + +static int vkernel_vk_ioctl_restrict_file(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_file_desc_set 
header; + struct vkernel_file_desc_set *set = NULL; + unsigned long full_size; + int r = 0; + + if (copy_from_user(&header, argp, sizeof(header))) { + r = -EFAULT; + goto out; + } + if (!header.nr_descs) { + r = -EINVAL; + goto out; + } + + full_size = sizeof(header) + sizeof(struct vkernel_file_desc) * header.nr_descs; + set = kmalloc(full_size, GFP_KERNEL); + if (!set) { + r = -ENOMEM; + goto out_set; + } + if (copy_from_user(set, argp, full_size)) { + r = -EFAULT; + goto out_set; + } + + r = vkernel_set_acl_set(&vk->acl, set); + +out_set: + kfree(set); +out: + return r; +} + +static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_linux_cap cap; + + if (copy_from_user(&cap, argp, sizeof(cap))) + return -EFAULT; + + return vkernel_set_linux_cap(vk, &cap); +} + +static int vkernel_vk_ioctl_set_cpu(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_cpu_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_cpu_pref(vk, &desc); +} + +static int vkernel_vk_ioctl_set_memory(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_mem_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_memory_pref(&vk->mem_pref, &desc); +} + +static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_fs_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_sysctl_fs(&vk->sysctl_fs, &desc); +} + +static int vkernel_vk_ioctl_set_sysctl_kernel(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_kernel_desc desc; + struct ipc_namespace *ipc_ns; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + 
/* Handle namespace fields */ + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + if (likely(ipc_ns)) { + if (desc.msgmax) + ipc_ns->msg_ctlmax = desc.msgmax; + if (desc.msgmnb) + ipc_ns->msg_ctlmnb = desc.msgmnb; + if (desc.msgmni) + ipc_ns->msg_ctlmni = desc.msgmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.msg_next_id >= -1) + ipc_ns->ids[IPC_MSG_IDS].next_id = desc.msg_next_id; +#endif + if (desc.semmsl > 0) + ipc_ns->sem_ctls[0] = desc.semmsl; + if (desc.semmns > 0) + ipc_ns->sem_ctls[1] = desc.semmns; + if (desc.semopm > 0) + ipc_ns->sem_ctls[2] = desc.semopm; + if (desc.semmni > 0) + ipc_ns->sem_ctls[3] = desc.semmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.sem_next_id >= -1) + ipc_ns->ids[IPC_SEM_IDS].next_id = desc.sem_next_id; +#endif + if (desc.shmall) + ipc_ns->shm_ctlall = desc.shmall; + if (desc.shmmax) + ipc_ns->shm_ctlmax = desc.shmmax; + if (desc.shmmni) + ipc_ns->shm_ctlmni = desc.shmmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.shm_next_id) + ipc_ns->ids[IPC_SHM_IDS].next_id = desc.shm_next_id; +#endif + if (desc.shm_rmid_forced == 0 || desc.shm_rmid_forced == 1) + ipc_ns->shm_rmid_forced = desc.shm_rmid_forced; + } + + return vkernel_set_sysctl_kernel(&vk->sysctl_kernel, &desc); +} + +static int vkernel_vk_ioctl_set_sysctl_net(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_net_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_sysctl_net(&vk->sysctl_net, &desc); +} + +static int vkernel_vk_ioctl_set_sysctl_vm(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_vm_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + if (desc.overcommit_memory == OVERCOMMIT_NEVER) + vk_sync_overcommit_as(vk); + + return vkernel_set_sysctl_vm(&vk->sysctl_vm, &desc); +} + +static int vkernel_vk_ioctl_check_extension(struct 
vkernel *vk, unsigned long arg) +{ + int r = 0; + + switch (arg) { + case VKERNEL_CAP_ISOLATE_LOG: + r = 0; + break; + default: + r = -EOPNOTSUPP; + break; + } + + return r; +} + +static int vkernel_vk_ioctl_enable_cap(struct vkernel *vk, unsigned long arg) +{ + int r = 0; + + if (arg >= VKERNEL_CAP_NUM) + return -EINVAL; + + if (vk->caps & (arg << 1)) + return 0; + + switch (arg) { + case VKERNEL_CAP_ISOLATE_LOG: + vk->log_ns = vk->pid_ns->ns.inum; + break; + default: + r = -EOPNOTSUPP; + } + + if (!r) + vk->caps |= (1 << arg); + + return r; +} + +static int stat_show(struct seq_file *m, void *v) +{ + struct vkernel *vk = m->private; + + seq_puts(m, "=== BASIC ===\n"); + seq_printf(m, "Name: %s\n", vk->name); + seq_printf(m, "Pid ns: %u\n", vk->pid_ns->ns.inum); + seq_printf(m, "Uts ns: %u\n", vk->uts_ns->ns.inum); + seq_printf(m, "Init pid: %d\n", vk->init_pid); + seq_printf(m, "Users count: %d\n", refcount_read(&vk->users_count)); + seq_printf(m, "Active: %d\n", vk->active); + + seq_puts(m, "=== SECURITY ===\n"); + seq_printf(m, "Syscall def act: %d\n", vk->syscall.def_act); + seq_printf(m, "Syscall do_futex %p\n", vk->syscall.do_futex); + seq_printf(m, "ACL bits: %d\n", vk->acl.bits); + seq_printf(m, "ACL active: %d\n", vk->acl.active); + seq_printf(m, "Cap inheritable: 0x%llx\n", vk->linux_cap.inheritable.val); + seq_printf(m, "Cap permitted: 0x%llx\n", vk->linux_cap.permitted.val); + seq_printf(m, "Cap effective: 0x%llx\n", vk->linux_cap.effective.val); + seq_printf(m, "Cap bset: 0x%llx\n", vk->linux_cap.bset.val); + seq_printf(m, "Cap ambient: 0x%llx\n", vk->linux_cap.ambient.val); + + seq_puts(m, "=== RESOURCE ===\n"); + seq_printf(m, "Cpu policy: %d\n", vk->cpu_pref.policy); + seq_printf(m, "Cpu rr timeslice: %lu\n", vk->cpu_pref.rr_timeslice_us); + seq_printf(m, "Cpu wakeup gran: %lu\n", vk->cpu_pref.wakeup_gran_us); + seq_printf(m, "Mem def polciy: %u\n", vk->mem_pref.default_policy.mode); + seq_printf(m, "Mem shmem huge: %d\n", vk->mem_pref.shmem_huge); 
+ seq_printf(m, "Mem thp flags: 0x%lx\n", vk->mem_pref.thp_flags); + + seq_puts(m, "EXTENSION CAP\n"); + seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps); + seq_printf(m, "Log ns: %u\n", vk->log_ns); + + seq_puts(m, "=== SYSCTL ===\n"); + seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files); + seq_printf(m, "fs.nr_open=%u\n", vk->sysctl_fs.nr_open); + seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); + seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); + seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); + seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode); + seq_printf(m, "kernel.numa_balancing_promote_rate_limit_MBps=%d\n", + vk->sysctl_kernel.nb_promote_rate_limit); + seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n", + vk->sysctl_kernel.sched_cfs_bandwidth_slice); + seq_printf(m, "kernel.sched_child_runs_first=%u\n", + vk->sysctl_kernel.sched_child_runs_first); + seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n", + vk->sysctl_kernel.sched_dl_period_max); + seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n", + vk->sysctl_kernel.sched_dl_period_min); + seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n", + vk->sysctl_kernel.sched_rr_timeslice); + seq_printf(m, "kernel.sched_rt_period_us=%d\n", + vk->sysctl_kernel.sched_rt_period); + seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", + vk->sysctl_kernel.sched_rt_runtime); + seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); + seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); + seq_printf(m, "kernel.keys.maxbytes=%u\n", vk->sysctl_kernel.key_quota_maxbytes); + seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); + seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", + vk->sysctl_kernel.persistent_keyring_expiry); + seq_printf(m, "kernel.keys.root_maxbytes=%u\n", + vk->sysctl_kernel.key_quota_root_maxbytes); + seq_printf(m, 
"kernel.keys.root_maxkeys=%u\n", + vk->sysctl_kernel.key_quota_root_maxkeys); + seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); + seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); + seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max); + seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll); + seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read); + seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max); + seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max); + seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max); + seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default); + seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default); + seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count); + seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr); + seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr); + seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes); + seq_printf(m, "vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); + seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); + + seq_puts(m, "=== OPERATION ===\n"); + seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); + seq_printf(m, "Op generic_permission: %p\n", vk->ops.generic_permission); + + seq_puts(m, "=== CUSTOM ===\n"); + seq_printf(m, "Custom type: %s\n", vk->custom->name); + seq_printf(m, "Custom post_create: %p\n", vk->custom->post_create); + seq_printf(m, "Custom pre_destroy: %p\n", vk->custom->pre_destroy); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + int r; + + if (!vkernel_get_vk_safe(vk)) + return -ENOENT; + + r = single_open(file, stat_show, inode->i_private); + if (r < 0) + vkernel_put_vk(vk); + + return r; +} + +static int 
stat_release(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + + vkernel_put_vk(vk); + + return single_release(inode, file); +} + +static const struct file_operations vk_stat_fops = { + .open = stat_open, + .release = stat_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int sysctl_show(struct seq_file *m, void *v) +{ + struct vkernel *vk = m->private; + struct ipc_namespace *ipc_ns = NULL; + struct net *n; + + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + + n = vk->sysctl_net.net; + + seq_puts(m, "=== fs ===\n"); + seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files); + seq_printf(m, "fs.nr_open=%u\n", vk->sysctl_fs.nr_open); + seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); + seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); + seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); + + seq_puts(m, "=== kernel ===\n"); + if (ipc_ns) { + seq_printf(m, "kernel.msgmax=%u\n", ipc_ns->msg_ctlmax); + seq_printf(m, "kernel.msgmnb=%u\n", ipc_ns->msg_ctlmnb); + seq_printf(m, "kernel.msgmni=%u\n", ipc_ns->msg_ctlmni); +#ifdef CONFIG_CHECKPOINT_RESTORE + seq_printf(m, "kernel.msg_next_id=%d\n", ipc_ns->ids[IPC_MSG_IDS].next_id); +#endif + seq_printf(m, "kernel.sem=%d %d %d\n", + ipc_ns->sem_ctls[0], ipc_ns->sem_ctls[1], ipc_ns->sem_ctls[2]); +#ifdef CONFIG_CHECKPOINT_RESTORE + seq_printf(m, "kernel.sem_next_id=%d\n", ipc_ns->ids[IPC_SEM_IDS].next_id); +#endif + seq_printf(m, "kernel.shmall=%lu\n", ipc_ns->shm_ctlall); + seq_printf(m, "kernel.shmmax=%lu\n", ipc_ns->shm_ctlmax); + seq_printf(m, "kernel.shmmni=%d\n", ipc_ns->shm_ctlmni); +#ifdef CONFIG_CHECKPOINT_RESTORE + seq_printf(m, "kernel.shm_next_id=%d\n", ipc_ns->ids[IPC_SHM_IDS].next_id); +#endif + seq_printf(m, "kernel.shm_rmid_forced=%d\n", ipc_ns->shm_rmid_forced); + } + seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode); + seq_printf(m, 
"kernel.numa_balancing_promote_rate_limit_MBps=%d\n", + vk->sysctl_kernel.nb_promote_rate_limit); + seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n", + vk->sysctl_kernel.sched_cfs_bandwidth_slice); + seq_printf(m, "kernel.sched_child_runs_first=%u\n", + vk->sysctl_kernel.sched_child_runs_first); + seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n", + vk->sysctl_kernel.sched_dl_period_max); + seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n", + vk->sysctl_kernel.sched_dl_period_min); + seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n", + vk->sysctl_kernel.sched_rr_timeslice); + seq_printf(m, "kernel.sched_rt_period_us=%d\n", + vk->sysctl_kernel.sched_rt_period); + seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", + vk->sysctl_kernel.sched_rt_runtime); + seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); + seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); + seq_printf(m, "kernel.keys.maxbytes=%u\n", vk->sysctl_kernel.key_quota_maxbytes); + seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); + seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", + vk->sysctl_kernel.persistent_keyring_expiry); + seq_printf(m, "kernel.keys.root_maxbytes=%u\n", + vk->sysctl_kernel.key_quota_root_maxbytes); + seq_printf(m, "kernel.keys.root_maxkeys=%u\n", + vk->sysctl_kernel.key_quota_root_maxkeys); + seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); + seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); + + seq_puts(m, "=== net ===\n"); + seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max); + seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll); + seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read); + seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max); + seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max); + seq_printf(m, "net.core.rmem_max=%u\n", 
vk->sysctl_net.rmem_max); + seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default); + seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default); + + seq_printf(m, "net.core.somaxconn=%d\n", n->core.sysctl_somaxconn); + seq_printf(m, "net.ipv4.icmp_echo_ignore_broadcasts=%u\n", + n->ipv4.sysctl_icmp_echo_ignore_broadcasts); + seq_printf(m, "net.ipv4.ip_local_port_range=%d %d\n", + n->ipv4.ip_local_ports.range[0], n->ipv4.ip_local_ports.range[1]); + seq_printf(m, "net.ipv4.tcp_max_tw_buckets=%d\n", + n->ipv4.tcp_death_row.sysctl_max_tw_buckets); + seq_printf(m, "net.ipv4.tcp_ecn=%u\n", n->ipv4.sysctl_tcp_ecn); + seq_printf(m, "net.ipv4.ip_default_ttl=%u\n", n->ipv4.sysctl_ip_default_ttl); + seq_printf(m, "net.ipv4.ip_no_pmtu_disc=%u\n", n->ipv4.sysctl_ip_no_pmtu_disc); + seq_printf(m, "net.ipv4.tcp_keepalive_time=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_keepalive_time) / HZ); + seq_printf(m, "net.ipv4.tcp_keepalive_intvl=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl) / HZ); + seq_printf(m, "net.ipv4.tcp_keepalive_probes=%u\n", + n->ipv4.sysctl_tcp_keepalive_probes); + seq_printf(m, "net.ipv4.tcp_syn_retries=%u\n", n->ipv4.sysctl_tcp_syn_retries); + seq_printf(m, "net.ipv4.tcp_synack_retries=%u\n", n->ipv4.sysctl_tcp_synack_retries); + seq_printf(m, "net.ipv4.tcp_syncookies=%u\n", n->ipv4.sysctl_tcp_syncookies); + seq_printf(m, "net.ipv4.tcp_reordering=%d\n", n->ipv4.sysctl_tcp_reordering); + seq_printf(m, "net.ipv4.tcp_retries1=%u\n", n->ipv4.sysctl_tcp_retries1); + seq_printf(m, "net.ipv4.tcp_retries2=%u\n", n->ipv4.sysctl_tcp_retries2); + seq_printf(m, "net.ipv4.tcp_orphan_retries=%u\n", n->ipv4.sysctl_tcp_orphan_retries); + seq_printf(m, "net.ipv4.tcp_tw_reuse=%u\n", n->ipv4.sysctl_tcp_tw_reuse); + seq_printf(m, "net.ipv4.tcp_fin_timeout=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_fin_timeout) / HZ); + seq_printf(m, "net.ipv4.tcp_sack=%u\n", n->ipv4.sysctl_tcp_sack); + seq_printf(m, "net.ipv4.tcp_window_scaling=%u\n", 
n->ipv4.sysctl_tcp_window_scaling); + seq_printf(m, "net.ipv4.tcp_timestamps=%u\n", n->ipv4.sysctl_tcp_timestamps); + seq_printf(m, "net.ipv4.tcp_thin_linear_timeouts=%u\n", + n->ipv4.sysctl_tcp_thin_linear_timeouts); + seq_printf(m, "net.ipv4.tcp_retrans_collapse=%u\n", n->ipv4.sysctl_tcp_retrans_collapse); + seq_printf(m, "net.ipv4.tcp_fack=%u\n", n->ipv4.sysctl_tcp_fack); + seq_printf(m, "net.ipv4.tcp_adv_win_scale=%d\n", n->ipv4.sysctl_tcp_adv_win_scale); + seq_printf(m, "net.ipv4.tcp_dsack=%u\n", n->ipv4.sysctl_tcp_dsack); + seq_printf(m, "net.ipv4.tcp_nometrics_save=%u\n", n->ipv4.sysctl_tcp_nometrics_save); + seq_printf(m, "net.ipv4.tcp_moderate_rcvbuf=%u\n", n->ipv4.sysctl_tcp_moderate_rcvbuf); + seq_printf(m, "net.ipv4.tcp_min_tso_segs=%u\n", n->ipv4.sysctl_tcp_min_tso_segs); + seq_printf(m, "net.ipv4.tcp_wmem=%d %d %d\n", + n->ipv4.sysctl_tcp_wmem[0], n->ipv4.sysctl_tcp_wmem[1], + n->ipv4.sysctl_tcp_wmem[2]); + seq_printf(m, "net.ipv4.tcp_rmem=%d %d %d\n", + n->ipv4.sysctl_tcp_rmem[0], n->ipv4.sysctl_tcp_rmem[1], + n->ipv4.sysctl_tcp_rmem[2]); + seq_printf(m, "net.ipv4.max_syn_backlog=%d\n", n->ipv4.sysctl_max_syn_backlog); + seq_printf(m, "net.ipv4.tcp_fastopen=%u\n", n->ipv4.sysctl_tcp_fastopen); + seq_printf(m, "net.ipv4.tcp_congestion_control=%s\n", + n->ipv4.tcp_congestion_control->name); + + seq_printf(m, "net.ipv4.conf.all.forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.all.mc_forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_MC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.all.proxy_arp=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_redirects=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.secure_redirects=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.send_redirects=%d\n", + 
n->ipv4.devconf_all->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.shared_media=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); + seq_printf(m, "net.ipv4.conf.all.rp_filter=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_RP_FILTER - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_source_route=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); + seq_printf(m, "net.ipv4.conf.all.bootp_relay=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); + seq_printf(m, "net.ipv4.conf.all.log_martians=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); + seq_printf(m, "net.ipv4.conf.all.tag=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_TAG - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_filter=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARPFILTER - 1]); + seq_printf(m, "net.ipv4.conf.all.medium_id=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_MEDIUM_ID - 1]); + seq_printf(m, "net.ipv4.conf.all.disable_xfrm=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_NOXFRM - 1]); + seq_printf(m, "net.ipv4.conf.all.disable_policy=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_NOPOLICY - 1]); + seq_printf(m, "net.ipv4.conf.all.force_igmp_version=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_announce=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_ignore=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_IGNORE - 1]); + seq_printf(m, "net.ipv4.conf.all.promote_secondaries=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_accept=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_notify=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_local=%d\n", + 
n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); + seq_printf(m, "net.ipv4.conf.all.src_valid_mark=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SRC_VMARK - 1]); + seq_printf(m, "net.ipv4.conf.all.proxy_arp_pvlan=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); + seq_printf(m, "net.ipv4.conf.all.route_localnet=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); + seq_printf(m, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.all.ignore_routes_with_linkdown=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); + seq_printf(m, "net.ipv4.conf.all.drop_unicast_in_l2_multicast=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); + seq_printf(m, "net.ipv4.conf.all.drop_gratuitous_arp=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); + seq_printf(m, "net.ipv4.conf.all.bc_forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_BC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_evict_nocarrier=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); + + seq_printf(m, "net.ipv4.conf.default.forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.mc_forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.proxy_arp=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_redirects=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.secure_redirects=%d\n", + 
n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.send_redirects=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.shared_media=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); + seq_printf(m, "net.ipv4.conf.default.rp_filter=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_RP_FILTER - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_source_route=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); + seq_printf(m, "net.ipv4.conf.default.bootp_relay=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); + seq_printf(m, "net.ipv4.conf.default.log_martians=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); + seq_printf(m, "net.ipv4.conf.default.tag=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_TAG - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_filter=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARPFILTER - 1]); + seq_printf(m, "net.ipv4.conf.default.medium_id=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MEDIUM_ID - 1]); + seq_printf(m, "net.ipv4.conf.default.disable_xfrm=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOXFRM - 1]); + seq_printf(m, "net.ipv4.conf.default.disable_policy=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOPOLICY - 1]); + seq_printf(m, "net.ipv4.conf.default.force_igmp_version=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_announce=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_ignore=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_IGNORE - 1]); + seq_printf(m, "net.ipv4.conf.default.promote_secondaries=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_accept=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); + 
seq_printf(m, "net.ipv4.conf.default.arp_notify=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_local=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); + seq_printf(m, "net.ipv4.conf.default.src_valid_mark=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SRC_VMARK - 1]); + seq_printf(m, "net.ipv4.conf.default.proxy_arp_pvlan=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); + seq_printf(m, "net.ipv4.conf.default.route_localnet=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); + seq_printf(m, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.default.igmpv3_unsolicited_report_interval=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.default.ignore_routes_with_linkdown=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); + seq_printf(m, "net.ipv4.conf.default.drop_unicast_in_l2_multicast=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); + seq_printf(m, "net.ipv4.conf.default.drop_gratuitous_arp=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); + seq_printf(m, "net.ipv4.conf.default.bc_forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_evict_nocarrier=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); + + seq_puts(m, "=== vm ===\n"); + seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count); + seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr); + seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr); + seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes); + 
seq_printf(m, "vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); + seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); + + return 0; +} + +static int sysctl_open(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + int r; + + if (!vkernel_get_vk_safe(vk)) + return -ENOENT; + + r = single_open(file, sysctl_show, inode->i_private); + if (r < 0) + vkernel_put_vk(vk); + + return r; +} + +static int sysctl_release(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + + vkernel_put_vk(vk); + + return single_release(inode, file); +} + +static ssize_t +sysctl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode; + struct vkernel *vk; + char buf[256]; + size_t ret; + + inode = file_inode(filp); + vk = inode->i_private; + + if (cnt > 255) + cnt = 255; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + pr_debug("sysctl write, vk %s, buf %s\n", vk->name, buf); + + ret = vkernel_set_sysctl_raw(vk, buf); + if (ret) + return ret; + + return cnt; +} + +static const struct file_operations vk_sysctl_fops = { + .open = sysctl_open, + .release = sysctl_release, + .read = seq_read, + .write = sysctl_write, + .llseek = seq_lseek, +}; + +static void vkernel_destroy_vk_debugfs(struct vkernel *vk) +{ + if (IS_ERR(vk->debugfs_dentry)) + return; + + debugfs_remove_recursive(vk->debugfs_dentry); +} + +static int vkernel_create_vk_debugfs(struct vkernel *vk, const char *name) +{ + static DEFINE_MUTEX(vkernel_debugfs_lock); + struct dentry *dent; + + if (!debugfs_initialized()) + return 0; + + mutex_lock(&vkernel_debugfs_lock); + dent = debugfs_lookup(name, vkernel_debugfs_dir); + if (dent) { + pr_warn_ratelimited("vkernel: debugfs: duplicate directory %s\n", name); + dput(dent); + mutex_unlock(&vkernel_debugfs_lock); + return 0; + } + + dent = debugfs_create_dir(name, vkernel_debugfs_dir); + 
mutex_unlock(&vkernel_debugfs_lock); + if (IS_ERR(dent)) + return 0; + + vk->debugfs_dentry = dent; + + debugfs_create_file("stat", 0444, dent, vk, &vk_stat_fops); + debugfs_create_file("sysctl", 0644, dent, vk, &vk_sysctl_fops); + + return 0; +} + +void vkernel_destroy_vk(struct vkernel *vk) +{ + pr_info("vkernel: destroy vk %s\n", vk->name); + + vk->active = false; + vkernel_unregister_vk(vk); + + mutex_lock(&vk_lock); +#ifdef CONFIG_DEBUG_LIST + list_del(&vk->link); +#else + if (vk->link.prev) + list_del(&vk->link); +#endif + mutex_unlock(&vk_lock); + + if (vk->custom->pre_destroy) + vk->custom->pre_destroy(vk); + if (vk->custom->owner != vkernel_chardev_ops.owner) + module_put(vk->custom->owner); + + vkernel_destroy_vk_debugfs(vk); + + vk_uninit_sysctl_vm(&vk->sysctl_vm); + vk_uninit_sysctl_net(&vk->sysctl_net); + vk_uninit_sysctl_kernel(&vk->sysctl_kernel); + vk_uninit_sysctl_fs(&vk->sysctl_fs); + vk_uninit_memory_pref(&vk->mem_pref); + vk_uninit_cpu_pref(&vk->cpu_pref); + vk_uninit_acl(&vk->acl); + vk_uninit_syscall(&vk->syscall); + kfree(vk); + module_put(vkernel_chardev_ops.owner); +} +EXPORT_SYMBOL(vkernel_destroy_vk); + +struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, + const char *custom) +{ + struct vkernel *vk; + int r = -ENOMEM; + + vk = kzalloc(sizeof(struct vkernel), GFP_KERNEL); + if (!vk) + return ERR_PTR(-ENOMEM); + + __module_get(vkernel_chardev_ops.owner); + + /* Init basic info */ + strscpy(vk->name, name, VKERNEL_NAME_LEN); + INIT_HLIST_NODE(&vk->hash); + vk->pid_ns = task_active_pid_ns(tsk); + vk->uts_ns = tsk->nsproxy->uts_ns; + vk->init_process = tsk; + vk->init_pid = tsk->pid; + refcount_set(&vk->users_count, 1); + + /* + * Force subsequent debugfs file creations to fail if the vk directory + * is not created (by vkernel_create_vk_debugfs()). 
+ */ + vk->debugfs_dentry = ERR_PTR(-ENOENT); + + /* Init syscall */ + r = vk_init_syscall(&vk->syscall); + if (r) + goto err_vk; + /* Init acl */ + r = vk_init_acl(&vk->acl, VKERNEL_ACL_HASH_BITS); + if (r) + goto err_syscall; + /* Init linux cap */ + vk->linux_cap.inheritable = tsk->cred->cap_inheritable; + vk->linux_cap.permitted = tsk->cred->cap_permitted; + vk->linux_cap.effective = tsk->cred->cap_effective; + vk->linux_cap.bset = tsk->cred->cap_bset; + vk->linux_cap.ambient = tsk->cred->cap_ambient; + + /* Init cpu preference */ + r = vk_init_cpu_pref(&vk->cpu_pref); + if (r) + goto err_acl; + /* Init memory preference */ + r = vk_init_memory_pref(&vk->mem_pref); + if (r) + goto err_cpu; + + /* Init extension cap */ + vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); + vk->log_ns = vk->pid_ns->ns.inum; + + /* Init sysctl */ + r = vk_init_sysctl_fs(&vk->sysctl_fs); + if (r) + goto err_mem; + r = vk_init_sysctl_kernel(&vk->sysctl_kernel); + if (r) + goto err_fs; + r = vk_init_sysctl_net(&vk->sysctl_net, tsk); + if (r) + goto err_kernel; + r = vk_init_sysctl_vm(&vk->sysctl_vm); + if (r) + goto err_net; + + /* Init default operations */ + vk->ops.cap_capable = vk_cap_capable; + vk->ops.generic_permission = vk_generic_permission; + + r = vkernel_create_vk_debugfs(vk, name); + if (r) + goto err_vm; + + /* Custom initializations */ + vk->custom = vkernel_find_custom(custom); + if (!vk->custom) + vk->custom = &default_custom; + if (vk->custom->owner != vkernel_chardev_ops.owner) + __module_get(vk->custom->owner); + if (vk->custom->post_create) { + r = vk->custom->post_create(vk); + if (r) + goto err_custom_debugfs; + } + + mutex_lock(&vk_lock); + list_add(&vk->link, &vk_list); + mutex_unlock(&vk_lock); + + /* Register vk into kernel. It is inactive state. 
*/ + vkernel_register_vk(vk); + + pr_info("vkernel: create vk %s, init %d, custom %s (expect %s)", + vk->name, vk->init_pid, vk->custom->name, custom); + + return vk; + +err_custom_debugfs: + if (vk->custom->owner != vkernel_chardev_ops.owner) + module_put(vk->custom->owner); + + vkernel_destroy_vk_debugfs(vk); +err_vm: + vk_uninit_sysctl_vm(&vk->sysctl_vm); +err_net: + vk_uninit_sysctl_net(&vk->sysctl_net); +err_kernel: + vk_uninit_sysctl_kernel(&vk->sysctl_kernel); +err_fs: + vk_uninit_sysctl_fs(&vk->sysctl_fs); +err_mem: + vk_uninit_memory_pref(&vk->mem_pref); +err_cpu: + vk_uninit_cpu_pref(&vk->cpu_pref); +err_acl: + vk_uninit_acl(&vk->acl); +err_syscall: + vk_uninit_syscall(&vk->syscall); +err_vk: + kfree(vk); + module_put(vkernel_chardev_ops.owner); + + return ERR_PTR(r); +} +EXPORT_SYMBOL(vkernel_create_vk); + +void vkernel_get_vk(struct vkernel *vk) +{ + refcount_inc(&vk->users_count); +} +EXPORT_SYMBOL(vkernel_get_vk); + +/* + * Make sure the vk is not during destruction, which is a safe version of + * vkernel_get_vk(). Return true if vk referenced successfully, false otherwise. + */ +bool vkernel_get_vk_safe(struct vkernel *vk) +{ + return refcount_inc_not_zero(&vk->users_count); +} +EXPORT_SYMBOL(vkernel_get_vk_safe); + +void vkernel_put_vk(struct vkernel *vk) +{ + if (refcount_dec_and_test(&vk->users_count)) + vkernel_destroy_vk(vk); +} +EXPORT_SYMBOL(vkernel_put_vk); + +/* + * Used to put a reference that was taken on behalf of an object associated + * with a user-visible file descriptor, e.g. a vcpu or device, if installation + * of the new file descriptor fails and the reference cannot be transferred to + * its final owner. In such cases, the caller is still actively using @vk and + * will fail miserably if the refcount unexpectedly hits zero. 
+ */ +void vkernel_put_vk_no_destroy(struct vkernel *vk) +{ + WARN_ON(refcount_dec_and_test(&vk->users_count)); +} +EXPORT_SYMBOL(vkernel_put_vk_no_destroy); + +static int vkernel_vk_release(struct inode *inode, struct file *filp) +{ + struct vkernel *vk = filp->private_data; + + pr_info("vkernel: release vk fd of %s. Currently, vk is still alive\n", vk->name); + + // vkernel_put_vk(vk); + return 0; +} + +static long vkernel_vk_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct vkernel *vk = filp->private_data; + int r = 0; + + switch (ioctl) { + case VKERNEL_SET_DEF_SYSCALL: + r = vkernel_vk_ioctl_set_def_syscall(vk, arg); + break; + case VKERNEL_RESTRICT_SYSCALL: + r = vkernel_vk_ioctl_restrict_syscall(vk, arg); + break; + case VKERNEL_RESTRICT_FILE: + r = vkernel_vk_ioctl_restrict_file(vk, arg); + break; + case VKERNEL_RESTRICT_LINUX_CAP: + r = vkernel_vk_ioctl_restrict_linux_cap(vk, arg); + break; + case VKERNEL_SET_CPU_PREF: + r = vkernel_vk_ioctl_set_cpu(vk, arg); + break; + case VKERNEL_SET_MEMORY_PREF: + r = vkernel_vk_ioctl_set_memory(vk, arg); + break; + case VKERNEL_SET_SYSCTL_FS: + r = vkernel_vk_ioctl_set_sysctl_fs(vk, arg); + break; + case VKERNEL_SET_SYSCTL_KERNEL: + r = vkernel_vk_ioctl_set_sysctl_kernel(vk, arg); + break; + case VKERNEL_SET_SYSCTL_NET: + r = vkernel_vk_ioctl_set_sysctl_net(vk, arg); + break; + case VKERNEL_SET_SYSCTL_VM: + r = vkernel_vk_ioctl_set_sysctl_vm(vk, arg); + break; + case VKERNEL_CHECK_EXTENSION: + r = vkernel_vk_ioctl_check_extension(vk, arg); + break; + case VKERNEL_ENABLE_CAP: + r = vkernel_vk_ioctl_enable_cap(vk, arg); + break; + case VKERNEL_REGISTER: + pr_warn("vkernel: [deprecated] register vk, init %d id %u ret %d\n", + vk->init_process->pid, vk->pid_ns->ns.inum, r); + break; + case VKERNEL_UNREGISTER: + pr_warn("vkernel: [deprecated] unregister vk, init %d id %u ret %d\n", + vk->init_process->pid, vk->pid_ns->ns.inum, r); + break; + case VKERNEL_ACTIVATE: + vk->active = true; + break; 
+	case VKERNEL_DEACTIVATE:
+		vk->active = false;
+		break;
+	default:
+		r = vkernel_arch_vk_ioctl(filp, ioctl, arg);
+	}
+
+	return r;
+}
+
+#ifdef CONFIG_VKERNEL_COMPAT
+/* Weak default: no architecture-specific compat commands are handled. */
+long __weak vkernel_arch_vk_compat_ioctl(struct file *filp, unsigned int ioctl,
+					 unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+/*
+ * compat ioctl handler of the per-vk file: give the architecture hook the
+ * first chance and fall back to the native handler when it answers -ENOTTY.
+ */
+static long vkernel_vk_compat_ioctl(struct file *filp,
+				    unsigned int ioctl, unsigned long arg)
+{
+	/* long, like the hook's return type, so the result is not truncated */
+	long r;
+
+	r = vkernel_arch_vk_compat_ioctl(filp, ioctl, arg);
+	if (r != -ENOTTY)
+		return r;
+
+	return vkernel_vk_ioctl(filp, ioctl, arg);
+}
+#endif
+
+static const struct file_operations vkernel_vk_fops = {
+	.release = vkernel_vk_release,
+	.unlocked_ioctl = vkernel_vk_ioctl,
+	.llseek = noop_llseek,
+	VKERNEL_COMPAT(vkernel_vk_compat_ioctl),
+};
+
+/*
+ * VKERNEL_CREATE_VK: create a vk for the task whose pid (looked up in the
+ * initial pid namespace) is given in the user-supplied struct vkernel_desc.
+ *
+ * Returns a new O_CLOEXEC file descriptor referring to the vk on success,
+ * -EFAULT if the descriptor cannot be copied in, -EINVAL if the pid does
+ * not resolve to a task, or the error reported by fd allocation,
+ * vkernel_create_vk() or anon_inode_getfile().
+ */
+static int vkernel_dev_ioctl_create_vk(unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_desc desc;
+	struct task_struct *tsk;
+	struct vkernel *vk;
+	struct file *file;
+	char fdname[ITOA_MAX_LEN * 2 + 2];
+	int r, fd;
+
+	if (copy_from_user(&desc, argp, sizeof(desc)))
+		return -EFAULT;
+
+	/*
+	 * find_pid_ns()/pid_task() must run inside an RCU read-side section.
+	 * The ioctl path does not hold one, so take it here instead of only
+	 * asserting that the caller did.
+	 * NOTE(review): the task is not pinned once the lock is dropped;
+	 * confirm that vkernel_create_vk() takes its own reference.
+	 */
+	rcu_read_lock();
+	tsk = pid_task(find_pid_ns(desc.pid, &init_pid_ns), PIDTYPE_PID);
+	rcu_read_unlock();
+	if (!tsk) {
+		pr_err("cannot find pid %d\n", desc.pid);
+		return -EINVAL;
+	}
+
+	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		pr_err("cannot get unused fd\n");
+		return fd;
+	}
+
+	/* the vk is named "<pid>-<fd>" */
+	snprintf(fdname, sizeof(fdname), "%d-%d", desc.pid, fd);
+
+	vk = vkernel_create_vk(tsk, fdname, desc.custom);
+	if (IS_ERR(vk)) {
+		r = PTR_ERR(vk);
+		goto put_fd;
+	}
+
+	file = anon_inode_getfile("vkernel-vk", &vkernel_vk_fops, vk, O_RDWR);
+	if (IS_ERR(file)) {
+		r = PTR_ERR(file);
+		goto put_kernel;
+	}
+
+	vkernel_uevent_notify_change(VKERNEL_EVENT_CREATE_VK, vk);
+
+	/* publish the fd last: after fd_install() it cannot be taken back */
+	fd_install(fd, file);
+	return fd;
+
+put_kernel:
+	vkernel_put_vk(vk);
+put_fd:
+	put_unused_fd(fd);
+	return r;
+}
+
+static int vkernel_dev_ioctl_destroy_vk(unsigned
long arg)
+{
+	struct vkernel *vk;
+	/* the ioctl argument is the vk id itself, not a user pointer */
+	unsigned int id = (unsigned int)arg;
+
+	pr_info("vkernel: try to destroy vk with id %u\n", id);
+
+	vk = vkernel_find_vk_by_id(id);
+	if (!vk)
+		return -EINVAL;
+
+	/* drop one reference on the vk that was found */
+	vkernel_put_vk(vk);
+	return 0;
+}
+
+/*
+ * ioctl handler of the vkernel misc device.
+ *
+ * GET_API_VERSION returns VKERNEL_API_VERSION and rejects a non-zero @arg
+ * with -EINVAL; the TRACE_* commands return -EOPNOTSUPP; CREATE_VK,
+ * DESTROY_VK and CHECK_EXTENSION return their helper's result; any other
+ * command is passed to vkernel_arch_dev_ioctl().
+ */
+static long vkernel_dev_ioctl(struct file *filp,
+			      unsigned int ioctl, unsigned long arg)
+{
+	int r = -EINVAL;
+
+	switch (ioctl) {
+	case VKERNEL_GET_API_VERSION:
+		/* leaves with the initial -EINVAL when an argument is passed */
+		if (arg)
+			goto out;
+		r = VKERNEL_API_VERSION;
+		break;
+	case VKERNEL_CREATE_VK:
+		r = vkernel_dev_ioctl_create_vk(arg);
+		break;
+	case VKERNEL_DESTROY_VK:
+		r = vkernel_dev_ioctl_destroy_vk(arg);
+		break;
+	case VKERNEL_CHECK_EXTENSION:
+		/* device-level query: no vk is associated with this file */
+		r = vkernel_vk_ioctl_check_extension(NULL, arg);
+		break;
+	case VKERNEL_TRACE_ENABLE:
+	case VKERNEL_TRACE_PAUSE:
+	case VKERNEL_TRACE_DISABLE:
+		r = -EOPNOTSUPP;
+		break;
+	default:
+		r = vkernel_arch_dev_ioctl(filp, ioctl, arg);
+	}
+out:
+	return r;
+}
+
+static const struct file_operations vkernel_chardev_ops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = vkernel_dev_ioctl,
+	.llseek = noop_llseek,
+	VKERNEL_COMPAT(vkernel_dev_ioctl),
+};
+
+/*
+ * Positional initializers: presumably minor, name and fops, in the member
+ * order of struct miscdevice - verify against <linux/miscdevice.h>.
+ */
+static struct miscdevice vkernel_dev = {
+	VKERNEL_MINOR,
+	"vkernel",
+	&vkernel_chardev_ops,
+};
+
+/*
+ * Update the created/active vk counters for a create or destroy event and
+ * send a KOBJ_CHANGE uevent on the misc device describing it. Does nothing
+ * when the device is not registered, @vk is NULL or the environment cannot
+ * be allocated (the counters are still updated in the last case).
+ */
+static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk)
+{
+	struct kobj_uevent_env *env;
+	unsigned long long created, active;
+
+	if (!vkernel_dev.this_device || !vk)
+		return;
+
+	/* update the counters and take a snapshot under event_lock */
+	mutex_lock(&event_lock);
+	if (type == VKERNEL_EVENT_CREATE_VK) {
+		vkernel_createvk_count++;
+		vkernel_active_vks++;
+	} else if (type == VKERNEL_EVENT_DESTROY_VK) {
+		vkernel_active_vks--;
+	}
+	created = vkernel_createvk_count;
+	active = vkernel_active_vks;
+	mutex_unlock(&event_lock);
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
+	if (!env)
+		return;
+
+	add_uevent_var(env, "CREATED=%llu", created);
+	add_uevent_var(env, "COUNT=%llu", active);
+
+	if (type == VKERNEL_EVENT_CREATE_VK)
+		add_uevent_var(env, "EVENT=create");
+	else if (type == VKERNEL_EVENT_DESTROY_VK)
+
add_uevent_var(env, "EVENT=destroy");
+	/* ns.inum is unsigned and is printed with %u everywhere else */
+	add_uevent_var(env, "VKID=%u", vk->pid_ns->ns.inum);
+
+	if (!IS_ERR(vk->debugfs_dentry)) {
+		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+
+		if (p) {
+			tmp = dentry_path_raw(vk->debugfs_dentry, p, PATH_MAX);
+			if (!IS_ERR(tmp))
+				add_uevent_var(env, "STATS_PATH=%s", tmp);
+			kfree(p);
+		}
+	}
+	/* no need for checks, since we are adding at most only 5 keys */
+	env->envp[env->envp_idx++] = NULL;
+	kobject_uevent_env(&vkernel_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
+	kfree(env);
+}
+
+/*
+ * Walk vk_list and drop one reference on every vk whose recorded init pid
+ * no longer resolves (in the initial pid namespace) to its recorded init
+ * task. Returns the number of vks handled that way.
+ */
+static int clear_zombie_vks(void)
+{
+	struct vkernel *vk;
+	struct vkernel *tmp;
+	struct task_struct *tsk;
+	int count = 0;
+
+	list_for_each_entry_safe(vk, tmp, &vk_list, link) {
+		/*
+		 * The pid lookup needs an RCU read-side section. The result
+		 * is only compared, never dereferenced, so the lock can be
+		 * dropped right after the lookup.
+		 */
+		rcu_read_lock();
+		tsk = pid_task(find_pid_ns(vk->init_pid, &init_pid_ns), PIDTYPE_PID);
+		rcu_read_unlock();
+		if (tsk != vk->init_process) {
+			if (refcount_read(&vk->users_count) > 1)
+				pr_err("vkernel: BUG! zombie vk %s has other refs, init %d custom %s\n",
+				       vk->name, vk->init_pid, vk->custom->name);
+			vkernel_put_vk(vk);
+			count++;
+		}
+	}
+
+	return count;
+}
+
+/* debugfs write handler: any written value triggers a zombie sweep. */
+static int clear_zombie_set(void *data, u64 val)
+{
+	int count;
+
+	count = clear_zombie_vks();
+	pr_info("cleared %d zombie vks\n", count);
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(clear_zombie_fops, NULL, clear_zombie_set,
+			 "%lld\n");
+
+/* Create the "vkernel" debugfs directory and its "clear_zombie" file. */
+static void vkernel_init_debug(void)
+{
+	vkernel_debugfs_dir = debugfs_create_dir("vkernel", NULL);
+
+	debugfs_create_file("clear_zombie", 0200, vkernel_debugfs_dir,
+			    NULL, &clear_zombie_fops);
+}
+
+/*
+ * Bring up the vkernel subsystems, debugfs entries and misc device, then
+ * register the built-in customs.
+ *
+ * Returns 0 on success, -1 when a subsystem fails to initialise (as
+ * before), or the misc_register() error. On failure everything set up so
+ * far is torn down in the reverse order, mirroring vkernel_exit().
+ */
+int vkernel_init(void)
+{
+	int ret = -1;
+
+	if (vk_kallsyms_init())
+		return ret;
+	if (vk_cap_init())
+		goto err_kallsyms;
+	if (vk_syscall_init())
+		goto err_cap;
+	if (vk_acl_init())
+		goto err_syscall;
+
+	vkernel_init_debug();
+
+	ret = misc_register(&vkernel_dev);
+	if (ret) {
+		pr_err("vkernel: misc device register failed\n");
+		goto err_debugfs;
+	}
+
+	vkernel_register_custom(&default_custom);
+	vkernel_register_custom(&analysis_custom);
+	pr_info("vkernel: load vkernel\n");
+
+	return 0;
+
+err_debugfs:
+	debugfs_remove_recursive(vkernel_debugfs_dir);
+	vk_acl_uninit();
+err_syscall:
+	vk_syscall_uninit();
+err_cap:
+	vk_cap_uninit();
+err_kallsyms:
+	vk_kallsyms_uninit();
+	return ret;
+}
+EXPORT_SYMBOL(vkernel_init);
+
+/*
+ * Tear vkernel down: sweep zombie vks, unregister the built-in customs,
+ * then remove the misc device, the debugfs tree and the subsystems in the
+ * reverse order of vkernel_init().
+ */
+void vkernel_exit(void)
+{
+	clear_zombie_vks();
+
+	pr_info("vkernel: unload vkernel\n");
+	vkernel_unregister_custom(&analysis_custom);
+	vkernel_unregister_custom(&default_custom);
+
+	misc_deregister(&vkernel_dev);
+
+	debugfs_remove_recursive(vkernel_debugfs_dir);
+
+	vk_acl_uninit();
+	vk_syscall_uninit();
+	vk_cap_uninit();
+	vk_kallsyms_uninit();
+}
+EXPORT_SYMBOL(vkernel_exit);
+
+module_init(vkernel_init);
+module_exit(vkernel_exit);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 299c295a27a03e86f5ae8c5e0e034de599758bc9..9db216c62c6426c9d50097c410e4ef98418cd1e5 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -24,6 +24,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #define DEVPTS_DEFAULT_MODE 0600 /* @@ -512,6 +515,15 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct pts_fs_info *fsi) { int index = -ENOSPC; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && atomic_inc_return(&vk->sysctl_kernel.pty_count) >= + (vk->sysctl_kernel.pty_limit - + (fsi->mount_opts.reserve ? 0 : vk->sysctl_kernel.pty_reserve))) + goto out; +#endif if (atomic_inc_return(&pty_count) >= (pty_limit - (fsi->mount_opts.reserve ? 
0 : pty_reserve))) @@ -521,13 +533,25 @@ int devpts_new_index(struct pts_fs_info *fsi) GFP_KERNEL); out: - if (index < 0) + if (index < 0) { +#ifdef CONFIG_VKERNEL + if (vk) + atomic_dec(&vk->sysctl_kernel.pty_count); +#endif atomic_dec(&pty_count); + } return index; } void devpts_kill_index(struct pts_fs_info *fsi, int idx) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + atomic_dec(&vk->sysctl_kernel.pty_count); +#endif ida_free(&fsi->allocated_ptys, idx); atomic_dec(&pty_count); } diff --git a/fs/exec.c b/fs/exec.c index 88d8e2e51c6a8d7e1ee65a6937d318eee4299258..fa8cd535575a900f4d68b4221c127cd7d275f8f3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -779,6 +782,9 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -803,6 +809,12 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (unlikely(stack_top < vk->sysctl_vm.mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - vk->sysctl_vm.mmap_min_addr))) + return -ENOMEM; +#endif if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; diff --git a/fs/file.c b/fs/file.c index d0c412f0dc55861eebcc486261a875f0561fd248..cd39d27bd6c6495123e15c76605bee4aa075fbe5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "internal.h" @@ -96,6 +99,9 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) struct fdtable *fdt; unsigned int nr; void *data; +#ifdef CONFIG_VKERNEL + 
struct vkernel *vk; +#endif /* * Figure out how many fds we actually want to support in this fdtable. @@ -123,6 +129,14 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && unlikely(nr > vk->sysctl_fs.nr_open)) { + nr = round_down(vk->sysctl_fs.nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } +#endif if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) @@ -224,6 +238,9 @@ static int expand_files(struct files_struct *files, unsigned int nr) { struct fdtable *fdt; int expanded = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif repeat: fdt = files_fdtable(files); @@ -233,6 +250,11 @@ static int expand_files(struct files_struct *files, unsigned int nr) return expanded; /* Can we expand? 
*/ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && nr >= vk->sysctl_fs.nr_open) + return -EMFILE; +#endif if (nr >= sysctl_nr_open) return -EMFILE; diff --git a/fs/file_table.c b/fs/file_table.c index a5a3a385f24c4f10aec11c1db04c5c071d8960ec..a0175354a1efea47033124f3655193fc27bd19f9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -29,6 +29,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -77,8 +80,16 @@ static inline void file_free(struct file *f) security_file_free(f); if (unlikely(f->f_mode & FMODE_BACKING)) path_put(backing_file_user_path(f)); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + percpu_counter_dec(&vk->sysctl_fs.nr_files); +#endif percpu_counter_dec(&nr_files); + } call_rcu(&f->f_rcuhead, file_free_rcu); } @@ -90,6 +101,13 @@ static long get_nr_files(void) return percpu_counter_read_positive(&nr_files); } +#ifdef CONFIG_VKERNEL +static long vk_get_nr_files(struct vkernel_sysctl_fs *fs) +{ + return percpu_counter_read_positive(&fs->nr_files); +} +#endif + /* * Return the maximum number of open files in the system */ @@ -190,7 +208,19 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) static long old_max; struct file *f; int error; - +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + struct vkernel_sysctl_fs *fs = NULL; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + fs = &vk->sysctl_fs; + if (vk_get_nr_files(fs) >= fs->files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (percpu_counter_sum_positive(&fs->nr_files) >= fs->files_stat.max_files) + goto over_vk; + } + } +#endif /* * Privileged users can go above max_files */ @@ -213,10 +243,22 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) return ERR_PTR(error); } +#ifdef CONFIG_VKERNEL + if (fs) + percpu_counter_inc(&fs->nr_files); +#endif 
percpu_counter_inc(&nr_files); return f; +#ifdef CONFIG_VKERNEL +over_vk: + /* Ran out of vk filps, fs cannot be NULL here */ + if (vk_get_nr_files(fs) > fs->old_max) { + pr_info("VFS: vkernel file-max limit %lu reached\n", fs->files_stat.max_files); + fs->old_max = vk_get_nr_files(fs); + } +#endif over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { diff --git a/fs/inode.c b/fs/inode.c index a2d6121d5e704d2090c2fa4ef5505cae98d89693..6777fc6bd4a1daf781f8d2f5d5885d75547c7b14 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include "internal.h" @@ -158,6 +161,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; @@ -231,6 +237,11 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (unlikely(security_inode_alloc(inode))) return -ENOMEM; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_inc(*vk->sysctl_fs.nr_inodes); +#endif this_cpu_inc(nr_inodes); return 0; @@ -281,6 +292,10 @@ static struct inode *alloc_inode(struct super_block *sb) void __destroy_inode(struct inode *inode) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif + BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); @@ -296,6 +311,11 @@ void __destroy_inode(struct inode *inode) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); +#endif +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_inodes); #endif this_cpu_dec(nr_inodes); } @@ -455,6 +475,10 @@ 
EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (atomic_read(&inode->i_count)) @@ -464,9 +488,14 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) { +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_inc(*vk->sysctl_fs.nr_unused); +#endif this_cpu_inc(nr_unused); - else if (rotate) + } else if (rotate) inode->i_state |= I_REFERENCED; } @@ -849,6 +878,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); +#endif /* * We are inverting the lru lock/inode->i_lock here, so use a @@ -868,6 +902,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); +#ifdef CONFIG_VKERNEL + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_unused); +#endif this_cpu_dec(nr_unused); return LRU_REMOVED; } @@ -907,6 +945,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); +#ifdef CONFIG_VKERNEL + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_unused); +#endif this_cpu_dec(nr_unused); return LRU_REMOVED; } diff --git a/fs/namei.c b/fs/namei.c index ff289f801501546ff718f2c7d346c0920726c7aa..0cb1592fe2c94ccb14ab8d10eea8a1f7041b9fc1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -41,6 +41,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "internal.h" #include 
"mount.h" @@ -401,6 +404,16 @@ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + ret = vk->ops.generic_permission(vk, idmap, inode, mask); + if (ret) + return ret; + } +#endif /* * Do the basic permission checks. diff --git a/fs/namespace.c b/fs/namespace.c index 87dd8efd868447cdb4cf0bdf5627363d76078bfd..c471b93c5afb0ba6dce12e687c66682c61f80f0a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "pnode.h" #include "internal.h" @@ -2220,6 +2223,13 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && ns->mounts > READ_ONCE(vk->sysctl_fs.mount_max)) + return -ENOSPC; +#endif if (ns->mounts >= max) return -ENOSPC; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d749981a35d94db71f1bd43ead5c36b7c008fe0f..2fae4223d24c5f7db5a46d779b8c0fd8a6ce8aca 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -18,6 +18,10 @@ #include #endif #include +#include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include #include "internal.h" @@ -35,6 +39,7 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; + unsigned long commit_limit; unsigned long committed; long cached; long available; @@ -47,6 +52,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); committed = vm_memory_committed(); + commit_limit = vm_commit_limit(); + +#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) + { + struct vkernel *vk; + struct mem_cgroup *memcg; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + memcg = 
mem_cgroup_from_task(current); + if (memcg) { + commit_limit = vk_vm_commit_limit(&vk->sysctl_vm, memcg); + css_put(&memcg->css); + } + } + } +#endif cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -127,7 +149,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); - show_val_kb(m, "CommitLimit: ", vm_commit_limit()); + show_val_kb(m, "CommitLimit: ", commit_limit); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e2d96e1e9e9f3977de67afc7c654af8cd2d631ca..3c324fbf5f0d560cef7c8148e9898230627cc4ec 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif static int sysctl_unprivileged_userfaultfd __read_mostly; @@ -1290,11 +1293,19 @@ static __always_inline int validate_unaligned_range( struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && start < vk->sysctl_vm.mmap_min_addr) + return -EINVAL; +#endif if (start < mmap_min_addr) return -EINVAL; if (start >= task_size) diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca507681215b2eda389ab9af0ce4aa52d6..dc8fb910c84d8746b2acd73ee619983e33d650ae 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -62,6 +62,7 @@ #define FUSE_MINOR 229 #define SNAPSHOT_MINOR 231 #define KVM_MINOR 232 +#define VKERNEL_MINOR 233 #define BTRFS_MINOR 234 #define AUTOFS_MINOR 235 #define MAPPER_CTRL_MINOR 236 diff --git a/include/linux/mman.h b/include/linux/mman.h index 
b2e2677ea156ac5eddc55b90f6a133a92249cfe6..301e3d6f17cbf2714c575d3640305b31be32e80a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -75,10 +75,14 @@ static inline void mm_compute_batch(int overcommit_policy) unsigned long vm_memory_committed(void); +#ifdef CONFIG_VKERNEL +void vm_acct_memory(long pages); +#else static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } +#endif static inline void vm_unacct_memory(long pages) { diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e98d9d14ac73b178b8afa1b9f3a37d0615571ae5 --- /dev/null +++ b/include/linux/vkernel.h @@ -0,0 +1,578 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#ifndef _LINUX_VKERNEL_H +#define _LINUX_VKERNEL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define VKERNEL_API_VERSION 1 + +#define VKERNEL_NAME_LEN 64 +#define VKERNEL_PATH_MAX 128 +#define VKERNEL_ACL_HASH_BITS 8 + +#define NOT_FOUND 0x8000 +#define IOP_VKERNEL_REG 0x8000 +#define IOP_VKERNEL_DIR 0x4000 + +/* Refer KVM */ +#define VKERNELIO 0xAF + +/* System/VK IOCTL list */ +#define VKERNEL_GET_API_VERSION _IO(VKERNELIO, 0x00) +#define VKERNEL_CREATE_VK _IO(VKERNELIO, 0x01) +#define VKERNEL_DESTROY_VK _IO(VKERNELIO, 0x02) +#define VKERNEL_CHECK_EXTENSION _IO(VKERNELIO, 0x03) +#define VKERNEL_TRACE_ENABLE _IO(VKERNELIO, 0x04) +#define VKERNEL_TRACE_PAUSE _IO(VKERNELIO, 0x05) +#define VKERNEL_TRACE_DISABLE _IO(VKERNELIO, 0x06) +#define VKERNEL_SET_DEF_SYSCALL _IO(VKERNELIO, 0x07) +#define VKERNEL_RESTRICT_SYSCALL _IO(VKERNELIO, 0x08) +#define VKERNEL_RESTRICT_FILE _IO(VKERNELIO, 0x09) +#define VKERNEL_RESTRICT_LINUX_CAP _IO(VKERNELIO, 0x0a) +#define VKERNEL_SET_CPU_PREF _IO(VKERNELIO, 0X0b) +#define VKERNEL_SET_MEMORY_PREF 
_IO(VKERNELIO, 0X0c) +#define VKERNEL_SET_SYSCTL_FS _IO(VKERNELIO, 0X0d) +#define VKERNEL_SET_SYSCTL_KERNEL _IO(VKERNELIO, 0x0e) +#define VKERNEL_SET_SYSCTL_NET _IO(VKERNELIO, 0x0f) +#define VKERNEL_SET_SYSCTL_VM _IO(VKERNELIO, 0x10) +#define VKERNEL_ENABLE_CAP _IO(VKERNELIO, 0x11) +#define VKERNEL_REGISTER _IO(VKERNELIO, 0x12) +#define VKERNEL_UNREGISTER _IO(VKERNELIO, 0x13) +#define VKERNEL_ACTIVATE _IO(VKERNELIO, 0x14) +#define VKERNEL_DEACTIVATE _IO(VKERNELIO, 0x15) + +/* syscall condition compare operations */ +#define VKERNEL_SYSCALL_CMP_ED 0 /* invalid op, means the end of conditions */ +#define VKERNEL_SYSCALL_CMP_EQ 1 /* equal, arg == val */ +#define VKERNEL_SYSCALL_CMP_NE 2 /* not equal, arg != val */ +#define VKERNEL_SYSCALL_CMP_LT 3 /* less than, arg < val */ +#define VKERNEL_SYSCALL_CMP_LE 4 /* less than or equal, arg <= val */ +#define VKERNEL_SYSCALL_CMP_GT 5 /* greater than, arg > val */ +#define VKERNEL_SYSCALL_CMP_GE 6 /* greater than or equal, arg >= val */ +#define VKRENEL_SYSCALL_CMP_ME 7 /* masked equal, arg & mask == val, mask is val1 */ + +/* syscall rule actions */ +#define VKERNEL_SYSCALL_ACT_INVALID 0 +#define VKERNEL_SYSCALL_ACT_KILL_PROCESS 1 +#define VKERNEL_SYSCALL_ACT_KILL_THREAD 2 +#define VKERNEL_SYSCALL_ACT_TRAP 3 +#define VKERNEL_SYSCALL_ACT_ERRNO 4 +#define VKERNEL_SYSCALL_ACT_USER_NOTIF 5 +#define VKERNEL_SYSCALL_ACT_TRACE 6 +#define VKERNEL_SYSCALL_ACT_LOG 7 +#define VKERNEL_SYSCALL_ACT_ALLOW 8 + +#define VKERNEL_SYSCALL_ACT_BITS 16 +#define VKERNEL_SYSCALL_ERRNO_BITS 16 +#define VKERNEL_SYSCALL_ERRNO_MASK ((1U << VKERNEL_SYSCALL_ERRNO_BITS) - 1) + +/* Extension capability list */ +#define VKERNEL_CAP_ISOLATE_LOG 0 +#define VKERNEL_CAP_ISOLATE_ANON 1 +#define VKERNEL_CAP_ISOLATE_ANON_PIPE 2 +#define VKERNEL_CAP_ISOLATE_RAMFS 3 +#define VKERNEL_CAP_NUM 4 + +#define current_vk_task get_current_syscall_task() +#define current_vk get_current_syscall_vk() + +#define vk_hugepage_flags_enabled(flags) \ + (flags & \ + ((1< #include 
#include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -755,6 +758,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + if ((shmflg & SHM_NORESERVE) && + vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; + } else if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; +#else /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. @@ -762,6 +777,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; +#endif file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); diff --git a/kernel/Makefile b/kernel/Makefile index 5c0e22393015843ec3fc100e6061bdfcf1c96932..b3fdabacdfd9e598a13b6b7276be15363f2376b5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o +obj-$(CONFIG_VKERNEL) += vkernel_hook.o ifdef CONFIG_FUNCTION_TRACER # Do not trace internal ftrace files diff --git a/kernel/exit.c b/kernel/exit.c index 12e102d110461c75e621c544c9cb3cba5f6a618d..5ae00a491a6ae768c8eaf9fba8e423eb6451a4b7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,9 @@ #ifdef CONFIG_TEXT_UNEVICTABLE #include #endif +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -136,6 +139,13 @@ late_initcall(kernel_exit_sysfs_init); static void __unhash_process(struct task_struct *p, bool group_dead) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + 
vk->sysctl_kernel.nr_threads--; +#endif nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index 823854ce0bbb9caaaa44df3fcec627ace99a4258..82bd2d357f7a392b2a68d378043900e0f0046ea9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif + #include #include #include @@ -2325,6 +2329,9 @@ __latent_entropy struct task_struct *copy_process( struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* * Don't allow sharing the root directory with processes in a different @@ -2462,6 +2469,12 @@ __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. */ retval = -EAGAIN; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + /* Vkernel: Check vkernel data race */ + if (vk && data_race(vk->sysctl_kernel.nr_threads >= vk->sysctl_kernel.max_threads)) + goto bad_fork_cleanup_count; +#endif if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; @@ -2802,6 +2815,10 @@ __latent_entropy struct task_struct *copy_process( &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); +#ifdef CONFIG_VKERNEL + if (vk) + vk->sysctl_kernel.nr_threads++; +#endif nr_threads++; } __this_cpu_inc(total_forks); @@ -3665,6 +3682,13 @@ int sysctl_max_threads(struct ctl_table *table, int write, int threads = max_threads; int min = 1; int max = MAX_THREADS; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + threads = vk->sysctl_kernel.max_threads; +#endif t = *table; t.data = &threads; @@ -3675,7 +3699,14 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; +#ifdef CONFIG_VKERNEL + if (vk) + vk->sysctl_kernel.max_threads = threads; + else + max_threads = threads; +#else max_threads = threads; +#endif return 0; } diff --git a/kernel/futex/syscalls.c 
b/kernel/futex/syscalls.c index 48feaa545b3c7f20d682fd57a399140910fd172f..3a2dae1b7bb0800be974fdd7fa812d824eac293c 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -3,6 +3,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "futex.h" @@ -115,6 +118,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && vk->syscall.do_futex) + return vk->syscall.do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); +#endif if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3f16cda43d31d134d059cb731d66da728341a724..bb80333aec2954b06233b120ab89c9894aafdf5b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -47,6 +47,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -1112,6 +1115,9 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; +#ifdef CONFIG_VKERNEL + dest_r.info->ns = r->info->ns; +#endif memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); @@ -2206,6 +2212,9 @@ int vprintk_store(int facility, int level, u16 text_len; int ret = 0; u64 ts_nsec; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; @@ -2293,6 +2302,11 @@ int vprintk_store(int facility, int level, r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; +#ifdef CONFIG_VKERNEL + /* Set log namespace (host can set any invalid value) */ + vk = vkernel_find_vk_by_task(current); + r.info->ns = vk ? 
vk->log_ns : 0; +#endif if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index fde338606ce83c1a013a270eac64dda039d4e677..c230231abb6927fa8741b258879f559ae7ebb67a 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -5,6 +5,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "printk_ringbuffer.h" /** @@ -1803,6 +1806,14 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct prb_desc desc; unsigned long id; int err; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + /* Skip record when reading log owned by other ns */ + vk = vkernel_find_vk_by_task(current); + if (vk && vk->log_ns != info->ns) + return -ENOENT; +#endif /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 18cd25e489b8935225c1ca9292c109af71941eb1..72d591b7842e84bd0914067d5019371641856de9 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -20,6 +20,9 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ +#ifdef CONFIG_VKERNEL + u64 ns; /* log namespace */ +#endif struct dev_printk_info dev_info; }; diff --git a/kernel/sys.c b/kernel/sys.c index 1f3ef34ec1903410b43362f09184aba7cc433706..280af2c2b73617d8ab9f7fc96fd1d467100cff94 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -64,6 +64,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -1562,6 +1565,9 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, { struct rlimit *rlim; int retval = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (resource >= RLIM_NLIMITS) return -EINVAL; @@ -1570,6 +1576,12 @@ static int do_prlimit(struct task_struct 
*tsk, unsigned int resource, if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && resource == RLIMIT_NOFILE && + new_rlim->rlim_max > vk->sysctl_fs.nr_open) + return -EPERM; +#endif if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; @@ -2051,6 +2063,9 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), @@ -2073,6 +2088,11 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (unsigned long)val < vk->sysctl_vm.mmap_min_addr) + goto out; +#endif if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; @@ -2258,6 +2278,9 @@ static int prctl_set_mm(int opt, unsigned long addr, }; struct vm_area_struct *vma; int error; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && @@ -2278,6 +2301,11 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && addr < vk->sysctl_vm.mmap_min_addr) + return -EINVAL; +#endif if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL;
diff --git a/kernel/vkernel_hook.c b/kernel/vkernel_hook.c
new file mode 100644
index 0000000000000000000000000000000000000000..7a438f251941c5025c7e1406a18543d1c80abd5b
--- /dev/null
+++ b/kernel/vkernel_hook.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Vkernel hook
+ *
+ * Vkernel policies are implemented as loadable module(s) and
+ * applied by hooks
+ *
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+
+/* Serializes updates of vkernel_ht and vk_cache; lookups run under RCU. */
+static DEFINE_MUTEX(vkernel_lock);
+static DEFINE_HASHTABLE(vkernel_ht, 6);
+
+/*
+ * Most recently registered vkernel, tried before the hash walk. A single
+ * RCU-published pointer is used so that a lockless reader can never pair
+ * the id of one vkernel with the pointer of another.
+ */
+static struct vkernel __rcu *vk_cache;
+
+DEFINE_PER_CPU(struct task_struct *, current_syscall_task);
+EXPORT_PER_CPU_SYMBOL(current_syscall_task);
+
+DEFINE_PER_CPU(struct vkernel *, current_syscall_vk);
+EXPORT_PER_CPU_SYMBOL(current_syscall_vk);
+
+/**
+ * vkernel_find_vk_by_id - look up a registered vkernel by namespace id
+ * @id: inode number of the pid namespace the vkernel was registered with
+ *
+ * The table is walked under rcu_read_lock(), so this may race safely with
+ * vkernel_register_vk() and vkernel_unregister_vk().
+ *
+ * NOTE(review): the result is returned after the RCU section ends; callers
+ * presumably rely on the owning module to keep the vkernel alive - confirm.
+ *
+ * Return: the vkernel registered for @id, or NULL if there is none.
+ */
+struct vkernel *vkernel_find_vk_by_id(unsigned int id)
+{
+	struct vkernel *vk, *found = NULL;
+
+	rcu_read_lock();
+	vk = rcu_dereference(vk_cache);
+	if (vk && vk->pid_ns->ns.inum == id) {
+		found = vk;
+	} else {
+		hash_for_each_possible_rcu(vkernel_ht, vk, hash, id) {
+			if (vk->pid_ns->ns.inum == id) {
+				found = vk;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL(vkernel_find_vk_by_id);
+
+/**
+ * vkernel_find_vk_by_task - find the active vkernel a task runs in
+ * @tsk: task whose active pid namespace selects the vkernel
+ *
+ * Return: the vkernel registered for @tsk's active pid namespace if it is
+ * marked active; NULL for tasks in the initial pid namespace, tasks without
+ * a pid namespace, and namespaces with no (active) vkernel.
+ */
+struct vkernel *vkernel_find_vk_by_task(struct task_struct *tsk)
+{
+	struct vkernel *vk;
+	struct pid_namespace *ns;
+
+	ns = task_active_pid_ns(tsk);
+	if (!ns || ns == &init_pid_ns)
+		return NULL;
+
+	vk = vkernel_find_vk_by_id(ns->ns.inum);
+	if (vk && vk->active)
+		return vk;
+
+	return NULL;
+}
+EXPORT_SYMBOL(vkernel_find_vk_by_task);
+
+/**
+ * vkernel_register_vk - make a vkernel visible to the lookup helpers
+ * @vk: vkernel to add, keyed by the inode number of @vk->pid_ns
+ *
+ * Return: 0 on success, -EEXIST if @vk is already hashed.
+ */
+int vkernel_register_vk(struct vkernel *vk)
+{
+	int ret = -EEXIST;
+
+	mutex_lock(&vkernel_lock);
+	/* Test under the lock so racing registrations cannot both add @vk */
+	if (hlist_unhashed(&vk->hash)) {
+		hash_add_rcu(vkernel_ht, &vk->hash, vk->pid_ns->ns.inum);
+		rcu_assign_pointer(vk_cache, vk);
+		ret = 0;
+	}
+	mutex_unlock(&vkernel_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vkernel_register_vk);
+
+/**
+ * vkernel_unregister_vk - remove a vkernel from the lookup table
+ * @vk: vkernel to remove; it does not have to be registered
+ *
+ * Waits for an RCU grace period so that no lookup is still walking over
+ * @vk when this returns. May sleep.
+ *
+ * Return: always 0.
+ */
+int vkernel_unregister_vk(struct vkernel *vk)
+{
+	mutex_lock(&vkernel_lock);
+	if (rcu_access_pointer(vk_cache) == vk)
+		RCU_INIT_POINTER(vk_cache, NULL);
+	/* It is also ok to remove an unhashed vk */
+	hash_del_rcu(&vk->hash);
+	mutex_unlock(&vkernel_lock);
+
+	synchronize_rcu();
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_unregister_vk);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index
234b6e2cba1cd8dbb51db8727ea17ef3d6a43958..e911652a435f4334feb872938d6d158a8bfadeae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,9 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include #include "internal.h" @@ -89,6 +92,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; unsigned long supported_orders; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) @@ -105,7 +116,11 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; +#ifdef CONFIG_VKERNEL + if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vm_flags)) +#else if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) +#endif return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -165,9 +180,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Enforce sysfs THP requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). 
*/ +#ifdef CONFIG_VKERNEL + if (enforce_sysfs && + (!vk_hugepage_flags_enabled(flags) || (!(vm_flags & VM_HUGEPAGE) && + !vk_hugepage_flags_always(flags)))) +#else if (enforce_sysfs && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) +#endif return 0; /* @@ -1237,23 +1258,33 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); + unsigned long *flags = &transparent_hugepage_flags; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = &vk->mem_pref.thp_flags; + + /* FIXME: should we both check global and local flags? */ +#endif /* Always do synchronous compaction */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 0); @@ -1281,6 +1312,10 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; +#endif if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; @@ -1288,9 +1323,19 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_OOM; khugepaged_enter_vma(vma, vma->vm_flags); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && +#ifdef CONFIG_VKERNEL + vk_transparent_hugepage_use_zero_page(flags)) { +#else transparent_hugepage_use_zero_page()) { +#endif pgtable_t pgtable; struct folio *zero_folio; vm_fault_t ret; diff --git a/mm/memory.c b/mm/memory.c index dc52184607ad7e61b7078dba9ac445fc175b55ec..c8f35e2f86765728bae9d84ff8d60a9bf6443f52 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,6 +79,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -5017,6 +5020,14 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif /* * It is too late to allocate a small folio, we already have a large @@ -5024,7 +5035,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. 
*/ +#ifdef CONFIG_VKERNEL + if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vma->vm_flags)) +#else if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) +#endif return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/mmap.c b/mm/mmap.c index 9090fffd770897de5c302f8b358c7f435a470682..566bef3d1fed4da91b6378f9824cdafb030497de 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -49,6 +49,10 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif + #include #include #include @@ -1160,7 +1164,16 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) */ static inline unsigned long round_hint_to_min(unsigned long hint) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif + hint &= PAGE_MASK; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && ((void *)hint != NULL) && (hint < vk->sysctl_vm.mmap_min_addr)) + hint = PAGE_ALIGN(vk->sysctl_vm.mmap_min_addr); +#endif if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); @@ -1227,6 +1240,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; int pkey = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif *populate = 0; @@ -1260,6 +1276,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count > vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count > task_active_pid_ns(current)->max_map_count) #else @@ -1390,13 +1411,21 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * memory use of this mapping. 
*/ if (flags & MAP_NORESERVE) { +#ifdef CONFIG_VKERNEL + if (vk) { + if (vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + } else if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; +#else /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; +#endif /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); @@ -1593,6 +1622,9 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas, &current->mm->mm_mt, 0, 0); @@ -1602,6 +1634,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) return -ENOMEM; low_limit = info->low_limit; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) + low_limit = vk->sysctl_vm.mmap_min_addr; +#endif if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; @@ -1645,6 +1682,9 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1653,6 +1693,11 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) return -ENOMEM; low_limit = info->low_limit; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) + low_limit = vk->sysctl_vm.mmap_min_addr; 
+#endif if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; @@ -1724,6 +1769,13 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) + return -ENOMEM; +#endif if (len > mmap_end - mmap_min_addr) return -ENOMEM; @@ -1735,6 +1787,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && +#ifdef CONFIG_VKERNEL + (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && +#endif (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -1772,6 +1827,13 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) + return -ENOMEM; +#endif /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) @@ -1785,6 +1847,9 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && +#ifdef CONFIG_VKERNEL + (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && +#endif (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2110,11 +2175,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; 
int error = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; address &= PAGE_MASK; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && address < vk->sysctl_vm.mmap_min_addr) + return -EPERM; +#endif if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; @@ -2504,6 +2577,13 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && vma->vm_mm->map_count >= vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (vma->vm_mm->map_count >= task_active_pid_ns(current)->max_map_count) #else @@ -2617,6 +2697,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; unsigned long locked_vm = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); @@ -2640,6 +2723,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. 
*/ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && end < vma->vm_end && mm->map_count >= vk->sysctl_vm.max_map_count) + goto map_count_exceeded; +#endif #ifdef CONFIG_PID_NS if (end < vma->vm_end && mm->map_count >= task_active_pid_ns(current)->max_map_count) #else @@ -3307,6 +3395,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, { struct mm_struct *mm = current->mm; struct vma_prepare vp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* * Check against address space limits by the changed size @@ -3316,6 +3407,11 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count > vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count > task_active_pid_ns(current)->max_map_count) #else diff --git a/mm/mremap.c b/mm/mremap.c index cd1c7670f5f6de36068582ace6c757e833d15595..a5c9f3810227fa8e5db07008a45113950d744eec 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -672,6 +675,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, int err = 0; bool need_rmap_locks; struct vma_iterator vmi; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count >= vk->sysctl_vm.max_map_count - 3) + return -ENOMEM; +#endif /* * We'd prefer to avoid failure later on in do_munmap: @@ -887,6 +897,9 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long map_flags = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (offset_in_page(new_addr)) goto out; @@ -912,6 +925,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, * Check whether current map 
count plus 2 still leads us to 4 maps below * the threshold, otherwise return -ENOMEM here to be more safe. */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (mm->map_count + 2) >= vk->sysctl_vm.max_map_count - 3) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if ((mm->map_count + 2) >= task_active_pid_ns(current)->max_map_count - 3) #else diff --git a/mm/nommu.c b/mm/nommu.c index 3479cae48ee1cf128fbd55c34d94c8371915913c..37ca77ba9570df27f7695f8bc3fca658d1ac5f74 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -34,6 +34,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -1318,6 +1321,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_region *region; unsigned long npages; struct mm_struct *mm; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1325,6 +1331,11 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count >= vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count >= task_active_pid_ns(current)->max_map_count) #else diff --git a/mm/shmem.c b/mm/shmem.c index 5341bf731d006ef42bcbe315cab49c563ab00862..2e414cc9742ed85240f727f0c53becf27c8a2255 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,6 +40,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #ifdef CONFIG_CGROUP_SLI #include #endif @@ -1761,13 +1764,24 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, bool global_huge; loff_t i_size; int order; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; +#endif if (vma && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return 0; /* If the 
hardware/firmware marked hugepage support disabled. */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; + if (vk_thp_disabled_by_hw(flags)) +#else if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) +#endif return 0; global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force, diff --git a/mm/util.c b/mm/util.c index 4525f46cca0a9206899e33f793a38b79bb2ec53f..4a098cbb7ede3a9131086ba66786b38cb12df546 100644 --- a/mm/util.c +++ b/mm/util.c @@ -24,6 +24,9 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include "internal.h" @@ -914,6 +917,28 @@ unsigned long vm_commit_limit(void) return allowed; } +#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) +unsigned long vk_vm_commit_limit(struct vkernel_sysctl_vm *vm, + struct mem_cgroup *memcg) +{ + unsigned long allowed; + struct mem_cgroup *iter; + unsigned long limit; + + if (vm->overcommit_kbytes) + allowed = vm->overcommit_kbytes >> (PAGE_SHIFT - 10); + else { + limit = totalram_pages() - hugetlb_total_pages(); + for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) + limit = min(limit, iter->memory.max); + allowed = (limit * vm->overcommit_ratio / 100); + } + allowed += min_t(unsigned long, total_swap_pages, memcg->swap.max); + + return allowed; +} +#endif + /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. 
@@ -935,10 +960,30 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; */ unsigned long vm_memory_committed(void) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + return percpu_counter_sum_positive(&vk->sysctl_vm.vm_committed_as); +#endif return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); +#ifdef CONFIG_VKERNEL +void vm_acct_memory(long pages) +{ + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + percpu_counter_add_batch(&vk->sysctl_vm.vm_committed_as, pages, + vk->sysctl_vm.as_batch); + percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); +} +#endif + /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to @@ -958,16 +1003,34 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; + int overcommit = sysctl_overcommit_memory; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg; + long memcg_allowed = 0; +#endif + + vk = vkernel_find_vk_by_task(current); + if (vk) { + overcommit = vk->sysctl_vm.overcommit_memory; +#ifdef CONFIG_MEMCG + memcg = mem_cgroup_from_task(current); + if (memcg) + memcg_allowed = vk_vm_commit_limit(&vk->sysctl_vm, memcg); +#endif + } +#endif vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + if (overcommit == OVERCOMMIT_ALWAYS) return 0; - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + if (overcommit == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; @@ -989,6 +1052,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= min_t(long, mm->total_vm / 32, reserve); } +#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) + if 
(vk && + percpu_counter_read_positive(&vk->sysctl_vm.vm_committed_as) < memcg_allowed) + return 0; +#endif + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: diff --git a/security/commoncap.c b/security/commoncap.c index 5a26b0d1ee9fa7c9a2d19767da15ad26afb972a9..2d95a9ebac4d739b53deb22368f800b5f600517a 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #ifdef CONFIG_CREDP #include #endif @@ -70,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + /* vkernel: check initial capability first */ + vk = vkernel_find_vk_by_task(current); + if (vk && vk->ops.cap_capable(vk, cred, targ_ns, cap, opts)) + return -EPERM; +#endif /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target