From f74999eddde06c5b27e992934df5399a85ea2c34 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 01/17] vk: introduce vkernel framework Initial version provides following features: * syscall isolation: hook do_syscall_x64 and do_futex to isolate syscalls. * cap enhancement: hook cap_capable to enhance capability protection. * file protection: hook generic_permission to customize inode protection. * log isolation: distinguish the owner of kernel log to isolate logs * param isolation: use independent sysctl params for each instance. Signed-off-by: Hang Huang --- arch/arm64/include/asm/vkernel.h | 25 ++++ arch/arm64/kernel/syscall.c | 21 +++ arch/x86/entry/common.c | 19 +++ arch/x86/include/asm/vkernel.h | 26 ++++ fs/namei.c | 13 ++ include/linux/miscdevice.h | 1 + include/linux/vkernel.h | 233 +++++++++++++++++++++++++++++++ init/Kconfig | 10 ++ kernel/Makefile | 1 + kernel/futex/syscalls.c | 10 ++ kernel/vkernel_hook.c | 92 ++++++++++++ security/commoncap.c | 11 ++ 12 files changed, 462 insertions(+) create mode 100644 arch/arm64/include/asm/vkernel.h create mode 100644 arch/x86/include/asm/vkernel.h create mode 100644 include/linux/vkernel.h create mode 100644 kernel/vkernel_hook.c diff --git a/arch/arm64/include/asm/vkernel.h b/arch/arm64/include/asm/vkernel.h new file mode 100644 index 000000000000..31feb6967075 --- /dev/null +++ b/arch/arm64/include/asm/vkernel.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen + */ + +#ifndef __ASM__VKERNEL_H +#define __ASM__VKERNEL_H + +#define sys_call_vk_t syscall_fn_t + +DECLARE_PER_CPU(struct task_struct *, current_syscall_task); +DECLARE_PER_CPU(struct vkernel *, current_syscall_vk); + +static __always_inline struct task_struct *get_current_syscall_task(void) +{ + return this_cpu_read_8(current_syscall_task); +} + +static __always_inline struct vkernel *get_current_syscall_vk(void) +{ + return this_cpu_read_8(current_syscall_vk); +} + +#endif diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index f090e39f69bc..ff0a13710f9e 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -7,6 +7,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -42,13 +45,31 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, const syscall_fn_t syscall_table[]) { long ret; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif add_random_kstack_offset(); if (scno < sc_nr) { syscall_fn_t syscall_fn; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (!vk) { + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; + ret = __invoke_syscall(regs, syscall_fn); + } else { + syscall_fn = (vk->syscall.table)[array_index_nospec(scno, sc_nr)]; + this_cpu_write(current_syscall_task, current); + this_cpu_write(current_syscall_vk, vk); + ret = __invoke_syscall(regs, syscall_fn); + this_cpu_write(current_syscall_vk, NULL); + this_cpu_write(current_syscall_task, NULL); + } +#else syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); +#endif } else { ret = do_ni_syscall(regs, scno); } diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 57d2bd04e5ed..31b93f33dc89 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #ifdef CONFIG_XEN_PV #include @@ -46,10 +49,26 @@ 
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) * numbers for comparisons. */ unsigned int unr = nr; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (!vk) + regs->ax = sys_call_table[unr](regs); + else { + this_cpu_write(current_syscall_task, current); + this_cpu_write(current_syscall_vk, vk); + regs->ax = (vk->syscall.table)[unr](regs); + this_cpu_write(current_syscall_vk, NULL); + this_cpu_write(current_syscall_task, NULL); + } +#else regs->ax = x64_sys_call(regs, unr); +#endif return true; } return false; diff --git a/arch/x86/include/asm/vkernel.h b/arch/x86/include/asm/vkernel.h new file mode 100644 index 000000000000..f46c3262e3c3 --- /dev/null +++ b/arch/x86/include/asm/vkernel.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#ifndef __ASM_X86_VKERNEL_H +#define __ASM_X86_VKERNEL_H + +#define sys_call_vk_t sys_call_ptr_t + +DECLARE_PER_CPU(struct task_struct *, current_syscall_task); +DECLARE_PER_CPU(struct vkernel *, current_syscall_vk); + +static __always_inline struct task_struct *get_current_syscall_task(void) +{ + return this_cpu_read_stable(current_syscall_task); +} + +static __always_inline struct vkernel *get_current_syscall_vk(void) +{ + return this_cpu_read_stable(current_syscall_vk); +} + + +#endif diff --git a/fs/namei.c b/fs/namei.c index ff289f801501..0cb1592fe2c9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -41,6 +41,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "internal.h" #include "mount.h" @@ -401,6 +404,16 @@ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + ret = vk->ops.generic_permission(vk, idmap, inode, mask); + 
if (ret) + return ret; + } +#endif /* * Do the basic permission checks. diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca5076..dc8fb910c84d 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -62,6 +62,7 @@ #define FUSE_MINOR 229 #define SNAPSHOT_MINOR 231 #define KVM_MINOR 232 +#define VKERNEL_MINOR 233 #define BTRFS_MINOR 234 #define AUTOFS_MINOR 235 #define MAPPER_CTRL_MINOR 236 diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h new file mode 100644 index 000000000000..4291d07c99e6 --- /dev/null +++ b/include/linux/vkernel.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#ifndef _LINUX_VKERNEL_H +#define _LINUX_VKERNEL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define VKERNEL_API_VERSION 1 + +#define VKERNEL_NAME_LEN 64 +#define VKERNEL_PATH_MAX 128 +#define VKERNEL_ACL_HASH_BITS 8 + +#define NOT_FOUND 0x8000 +#define IOP_VKERNEL_REG 0x8000 +#define IOP_VKERNEL_DIR 0x4000 + +/* Refer KVM */ +#define VKERNELIO 0xAF + +/* System/VK IOCTL list */ +#define VKERNEL_GET_API_VERSION _IO(VKERNELIO, 0x00) +#define VKERNEL_CREATE_VK _IO(VKERNELIO, 0x01) +#define VKERNEL_DESTROY_VK _IO(VKERNELIO, 0x02) +#define VKERNEL_CHECK_EXTENSION _IO(VKERNELIO, 0x03) +#define VKERNEL_TRACE_ENABLE _IO(VKERNELIO, 0x04) +#define VKERNEL_TRACE_PAUSE _IO(VKERNELIO, 0x05) +#define VKERNEL_TRACE_DISABLE _IO(VKERNELIO, 0x06) +#define VKERNEL_SET_DEF_SYSCALL _IO(VKERNELIO, 0x07) +#define VKERNEL_RESTRICT_SYSCALL _IO(VKERNELIO, 0x08) +#define VKERNEL_RESTRICT_FILE _IO(VKERNELIO, 0x09) +#define VKERNEL_RESTRICT_LINUX_CAP _IO(VKERNELIO, 0x0a) +#define VKERNEL_SET_CPU_PREF _IO(VKERNELIO, 0X0b) +#define VKERNEL_SET_MEMORY_PREF _IO(VKERNELIO, 0X0c) +#define VKERNEL_SET_SYSCTL_FS _IO(VKERNELIO, 0X0d) +#define VKERNEL_SET_SYSCTL_KERNEL _IO(VKERNELIO, 0x0e) 
+#define VKERNEL_SET_SYSCTL_NET _IO(VKERNELIO, 0x0f) +#define VKERNEL_SET_SYSCTL_VM _IO(VKERNELIO, 0x10) +#define VKERNEL_ENABLE_CAP _IO(VKERNELIO, 0x11) +#define VKERNEL_REGISTER _IO(VKERNELIO, 0x12) +#define VKERNEL_UNREGISTER _IO(VKERNELIO, 0x13) +#define VKERNEL_ACTIVATE _IO(VKERNELIO, 0x14) +#define VKERNEL_DEACTIVATE _IO(VKERNELIO, 0x15) + +/* syscall condition compare operations */ +#define VKERNEL_SYSCALL_CMP_ED 0 /* invalid op, means the end of conditions */ +#define VKERNEL_SYSCALL_CMP_EQ 1 /* equal, arg == val */ +#define VKERNEL_SYSCALL_CMP_NE 2 /* not equal, arg != val */ +#define VKERNEL_SYSCALL_CMP_LT 3 /* less than, arg < val */ +#define VKERNEL_SYSCALL_CMP_LE 4 /* less than or equal, arg <= val */ +#define VKERNEL_SYSCALL_CMP_GT 5 /* greater than, arg > val */ +#define VKERNEL_SYSCALL_CMP_GE 6 /* greater than or equal, arg >= val */ +#define VKRENEL_SYSCALL_CMP_ME 7 /* masked equal, (arg & mask) == val, mask is val1 */ + +/* syscall rule actions */ +#define VKERNEL_SYSCALL_ACT_INVALID 0 +#define VKERNEL_SYSCALL_ACT_KILL_PROCESS 1 +#define VKERNEL_SYSCALL_ACT_KILL_THREAD 2 +#define VKERNEL_SYSCALL_ACT_TRAP 3 +#define VKERNEL_SYSCALL_ACT_ERRNO 4 +#define VKERNEL_SYSCALL_ACT_USER_NOTIF 5 +#define VKERNEL_SYSCALL_ACT_TRACE 6 +#define VKERNEL_SYSCALL_ACT_LOG 7 +#define VKERNEL_SYSCALL_ACT_ALLOW 8 + +#define VKERNEL_SYSCALL_ACT_BITS 16 +#define VKERNEL_SYSCALL_ERRNO_BITS 16 +#define VKERNEL_SYSCALL_ERRNO_MASK ((1U << VKERNEL_SYSCALL_ERRNO_BITS) - 1) + +#define current_vk_task get_current_syscall_task() +#define current_vk get_current_syscall_vk() + +struct vkernel_desc { + char custom[VKERNEL_NAME_LEN]; + int pid; +}; + +struct vkernel_syscall_cond { + u16 index; /* argument index 0-5 */ + u16 op; /* compare operation */ + unsigned long oprand1; /* compared value */ + unsigned long oprand2; /* optional masked value */ +}; + +struct vkernel_syscall_rule_desc { + u32 nr; /* syscall nr */ + u32 act; /* action when conditions matched, [31:16]act|[15:0]errno */ 
struct vkernel_syscall_cond conds[6]; /* optional conditions */ +}; + +struct vkernel_syscall_rule { + struct list_head link; + u32 act; + struct vkernel_syscall_cond conds[6]; +}; + +struct vkernel_syscall { + sys_call_vk_t table[NR_syscalls + 1]; + struct list_head rule_chains[NR_syscalls + 1]; + long (*do_futex)(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3); + u32 def_act; /* default syscall rule action, [31:16]act|[15:0]errno */ +}; + +struct vkernel_file_desc { + char path[VKERNEL_PATH_MAX]; + u16 mode; +}; + +struct vkernel_file_desc_set { + u64 nr_descs; + struct vkernel_file_desc descs[]; +}; + +struct vkernel_acl_node { + struct hlist_node hash; + struct list_head link; + unsigned long ino; + struct super_block *sb; + char path[VKERNEL_PATH_MAX]; + unsigned short mode; +}; + +struct vkernel_acl { + struct hlist_head *ht; + int bits; + struct list_head nodes; + bool active; +}; + +struct vkernel_linux_cap { + kernel_cap_t inheritable; + kernel_cap_t permitted; + kernel_cap_t effective; + kernel_cap_t bset; + kernel_cap_t ambient; +}; + +struct vkernel; + +struct vkernel_ops { + int (*cap_capable)(struct vkernel *vk, const struct cred *cred, + struct user_namespace *targ_ns, int cap, unsigned int opts); + int (*generic_permission)(struct vkernel *vk, struct mnt_idmap *idmap, + struct inode *inode, int mask); +}; + +struct vkernel_custom_type { + struct module *owner; + struct hlist_node hash; + char name[VKERNEL_NAME_LEN]; + int (*post_create)(struct vkernel *vk); + void (*pre_destroy)(struct vkernel *vk); +}; + +struct vkernel { + /* basic */ + char name[VKERNEL_NAME_LEN]; + struct hlist_node hash; + struct list_head link; + struct pid_namespace *pid_ns; + struct uts_namespace *uts_ns; + struct task_struct *init_process; + int init_pid; + refcount_t users_count; + bool active; + + /* security */ + struct vkernel_syscall syscall; + struct vkernel_acl acl; + struct vkernel_linux_cap linux_cap; + + /* operation 
*/ + struct vkernel_ops ops; + + /* custom */ + struct vkernel_custom_type *custom; + void *private; + + /* debug */ + struct dentry *debugfs_dentry; +}; + +struct vkernel *vkernel_find_vk_by_id(unsigned int id); +struct vkernel *vkernel_find_vk_by_task(struct task_struct *tsk); +int vkernel_register_vk(struct vkernel *vk); +int vkernel_unregister_vk(struct vkernel *vk); + +struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, + const char *custom); +void vkernel_destroy_vk(struct vkernel *vk); + +void vkernel_get_vk(struct vkernel *vk); +bool vkernel_get_vk_safe(struct vkernel *vk); +void vkernel_put_vk(struct vkernel *vk); +void vkernel_put_vk_no_destroy(struct vkernel *vk); + +int vkernel_set_syscall(struct vkernel_syscall *syscall, unsigned int nr, + sys_call_vk_t call); +int vkernel_set_default_syscall_rule(struct vkernel_syscall *syscall, u32 act); +int vkernel_add_syscall_rule(struct vkernel_syscall *syscall, + struct vkernel_syscall_rule_desc *desc); + +int vkernel_set_acl(struct vkernel_acl *acl, char *path, unsigned short mode); +int vkernel_clear_acl(struct vkernel_acl *acl, char *path); +int vkernel_set_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set); +int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set); + +int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap); + +struct vkernel_custom_type *vkernel_find_custom(const char *name); +int vkernel_register_custom(struct vkernel_custom_type *custom); +int vkernel_unregister_custom(struct vkernel_custom_type *custom); + +#endif diff --git a/init/Kconfig b/init/Kconfig index 0142952d3d5f..279bee622a22 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1374,6 +1374,16 @@ config NET_NS Allow user space to create what appear to be multiple instances of the network stack. 
+config VKERNEL + bool "Virtual Kernel support" + def_bool n + depends on X86_64 || ARM64 + help + This option enables the kernel to support the vkernel system. + VKernel provides more fine-grained isolation and customization + for the containers, such as syscall isolation, file access + protection, capability enhancement, etc. + endif # NAMESPACES config NVIDIA_SMI_TRAP diff --git a/kernel/Makefile b/kernel/Makefile index 5c0e22393015..b3fdabacdfd9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o +obj-$(CONFIG_VKERNEL) += vkernel_hook.o ifdef CONFIG_FUNCTION_TRACER # Do not trace internal ftrace files diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 48feaa545b3c..3a2dae1b7bb0 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -3,6 +3,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "futex.h" @@ -115,6 +118,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && vk->syscall.do_futex) + return vk->syscall.do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); +#endif if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; diff --git a/kernel/vkernel_hook.c b/kernel/vkernel_hook.c new file mode 100644 index 000000000000..7a438f251941 --- /dev/null +++ b/kernel/vkernel_hook.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * Vkernel hook + * + * Vkernel policies are implemented as loadable module(s) and + * applied by hooks + * + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen + **/ + +#include + +static DEFINE_MUTEX(vkernel_lock); +static DEFINE_HASHTABLE(vkernel_ht, 6); + +/* id -> vk cache */ +static unsigned int id_cache; +static struct vkernel *vk_cache; + +DEFINE_PER_CPU(struct task_struct *, current_syscall_task); +EXPORT_PER_CPU_SYMBOL(current_syscall_task); + +DEFINE_PER_CPU(struct vkernel *, current_syscall_vk); +EXPORT_PER_CPU_SYMBOL(current_syscall_vk); + +struct vkernel *vkernel_find_vk_by_id(unsigned int id) +{ + struct vkernel *vk; + + if (id == id_cache) + return vk_cache; + + /* TODO: protect with rwlock? */ + hash_for_each_possible(vkernel_ht, vk, hash, id) { + if (id == vk->pid_ns->ns.inum) { + id_cache = vk->pid_ns->ns.inum; + vk_cache = vk; + return vk; + } + } + + return NULL; +} +EXPORT_SYMBOL(vkernel_find_vk_by_id); + +struct vkernel *vkernel_find_vk_by_task(struct task_struct *tsk) +{ + struct vkernel *vk; + struct pid_namespace *ns; + + ns = task_active_pid_ns(tsk); + if (!ns || ns == &init_pid_ns) + return NULL; + + vk = vkernel_find_vk_by_id(ns->ns.inum); + if (vk && vk->active) + return vk; + + return NULL; +} +EXPORT_SYMBOL(vkernel_find_vk_by_task); + +int vkernel_register_vk(struct vkernel *vk) +{ + if (!hlist_unhashed(&vk->hash)) + return -EEXIST; + + mutex_lock(&vkernel_lock); + hash_add(vkernel_ht, &vk->hash, vk->pid_ns->ns.inum); + mutex_unlock(&vkernel_lock); + id_cache = vk->pid_ns->ns.inum; + vk_cache = vk; + + return 0; +} +EXPORT_SYMBOL(vkernel_register_vk); + +int vkernel_unregister_vk(struct vkernel *vk) +{ + if (vk->pid_ns->ns.inum == id_cache) { + id_cache = 0; + vk_cache = NULL; + } + /* It is also ok to remove an unhashed vk */ + mutex_lock(&vkernel_lock); + hash_del(&vk->hash); + mutex_unlock(&vkernel_lock); + + return 0; +} +EXPORT_SYMBOL(vkernel_unregister_vk); diff --git a/security/commoncap.c b/security/commoncap.c index 5a26b0d1ee9f..2d95a9ebac4d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef 
CONFIG_VKERNEL +#include +#endif #ifdef CONFIG_CREDP #include #endif @@ -70,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + /* vkernel: check initial capability first */ + vk = vkernel_find_vk_by_task(current); + if (vk && vk->ops.cap_capable(vk, cred, targ_ns, cap, opts)) + return -EPERM; +#endif /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target -- Gitee From c5a0daf9ba20f93d15f1d24a7ff3a4cb8ca9f40a Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 02/17] vk: introduce vkernel data isolation Vkernel data isolation framework is designed to isolate important kernel/user data, such as log, perm data. Currently, only kernel log isolation is supported, which groups and filters kmsg logs according to log ns id. Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- include/linux/vkernel.h | 11 +++++++++++ kernel/printk/printk.c | 14 ++++++++++++++ kernel/printk/printk_ringbuffer.c | 11 +++++++++++ kernel/printk/printk_ringbuffer.h | 3 +++ 4 files changed, 39 insertions(+) diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index 4291d07c99e6..f7b971b2d422 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -85,6 +85,13 @@ #define VKERNEL_SYSCALL_ERRNO_BITS 16 #define VKERNEL_SYSCALL_ERRNO_MASK ((1U << VKERNEL_SYSCALL_ERRNO_BITS) - 1) +/* Extension capability list */ +#define VKERNEL_CAP_ISOLATE_LOG 0 +#define VKERNEL_CAP_ISOLATE_ANON 1 +#define VKERNEL_CAP_ISOLATE_ANON_PIPE 2 +#define VKERNEL_CAP_ISOLATE_RAMFS 3 +#define VKERNEL_CAP_NUM 4 + #define current_vk_task get_current_syscall_task() #define current_vk get_current_syscall_vk() @@ -188,6 +195,10 @@ struct vkernel { struct vkernel_acl acl; struct vkernel_linux_cap linux_cap; + /* extension caps */ + unsigned long 
caps; + unsigned int log_ns; + /* operation */ struct vkernel_ops ops; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3f16cda43d31..bb80333aec29 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -47,6 +47,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -1112,6 +1115,9 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; +#ifdef CONFIG_VKERNEL + dest_r.info->ns = r->info->ns; +#endif memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); @@ -2206,6 +2212,9 @@ int vprintk_store(int facility, int level, u16 text_len; int ret = 0; u64 ts_nsec; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; @@ -2293,6 +2302,11 @@ int vprintk_store(int facility, int level, r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; +#ifdef CONFIG_VKERNEL + /* Set log namespace (host can set any invalid value) */ + vk = vkernel_find_vk_by_task(current); + r.info->ns = vk ? 
vk->log_ns : 0; +#endif if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index fde338606ce8..c230231abb69 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -5,6 +5,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "printk_ringbuffer.h" /** @@ -1803,6 +1806,14 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct prb_desc desc; unsigned long id; int err; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + /* Skip record when reading log owned by other ns */ + vk = vkernel_find_vk_by_task(current); + if (vk && vk->log_ns != info->ns) + return -ENOENT; +#endif /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 18cd25e489b8..72d591b7842e 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -20,6 +20,9 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ +#ifdef CONFIG_VKERNEL + u64 ns; /* log namespace */ +#endif struct dev_printk_info dev_info; }; -- Gitee From 4aeebb849de54bb93a1daadd83e0f4287ed0c2af Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 03/17] vk: introduce fs sysctl customization Sysctl values in `/proc/sys/fs/` can be configured independently for each container. 
Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- fs/file.c | 22 ++++++++++++++++++++ fs/file_table.c | 46 +++++++++++++++++++++++++++++++++++++++-- fs/inode.c | 46 +++++++++++++++++++++++++++++++++++++++-- fs/namespace.c | 10 +++++++++ include/linux/vkernel.h | 30 +++++++++++++++++++++++++++ kernel/sys.c | 12 +++++++++++ 6 files changed, 162 insertions(+), 4 deletions(-) diff --git a/fs/file.c b/fs/file.c index d0c412f0dc55..cd39d27bd6c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "internal.h" @@ -96,6 +99,9 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) struct fdtable *fdt; unsigned int nr; void *data; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* * Figure out how many fds we actually want to support in this fdtable. @@ -123,6 +129,14 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && unlikely(nr > vk->sysctl_fs.nr_open)) { + nr = round_down(vk->sysctl_fs.nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } +#endif if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) @@ -224,6 +238,9 @@ static int expand_files(struct files_struct *files, unsigned int nr) { struct fdtable *fdt; int expanded = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif repeat: fdt = files_fdtable(files); @@ -233,6 +250,11 @@ static int expand_files(struct files_struct *files, unsigned int nr) return expanded; /* Can we expand? 
*/ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && nr >= vk->sysctl_fs.nr_open) + return -EMFILE; +#endif if (nr >= sysctl_nr_open) return -EMFILE; diff --git a/fs/file_table.c b/fs/file_table.c index a5a3a385f24c..a0175354a1ef 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -29,6 +29,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -77,8 +80,16 @@ static inline void file_free(struct file *f) security_file_free(f); if (unlikely(f->f_mode & FMODE_BACKING)) path_put(backing_file_user_path(f)); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + percpu_counter_dec(&vk->sysctl_fs.nr_files); +#endif percpu_counter_dec(&nr_files); + } call_rcu(&f->f_rcuhead, file_free_rcu); } @@ -90,6 +101,13 @@ static long get_nr_files(void) return percpu_counter_read_positive(&nr_files); } +#ifdef CONFIG_VKERNEL +static long vk_get_nr_files(struct vkernel_sysctl_fs *fs) +{ + return percpu_counter_read_positive(&fs->nr_files); +} +#endif + /* * Return the maximum number of open files in the system */ @@ -190,7 +208,19 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) static long old_max; struct file *f; int error; - +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + struct vkernel_sysctl_fs *fs = NULL; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + fs = &vk->sysctl_fs; + if (vk_get_nr_files(fs) >= fs->files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (percpu_counter_sum_positive(&fs->nr_files) >= fs->files_stat.max_files) + goto over_vk; + } + } +#endif /* * Privileged users can go above max_files */ @@ -213,10 +243,22 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) return ERR_PTR(error); } +#ifdef CONFIG_VKERNEL + if (fs) + percpu_counter_inc(&fs->nr_files); +#endif percpu_counter_inc(&nr_files); return f; +#ifdef 
CONFIG_VKERNEL +over_vk: + /* Ran out of vk filps, fs cannot be NULL here */ + if (vk_get_nr_files(fs) > fs->old_max) { + pr_info("VFS: vkernel file-max limit %lu reached\n", fs->files_stat.max_files); + fs->old_max = vk_get_nr_files(fs); + } +#endif over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { diff --git a/fs/inode.c b/fs/inode.c index a2d6121d5e70..6777fc6bd4a1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include "internal.h" @@ -158,6 +161,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; @@ -231,6 +237,11 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (unlikely(security_inode_alloc(inode))) return -ENOMEM; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_inc(*vk->sysctl_fs.nr_inodes); +#endif this_cpu_inc(nr_inodes); return 0; @@ -281,6 +292,10 @@ static struct inode *alloc_inode(struct super_block *sb) void __destroy_inode(struct inode *inode) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif + BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); @@ -296,6 +311,11 @@ void __destroy_inode(struct inode *inode) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); +#endif +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_inodes); #endif this_cpu_dec(nr_inodes); } @@ -455,6 +475,10 @@ EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { +#ifdef CONFIG_VKERNEL + struct 
vkernel *vk; +#endif + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (atomic_read(&inode->i_count)) @@ -464,9 +488,14 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) { +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + this_cpu_inc(*vk->sysctl_fs.nr_unused); +#endif this_cpu_inc(nr_unused); - else if (rotate) + } else if (rotate) inode->i_state |= I_REFERENCED; } @@ -849,6 +878,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); +#endif /* * We are inverting the lru lock/inode->i_lock here, so use a @@ -868,6 +902,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); +#ifdef CONFIG_VKERNEL + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_unused); +#endif this_cpu_dec(nr_unused); return LRU_REMOVED; } @@ -907,6 +945,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); +#ifdef CONFIG_VKERNEL + if (vk) + this_cpu_dec(*vk->sysctl_fs.nr_unused); +#endif this_cpu_dec(nr_unused); return LRU_REMOVED; } diff --git a/fs/namespace.c b/fs/namespace.c index 87dd8efd8684..c471b93c5afb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include "pnode.h" #include "internal.h" @@ -2220,6 +2223,13 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 
0; struct mount *p; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && ns->mounts > READ_ONCE(vk->sysctl_fs.mount_max)) + return -ENOSPC; +#endif if (ns->mounts >= max) return -ENOSPC; diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index f7b971b2d422..76895d064878 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -161,6 +161,31 @@ struct vkernel_linux_cap { kernel_cap_t ambient; }; +struct vkernel_sysctl_fs_desc { + u64 file_max; + u32 nr_open; + s32 lease_break_time; + s32 leases_enable; + u32 mount_max; +}; + +struct vkernel_sysctl_fs { + /* file */ + struct files_stat_struct files_stat; + unsigned int nr_open; + long old_max; + struct percpu_counter nr_files; + /* inode */ + struct inodes_stat_t inodes_stat; + unsigned long __percpu *nr_inodes; + unsigned long __percpu *nr_unused; + /* lease lock */ + int leases_enable; + int lease_break_time; + /* mount */ + unsigned int mount_max; +}; + struct vkernel; struct vkernel_ops { @@ -199,6 +224,9 @@ struct vkernel { unsigned long caps; unsigned int log_ns; + /* sysctl */ + struct vkernel_sysctl_fs sysctl_fs; + /* operation */ struct vkernel_ops ops; @@ -237,6 +265,8 @@ int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap); +int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc); + struct vkernel_custom_type *vkernel_find_custom(const char *name); int vkernel_register_custom(struct vkernel_custom_type *custom); int vkernel_unregister_custom(struct vkernel_custom_type *custom); diff --git a/kernel/sys.c b/kernel/sys.c index 1f3ef34ec190..ce3cc7e69cfc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -64,6 +64,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -1562,6 +1565,9 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, { struct rlimit 
*rlim; int retval = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (resource >= RLIM_NLIMITS) return -EINVAL; @@ -1570,6 +1576,12 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && resource == RLIMIT_NOFILE && + new_rlim->rlim_max > vk->sysctl_fs.nr_open) + return -EPERM; +#endif if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; -- Gitee From 01abc2babf67c2afb95c61a493120cf9e3e2ddbc Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 04/17] vk: introduce kernel sysctl customization Sysctl values in `/proc/sys/kernel/` can be configured independently for each container. Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- fs/devpts/inode.c | 26 +++++++++++++++- include/linux/vkernel.h | 67 +++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 10 ++++++ kernel/fork.c | 31 +++++++++++++++++++ 4 files changed, 133 insertions(+), 1 deletion(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 299c295a27a0..9db216c62c64 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -24,6 +24,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #define DEVPTS_DEFAULT_MODE 0600 /* @@ -512,6 +515,15 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct pts_fs_info *fsi) { int index = -ENOSPC; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && atomic_inc_return(&vk->sysctl_kernel.pty_count) >= + (vk->sysctl_kernel.pty_limit - + (fsi->mount_opts.reserve ? 0 : vk->sysctl_kernel.pty_reserve))) + goto out; +#endif if (atomic_inc_return(&pty_count) >= (pty_limit - (fsi->mount_opts.reserve ? 
0 : pty_reserve))) @@ -521,13 +533,25 @@ int devpts_new_index(struct pts_fs_info *fsi) GFP_KERNEL); out: - if (index < 0) + if (index < 0) { +#ifdef CONFIG_VKERNEL + if (vk) + atomic_dec(&vk->sysctl_kernel.pty_count); +#endif atomic_dec(&pty_count); + } return index; } void devpts_kill_index(struct pts_fs_info *fsi, int idx) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + atomic_dec(&vk->sysctl_kernel.pty_count); +#endif ida_free(&fsi->allocated_ptys, idx); atomic_dec(&pty_count); } diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index 76895d064878..dc95b500150e 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -169,6 +169,41 @@ struct vkernel_sysctl_fs_desc { u32 mount_max; }; +struct vkernel_sysctl_kernel_desc { + u32 msgmax; + u32 msgmnb; + u32 msgmni; + s32 msg_next_id; + s32 semmsl; + s32 semmns; + s32 semopm; + s32 semmni; + s32 sem_next_id; + u64 shmall; + u64 shmmax; + u64 shmmni; + s32 shm_next_id; + s32 shm_rmid_forced; + s32 numa_balancing; + s32 numa_balancing_promote_rate_limit; + u32 sched_cfs_bandwidth_slice; + u32 sched_child_runs_first; + u32 sched_dl_period_max; + u32 sched_dl_period_min; + s32 sched_rr_timeslice; + s32 sched_rt_period; + s32 sched_rt_runtime; + s32 max_threads; + u32 key_gc_delay; + u32 key_persistent_keyring_expiry; + u32 key_quota_maxbytes; + u32 key_quota_maxkeys; + u32 key_quota_root_maxbytes; + u32 key_quota_root_maxkeys; + s32 pty_limit; + s32 pty_reserve; +}; + struct vkernel_sysctl_fs { /* file */ struct files_stat_struct files_stat; @@ -186,6 +221,35 @@ struct vkernel_sysctl_fs { unsigned int mount_max; }; +struct vkernel_sysctl_kernel { + /* TODO: numa balancing, implemented at mem cgroup? */ + int nb_mode; + int nb_promote_rate_limit; + /* TODO: sched, implemented at cpu cgroup? 
*/ + unsigned int sched_cfs_bandwidth_slice; + unsigned int sched_child_runs_first; + unsigned int sched_dl_period_max; + unsigned int sched_dl_period_min; + /* NOTE: rt has influence on rcu */ + int sched_rr_timeslice; + int sched_rt_period; + int sched_rt_runtime; + /* thread */ + int nr_threads; + int max_threads; + /* security keys */ + unsigned int key_gc_delay; + unsigned int persistent_keyring_expiry; + unsigned int key_quota_root_maxbytes; + unsigned int key_quota_root_maxkeys; + unsigned int key_quota_maxbytes; + unsigned int key_quota_maxkeys; + /* pty */ + int pty_limit; + int pty_reserve; + atomic_t pty_count; +}; + struct vkernel; struct vkernel_ops { @@ -226,6 +290,7 @@ struct vkernel { /* sysctl */ struct vkernel_sysctl_fs sysctl_fs; + struct vkernel_sysctl_kernel sysctl_kernel; /* operation */ struct vkernel_ops ops; @@ -266,6 +331,8 @@ int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap); int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc); +int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, + struct vkernel_sysctl_kernel_desc *desc); struct vkernel_custom_type *vkernel_find_custom(const char *name); int vkernel_register_custom(struct vkernel_custom_type *custom); diff --git a/kernel/exit.c b/kernel/exit.c index 12e102d11046..5ae00a491a6a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,9 @@ #ifdef CONFIG_TEXT_UNEVICTABLE #include #endif +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -136,6 +139,13 @@ late_initcall(kernel_exit_sysfs_init); static void __unhash_process(struct task_struct *p, bool group_dead) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + vk->sysctl_kernel.nr_threads--; +#endif nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index
823854ce0bbb..82bd2d357f7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif + #include #include #include @@ -2325,6 +2329,9 @@ __latent_entropy struct task_struct *copy_process( struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* * Don't allow sharing the root directory with processes in a different @@ -2462,6 +2469,12 @@ __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. */ retval = -EAGAIN; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + /* Vkernel: Check vkernel data race */ + if (vk && data_race(vk->sysctl_kernel.nr_threads >= vk->sysctl_kernel.max_threads)) + goto bad_fork_cleanup_count; +#endif if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; @@ -2802,6 +2815,10 @@ __latent_entropy struct task_struct *copy_process( &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); +#ifdef CONFIG_VKERNEL + if (vk) + vk->sysctl_kernel.nr_threads++; +#endif nr_threads++; } __this_cpu_inc(total_forks); @@ -3665,6 +3682,13 @@ int sysctl_max_threads(struct ctl_table *table, int write, int threads = max_threads; int min = 1; int max = MAX_THREADS; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + threads = vk->sysctl_kernel.max_threads; +#endif t = *table; t.data = &threads; @@ -3675,7 +3699,14 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; +#ifdef CONFIG_VKERNEL + if (vk) + vk->sysctl_kernel.max_threads = threads; + else + max_threads = threads; +#else max_threads = threads; +#endif return 0; } -- Gitee From 7debf61533558eefc70e61dd7287d0c0aa2525d2 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 05/17] vk: introduce net sysctl customization Since some sysctl values in `/proc/sys/net/` 
are not net-ns specific, initial version does not add extra kernel paths. Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- include/linux/vkernel.h | 156 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index dc95b500150e..9758df3ffe74 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -204,6 +204,132 @@ struct vkernel_sysctl_kernel_desc { s32 pty_reserve; }; +struct vkernel_sysctl_net_desc { + u32 nf_conntrack_max; + u32 core_busy_poll; + u32 core_busy_read; + s32 core_dev_weight; + s32 core_netdev_budget; + s32 core_netdev_budget_us; + s32 core_netdev_max_backlog; + s32 core_optmem_max; + u32 core_wmem_max; + u32 core_rmem_max; + u32 core_wmem_default; + u32 core_rmem_default; + + /* net ns fields */ + + u32 core_somaxconn; + + u8 ipv4_icmp_echo_ignore_all; + u8 ipv4_icmp_echo_enable_probe; + u8 ipv4_icmp_echo_ignore_broadcasts; + u8 ipv4_icmp_ignore_bogus_error_responses; + u8 ipv4_icmp_errors_use_inbound_ifaddr; + u32 ipv4_icmp_ratelimit; + u32 ipv4_icmp_ratemask; + s32 ipv4_ip_local_port_range[2]; + s32 ipv4_max_tw_buckets; + u8 ipv4_tcp_ecn; + u8 ipv4_tcp_ecn_fallback; + u8 ipv4_ip_default_ttl; + u8 ipv4_ip_no_pmtu_disc; + u8 ipv4_ip_fwd_use_pmtu; + u8 ipv4_ip_fwd_update_priority; + u8 ipv4_ip_nonlocal_bind; + u8 ipv4_ip_autobind_reuse; + u8 ipv4_ip_dynaddr; + u8 ipv4_ip_early_demux; + u8 ipv4_tcp_early_demux; + u8 ipv4_udp_early_demux; + u8 ipv4_nexthop_compat_mode; + u8 ipv4_fwmark_reflect; + u8 ipv4_tcp_fwmark_accept; + u8 ipv4_tcp_mtu_probing; + s32 ipv4_tcp_mtu_probe_floor; + s32 ipv4_tcp_base_mss; + s32 ipv4_tcp_min_snd_mss; + s32 ipv4_tcp_probe_threshold; + u32 ipv4_tcp_probe_interval; + s32 ipv4_tcp_keepalive_time; + s32 ipv4_tcp_keepalive_intvl; + u8 ipv4_tcp_keepalive_probes; + u8 ipv4_tcp_syn_retries; + u8 ipv4_tcp_synack_retries; + u8 ipv4_tcp_syncookies; + u8 ipv4_tcp_migrate_req; + u8 ipv4_tcp_comp_sack_nr; + s32
ipv4_tcp_reordering; + u8 ipv4_tcp_retries1; + u8 ipv4_tcp_retries2; + u8 ipv4_tcp_orphan_retries; + u8 ipv4_tcp_tw_reuse; + s32 ipv4_tcp_fin_timeout; + u32 ipv4_tcp_notsent_lowat; + u8 ipv4_tcp_sack; + u8 ipv4_tcp_window_scaling; + u8 ipv4_tcp_timestamps; + u8 ipv4_tcp_early_retrans; + u8 ipv4_tcp_recovery; + u8 ipv4_tcp_thin_linear_timeouts; + u8 ipv4_tcp_slow_start_after_idle; + u8 ipv4_tcp_retrans_collapse; + u8 ipv4_tcp_stdurg; + u8 ipv4_tcp_rfc1337; + u8 ipv4_tcp_abort_on_overflow; + u8 ipv4_tcp_fack; + s32 ipv4_tcp_max_reordering; + s32 ipv4_tcp_adv_win_scale; + u8 ipv4_tcp_dsack; + u8 ipv4_tcp_app_win; + u8 ipv4_tcp_frto; + u8 ipv4_tcp_nometrics_save; + u8 ipv4_tcp_no_ssthresh_metrics_save; + u8 ipv4_tcp_moderate_rcvbuf; + u8 ipv4_tcp_tso_win_divisor; + u8 ipv4_tcp_workaround_signed_windows; + s32 ipv4_tcp_limit_output_bytes; + s32 ipv4_tcp_challenge_ack_limit; + s32 ipv4_tcp_min_rtt_wlen; + u8 ipv4_tcp_min_tso_segs; + u8 ipv4_tcp_tso_rtt_log; + u8 ipv4_tcp_autocorking; + u8 ipv4_tcp_reflect_tos; + s32 ipv4_tcp_invalid_ratelimit; + s32 ipv4_tcp_pacing_ss_ratio; + s32 ipv4_tcp_pacing_ca_ratio; + s32 ipv4_tcp_wmem[3]; + s32 ipv4_tcp_rmem[3]; + u32 ipv4_tcp_child_ehash_entries; + u64 ipv4_tcp_comp_sack_delay_ns; + u64 ipv4_tcp_comp_sack_slack_ns; + s32 ipv4_max_syn_backlog; + s32 ipv4_tcp_fastopen; + u32 ipv4_tcp_fastopen_blackhole_timeout; + char ipv4_tcp_congestion_control[TCP_CA_NAME_MAX]; + u8 ipv4_tcp_plb_enabled; + u8 ipv4_tcp_plb_idle_rehash_rounds; + u8 ipv4_tcp_plb_rehash_rounds; + u8 ipv4_tcp_plb_suspend_rto_sec; + s32 ipv4_tcp_plb_cong_thresh; + s32 ipv4_udp_wmem_min; + s32 ipv4_udp_rmem_min; + u8 ipv4_fib_notify_on_flag_change; + u8 ipv4_igmp_llm_reports; + s32 ipv4_igmp_max_memberships; + s32 ipv4_igmp_max_msf; + s32 ipv4_igmp_qrv; + u32 ipv4_fib_multipath_hash_fields; + u8 ipv4_fib_multipath_use_neigh; + u8 ipv4_fib_multipath_hash_policy; + + s32 ipv4_conf_all[IPV4_DEVCONF_MAX]; + s32 ipv4_conf_default[IPV4_DEVCONF_MAX]; + + s32 
unix_max_dgram_qlen; +}; + struct vkernel_sysctl_fs { /* file */ struct files_stat_struct files_stat; @@ -250,6 +376,34 @@ struct vkernel_sysctl_kernel { atomic_t pty_count; }; +struct vkernel_sysctl_net { + /* netns specific */ + unsigned int nf_conntrack_max; + /* core */ + unsigned int net_busy_poll; + unsigned int net_busy_read; + /* napi_struct specific, inactive (not netns specific) */ + int weight_p; + int dev_weight_rx_bias; + int dev_weight_tx_bias; + int dev_rx_weight; + int dev_tx_weight; + /* softnet_data specific, inactive (not netns specific) */ + int netdev_budget; + unsigned int netdev_budget_usecs; + int netdev_max_backlog; + /* sock specific (netns specific) */ + int optmem_max; + u32 wmem_max; + u32 rmem_max; + u32 wmem_default; + u32 rmem_default; + /* global (not netns specific) */ + // struct rps_sock_flow_table __rcu *rps_sock_flow_table; + /* netns core, ipv4, ipv4 conf, unix */ + struct net *net; +}; + struct vkernel; struct vkernel_ops { @@ -291,6 +445,7 @@ struct vkernel { /* sysctl */ struct vkernel_sysctl_fs sysctl_fs; struct vkernel_sysctl_kernel sysctl_kernel; + struct vkernel_sysctl_net sysctl_net; /* operation */ struct vkernel_ops ops; @@ -333,6 +488,7 @@ int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap); int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc); int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, struct vkernel_sysctl_kernel_desc *desc); +int vkernel_set_sysctl_net(struct vkernel_sysctl_net *net, struct vkernel_sysctl_net_desc *desc); struct vkernel_custom_type *vkernel_find_custom(const char *name); int vkernel_register_custom(struct vkernel_custom_type *custom); -- Gitee From beeefcd4677f8d14400f97469e2e22019e4708fb Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 06/17] vk: introduce vm sysctl customization Sysctl values in `/proc/sys/vm/` can be configured independently for each container. 
Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- fs/exec.c | 12 ++++++ fs/proc/meminfo.c | 24 ++++++++++- fs/userfaultfd.c | 11 +++++ include/linux/mman.h | 4 ++ include/linux/vkernel.h | 28 ++++++++++++ ipc/shm.c | 16 +++++++ kernel/sys.c | 16 +++++++ mm/mmap.c | 96 +++++++++++++++++++++++++++++++++++++++++ mm/mremap.c | 18 ++++++++ mm/nommu.c | 11 +++++ mm/util.c | 73 ++++++++++++++++++++++++++++++- 11 files changed, 306 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 88d8e2e51c6a..fa8cd535575a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -779,6 +782,9 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -803,6 +809,12 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (unlikely(stack_top < vk->sysctl_vm.mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - vk->sysctl_vm.mmap_min_addr))) + return -ENOMEM; +#endif if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d749981a35d9..2fae4223d24c 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -18,6 +18,10 @@ #include #endif #include +#include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include #include "internal.h" @@ -35,6 +39,7 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; + unsigned long commit_limit; unsigned long committed; long cached; long available; @@ -47,6 +52,23 @@ static int 
meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); committed = vm_memory_committed(); + commit_limit = vm_commit_limit(); + +#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) + { + struct vkernel *vk; + struct mem_cgroup *memcg; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + rcu_read_lock(); /* mem_cgroup_from_task() takes no css reference */ + memcg = mem_cgroup_from_task(current); + if (memcg) + commit_limit = vk_vm_commit_limit(&vk->sysctl_vm, memcg); + rcu_read_unlock(); + } + } +#endif cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -127,7 +149,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); - show_val_kb(m, "CommitLimit: ", vm_commit_limit()); + show_val_kb(m, "CommitLimit: ", commit_limit); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e2d96e1e9e9f..3c324fbf5f0d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif static int sysctl_unprivileged_userfaultfd __read_mostly; @@ -1290,11 +1293,19 @@ static __always_inline int validate_unaligned_range( struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && start < vk->sysctl_vm.mmap_min_addr) + return -EINVAL; +#endif if (start < mmap_min_addr) return -EINVAL; if (start >= task_size) diff --git a/include/linux/mman.h b/include/linux/mman.h index b2e2677ea156..301e3d6f17cb 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -75,10 +75,14 @@ static inline void mm_compute_batch(int overcommit_policy) unsigned long
vm_memory_committed(void); +#ifdef CONFIG_VKERNEL +void vm_acct_memory(long pages); +#else static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } +#endif static inline void vm_unacct_memory(long pages) { diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index 9758df3ffe74..92f76beaa039 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -330,6 +330,14 @@ struct vkernel_sysctl_net_desc { s32 unix_max_dgram_qlen; }; +struct vkernel_sysctl_vm_desc { + s32 max_map_count; + u64 mmap_min_addr; + s32 overcommit_memory; + s32 overcommit_ratio; + u64 overcommit_kbytes; +}; + struct vkernel_sysctl_fs { /* file */ struct files_stat_struct files_stat; @@ -404,6 +412,19 @@ struct vkernel_sysctl_net { struct net *net; }; +struct vkernel_sysctl_vm { + /* map */ + int max_map_count; + unsigned long mmap_min_addr; + unsigned long dac_mmap_min_addr; + /* overcommit */ + int overcommit_memory; + int overcommit_ratio; + unsigned long overcommit_kbytes; + struct percpu_counter vm_committed_as; + s32 as_batch; +}; + struct vkernel; struct vkernel_ops { @@ -446,6 +467,7 @@ struct vkernel { struct vkernel_sysctl_fs sysctl_fs; struct vkernel_sysctl_kernel sysctl_kernel; struct vkernel_sysctl_net sysctl_net; + struct vkernel_sysctl_vm sysctl_vm; /* operation */ struct vkernel_ops ops; @@ -458,6 +480,11 @@ struct vkernel { struct dentry *debugfs_dentry; }; +#ifdef CONFIG_MEMCG +unsigned long vk_vm_commit_limit(struct vkernel_sysctl_vm *vm, + struct mem_cgroup *memcg); +#endif + struct vkernel *vkernel_find_vk_by_id(unsigned int id); struct vkernel *vkernel_find_vk_by_task(struct task_struct *tsk); int vkernel_register_vk(struct vkernel *vk); @@ -489,6 +516,7 @@ int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, struct vkernel_sysctl_kernel_desc *desc); int vkernel_set_sysctl_net(struct 
vkernel_sysctl_net *net, struct vkernel_sysctl_net_desc *desc); +int vkernel_set_sysctl_vm(struct vkernel_sysctl_vm *vm, struct vkernel_sysctl_vm_desc *desc); struct vkernel_custom_type *vkernel_find_custom(const char *name); int vkernel_register_custom(struct vkernel_custom_type *custom); diff --git a/ipc/shm.c b/ipc/shm.c index bb017dd760f4..0bd1c537900d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -44,6 +44,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include @@ -755,6 +758,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) { + if ((shmflg & SHM_NORESERVE) && + vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; + } else if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; +#else /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. 
@@ -762,6 +777,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; +#endif file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); diff --git a/kernel/sys.c b/kernel/sys.c index ce3cc7e69cfc..280af2c2b736 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2063,6 +2063,9 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), @@ -2085,6 +2088,11 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (unsigned long)val < vk->sysctl_vm.mmap_min_addr) + goto out; +#endif if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; @@ -2270,6 +2278,9 @@ static int prctl_set_mm(int opt, unsigned long addr, }; struct vm_area_struct *vma; int error; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && @@ -2290,6 +2301,11 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && addr < vk->sysctl_vm.mmap_min_addr) + return -EINVAL; +#endif if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; diff --git a/mm/mmap.c b/mm/mmap.c index 9090fffd7708..566bef3d1fed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -49,6 +49,10 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif + #include #include #include @@ -1160,7 +1164,16 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct 
*vma) */ static inline unsigned long round_hint_to_min(unsigned long hint) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif + hint &= PAGE_MASK; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && ((void *)hint != NULL) && (hint < vk->sysctl_vm.mmap_min_addr)) + hint = PAGE_ALIGN(vk->sysctl_vm.mmap_min_addr); +#endif if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); @@ -1227,6 +1240,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; int pkey = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif *populate = 0; @@ -1260,6 +1276,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count > vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count > task_active_pid_ns(current)->max_map_count) #else @@ -1390,6 +1411,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * memory use of this mapping. 
*/ if (flags & MAP_NORESERVE) { +#ifdef CONFIG_VKERNEL + if (vk) { + if (vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + } else if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; +#else /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; @@ -1396,7 +1424,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, +#endif /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); @@ -1593,6 +1622,9 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas, &current->mm->mm_mt, 0, 0); @@ -1602,6 +1634,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) return -ENOMEM; low_limit = info->low_limit; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) + low_limit = vk->sysctl_vm.mmap_min_addr; +#endif if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; @@ -1645,6 +1682,9 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1653,6 +1693,11 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) return -ENOMEM; low_limit = info->low_limit; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) + low_limit = vk->sysctl_vm.mmap_min_addr;
+#endif if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; @@ -1724,6 +1769,13 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) + return -ENOMEM; +#endif if (len > mmap_end - mmap_min_addr) return -ENOMEM; @@ -1735,6 +1787,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && +#ifdef CONFIG_VKERNEL + (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && +#endif (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -1772,6 +1827,13 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) + return -ENOMEM; +#endif /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) @@ -1785,6 +1847,9 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && +#ifdef CONFIG_VKERNEL + (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && +#endif (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2110,11 +2175,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; 
int error = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; address &= PAGE_MASK; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && address < vk->sysctl_vm.mmap_min_addr) + return -EPERM; +#endif if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; @@ -2504,6 +2577,13 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && vma->vm_mm->map_count >= vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (vma->vm_mm->map_count >= task_active_pid_ns(current)->max_map_count) #else @@ -2617,6 +2697,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; unsigned long locked_vm = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); @@ -2640,6 +2723,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. 
*/ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && end < vma->vm_end && mm->map_count >= vk->sysctl_vm.max_map_count) + goto map_count_exceeded; +#endif #ifdef CONFIG_PID_NS if (end < vma->vm_end && mm->map_count >= task_active_pid_ns(current)->max_map_count) #else @@ -3307,6 +3395,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, { struct mm_struct *mm = current->mm; struct vma_prepare vp; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* * Check against address space limits by the changed size @@ -3316,6 +3407,11 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count > vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count > task_active_pid_ns(current)->max_map_count) #else diff --git a/mm/mremap.c b/mm/mremap.c index cd1c7670f5f6..a5c9f3810227 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -672,6 +675,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, int err = 0; bool need_rmap_locks; struct vma_iterator vmi; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count >= vk->sysctl_vm.max_map_count - 3) + return -ENOMEM; +#endif /* * We'd prefer to avoid failure later on in do_munmap: @@ -887,6 +897,9 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long map_flags = 0; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif if (offset_in_page(new_addr)) goto out; @@ -912,6 +925,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, * Check whether current map count plus 2 still leads us to 4 maps below * the threshold,
otherwise return -ENOMEM here to be more safe. */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && (mm->map_count + 2) >= vk->sysctl_vm.max_map_count - 3) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if ((mm->map_count + 2) >= task_active_pid_ns(current)->max_map_count - 3) #else diff --git a/mm/nommu.c b/mm/nommu.c index 3479cae48ee1..37ca77ba9570 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -34,6 +34,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -1318,6 +1321,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_region *region; unsigned long npages; struct mm_struct *mm; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; +#endif /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1325,6 +1331,11 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk && mm->map_count >= vk->sysctl_vm.max_map_count) + return -ENOMEM; +#endif #ifdef CONFIG_PID_NS if (mm->map_count >= task_active_pid_ns(current)->max_map_count) #else diff --git a/mm/util.c b/mm/util.c index 4525f46cca0a..4a098cbb7ede 100644 --- a/mm/util.c +++ b/mm/util.c @@ -24,6 +24,9 @@ #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include "internal.h" @@ -914,6 +917,28 @@ unsigned long vm_commit_limit(void) return allowed; } +#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) +unsigned long vk_vm_commit_limit(struct vkernel_sysctl_vm *vm, + struct mem_cgroup *memcg) +{ + unsigned long allowed; + struct mem_cgroup *iter; + unsigned long limit; + + if (vm->overcommit_kbytes) + allowed = vm->overcommit_kbytes >> (PAGE_SHIFT - 10); + else { + limit = totalram_pages() - hugetlb_total_pages(); + for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) + limit = min(limit, iter->memory.max); + allowed = 
(limit * vm->overcommit_ratio / 100); + } + allowed += min_t(unsigned long, total_swap_pages, memcg->swap.max); + + return allowed; +} +#endif + /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -935,10 +960,30 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; */ unsigned long vm_memory_committed(void) { +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + return percpu_counter_sum_positive(&vk->sysctl_vm.vm_committed_as); +#endif return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); +#ifdef CONFIG_VKERNEL +void vm_acct_memory(long pages) +{ + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + percpu_counter_add_batch(&vk->sysctl_vm.vm_committed_as, pages, + vk->sysctl_vm.as_batch); + percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); +} +#endif + /* * Check that a process has enough memory to allocate a new virtual * mapping. 
0 means there is enough memory for the allocation to
@@ -958,16 +1003,34 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	long allowed;
+	int overcommit = sysctl_overcommit_memory;
+#ifdef CONFIG_VKERNEL
+	struct vkernel *vk;
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+	long memcg_allowed = 0;	/* stays 0 when the task has no memcg, so the vkernel shortcut below is skipped */
+#endif
+
+	vk = vkernel_find_vk_by_task(current);
+	if (vk) {
+		overcommit = vk->sysctl_vm.overcommit_memory;
+#ifdef CONFIG_MEMCG
+		memcg = mem_cgroup_from_task(current);
+		if (memcg)
+			memcg_allowed = vk_vm_commit_limit(&vk->sysctl_vm, memcg);
+#endif
+	}
+#endif
 
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (overcommit == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+	if (overcommit == OVERCOMMIT_GUESS) {
 		if (pages > totalram_pages() + total_swap_pages)
 			goto error;
 		return 0;
@@ -989,6 +1052,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		allowed -= min_t(long, mm->total_vm / 32, reserve);
 	}
 
+#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG)
+	if (vk &&
+	    percpu_counter_read_positive(&vk->sysctl_vm.vm_committed_as) < memcg_allowed)
+		return 0;
+#endif
+
 	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
 error:
-- 
Gitee

From ebafcffffac453edfe985c8bf1cb37b27a107b5a Mon Sep 17 00:00:00 2001
From: Hang Huang
Date: Sun, 28 Jun 2026 20:13:55 +0800
Subject: [PATCH 07/17] vk: introduce cpu policy customization

Each sched group is allowed to configure independent inner sched policy,
such as FIFO, RR. There are rare known real-world workloads benefiting
from FIFO/RR policy. So, this version does not introduce these policies
in kernel.
Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- include/linux/vkernel.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index 92f76beaa039..e3729d4eb90d 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -161,6 +161,18 @@ struct vkernel_linux_cap { kernel_cap_t ambient; }; +struct vkernel_cpu_desc { + int policy; + long rr_timeslice_us; + long wakeup_gran_us; +}; + +struct vkernel_cpu_pref { + unsigned int policy; + unsigned long rr_timeslice_us; + unsigned long wakeup_gran_us; +}; + struct vkernel_sysctl_fs_desc { u64 file_max; u32 nr_open; @@ -463,6 +475,9 @@ struct vkernel { unsigned long caps; unsigned int log_ns; + /* resource */ + struct vkernel_cpu_pref cpu_pref; + /* sysctl */ struct vkernel_sysctl_fs sysctl_fs; struct vkernel_sysctl_kernel sysctl_kernel; @@ -512,6 +527,8 @@ int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap); +int vkernel_set_cpu_pref(struct vkernel *vk, struct vkernel_cpu_desc *desc); + int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc); int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, struct vkernel_sysctl_kernel_desc *desc); -- Gitee From 1a7f513eb51e9e0a6d5ce68ccd0f49d672aa353f Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 08/17] vk: introduce mem policy customization Sys items in `/sys/kernel/mm/` can be configured independently for each container. Initial version only supports THP-related items. 
Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- include/linux/vkernel.h | 36 ++++++++++++++++++++++++++++ mm/huge_memory.c | 53 +++++++++++++++++++++++++++++++++++++---- mm/memory.c | 15 ++++++++++++ mm/shmem.c | 14 +++++++++++ 4 files changed, 114 insertions(+), 4 deletions(-) diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h index e3729d4eb90d..e98d9d14ac73 100644 --- a/include/linux/vkernel.h +++ b/include/linux/vkernel.h @@ -95,6 +95,19 @@ #define current_vk_task get_current_syscall_task() #define current_vk get_current_syscall_vk() +#define vk_hugepage_flags_enabled(flags) \ + (flags & \ + ((1< #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include #include "internal.h" @@ -89,6 +92,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; unsigned long supported_orders; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) @@ -105,7 +116,11 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; +#ifdef CONFIG_VKERNEL + if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vm_flags)) +#else if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) +#endif return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -165,9 +180,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Enforce sysfs THP requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). 
*/ +#ifdef CONFIG_VKERNEL + if (enforce_sysfs && + (!vk_hugepage_flags_enabled(flags) || (!(vm_flags & VM_HUGEPAGE) && + !vk_hugepage_flags_always(flags)))) +#else if (enforce_sysfs && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) +#endif return 0; /* @@ -1237,23 +1258,33 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); + unsigned long *flags = &transparent_hugepage_flags; +#ifdef CONFIG_VKERNEL + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = &vk->mem_pref.thp_flags; + + /* FIXME: should we both check global and local flags? */ +#endif /* Always do synchronous compaction */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 0); @@ -1281,6 +1312,10 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; +#endif if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; @@ -1288,9 +1323,19 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_OOM; khugepaged_enter_vma(vma, vma->vm_flags); +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && +#ifdef CONFIG_VKERNEL + vk_transparent_hugepage_use_zero_page(flags)) { +#else transparent_hugepage_use_zero_page()) { +#endif pgtable_t pgtable; struct folio *zero_folio; vm_fault_t ret; diff --git a/mm/memory.c b/mm/memory.c index dc52184607ad..c8f35e2f8676 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,6 +79,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #include #include @@ -5017,6 +5020,14 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; + + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; +#endif /* * It is too late to allocate a small folio, we already have a large @@ -5024,7 +5035,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. 
*/ +#ifdef CONFIG_VKERNEL + if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vma->vm_flags)) +#else if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) +#endif return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index 5341bf731d00..2e414cc9742e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,6 +40,9 @@ #include #include #include +#ifdef CONFIG_VKERNEL +#include +#endif #ifdef CONFIG_CGROUP_SLI #include #endif @@ -1761,13 +1764,24 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, bool global_huge; loff_t i_size; int order; +#ifdef CONFIG_VKERNEL + unsigned long flags = transparent_hugepage_flags; + struct vkernel *vk; +#endif if (vma && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return 0; /* If the hardware/firmware marked hugepage support disabled. */ +#ifdef CONFIG_VKERNEL + vk = vkernel_find_vk_by_task(current); + if (vk) + flags = vk->mem_pref.thp_flags; + if (vk_thp_disabled_by_hw(flags)) +#else if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) +#endif return 0; global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force, -- Gitee From e5f9032fed194e53c39c5b2a0bdd4ca9f1fc85bb Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 09/17] vk: implement vkernel framework as a module Initial version includes custom policies for syscall, file and cap. Custom modules plugins are also supported. 
Signed-off-by: chenwei1266 Signed-off-by: Hang Huang --- drivers/Makefile | 2 + drivers/vkernel/Makefile | 8 + drivers/vkernel/fs/acl.c | 367 ++++++++++++ drivers/vkernel/include/fs.h | 18 + drivers/vkernel/include/security.h | 15 + drivers/vkernel/include/syscall.h | 24 + drivers/vkernel/include/utils.h | 11 + drivers/vkernel/security/capability.c | 111 ++++ drivers/vkernel/syscall.c | 636 ++++++++++++++++++++ drivers/vkernel/utils/kallsyms.c | 63 ++ drivers/vkernel/vkernel_main.c | 823 ++++++++++++++++++++++++++ init/Kconfig | 10 + 12 files changed, 2088 insertions(+) create mode 100644 drivers/vkernel/Makefile create mode 100644 drivers/vkernel/fs/acl.c create mode 100644 drivers/vkernel/include/fs.h create mode 100644 drivers/vkernel/include/security.h create mode 100644 drivers/vkernel/include/syscall.h create mode 100644 drivers/vkernel/include/utils.h create mode 100644 drivers/vkernel/security/capability.c create mode 100644 drivers/vkernel/syscall.c create mode 100644 drivers/vkernel/utils/kallsyms.c create mode 100644 drivers/vkernel/vkernel_main.c diff --git a/drivers/Makefile b/drivers/Makefile index f36e00dfd1bd..3f87c2905025 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -202,3 +202,5 @@ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_S390) += s390/ + +obj-$(CONFIG_VKERNEL_DRIVER) += vkernel/ diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile new file mode 100644 index 000000000000..fba34460dfb7 --- /dev/null +++ b/drivers/vkernel/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_VKERNEL_DRIVER) += vkernel.o + +ccflags-y := -I$(srctree)/drivers/vkernel/include + +vkernel-y := vkernel_main.o syscall.o +vkernel-y += fs/acl.o +vkernel-y += security/capability.o +vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/fs/acl.c b/drivers/vkernel/fs/acl.c new file mode 100644 index 000000000000..8205cc3ef6f7 --- /dev/null +++ b/drivers/vkernel/fs/acl.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include + +#include "fs.h" + +static char *def_path[] = { + /* open, access, append, read, exec */ + "/proc/sys/abi", + "/proc/sys/debug", + "/proc/sys/dev", + "/proc/sys/fs", + "/proc/sys/net", + "/proc/sys/user", + "/proc/sys/vm", + /* open, read, exec */ + "/sys/kernel", + "/sys/power", + "/sys/class", + "/sys/devices", + "/sys/dev", + "/sys/bus", + "/sys/block", + "/sys/module", + "/sys/firmware", + "/sys/fs/pstore", + "/sys/fs/bpf", + "/sys/fs/fuse", + "/sys/fs/ext4", + /* open */ + "/proc/sysrq-trigger", + "/sys/kernel/security", + /* nop */ + "/sys/fs/cgroup", + "/dev/vkernel", +}; + +static unsigned short def_mode[] = { + 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x8025, + 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, + 0x8025, 0x8024, 0x8024, 0x8024, 0x8024, 0x8020, 0x8020, 0x0000, + 0x0000, +}; + +static struct kmem_cache *acl_node_cache; + +int vk_acl_init(void) +{ + acl_node_cache = kmem_cache_create("vkernel_acl_node", + sizeof(struct vkernel_acl_node), 0, SLAB_ACCOUNT, NULL); + if (!acl_node_cache) { + pr_err("failed to create slab for acl node\n"); + return -ENOMEM; + } + + return 0; +} + +void vk_acl_uninit(void) +{ + kmem_cache_destroy(acl_node_cache); +} + +int vk_init_acl(struct vkernel_acl *acl, unsigned int bits) +{ + + acl->ht = kcalloc( + 1UL << bits, sizeof(struct hlist_head), GFP_KERNEL); + if (!acl->ht) + return -ENOMEM; + + acl->bits = bits; + INIT_LIST_HEAD(&acl->nodes); + acl->active = false; + + return 0; +} + +void vk_uninit_acl(struct vkernel_acl *acl) +{ + struct hlist_head *ht = acl->ht; + struct vkernel_acl_node *node; + struct vkernel_acl_node *tmp; + + if (!acl->ht || !acl->bits) + return; + + acl->active = false; + list_for_each_entry_safe(node, tmp, &acl->nodes, link) { + if (!hlist_unhashed(&node->hash)) + hlist_del(&node->hash); + list_del(&node->link); + kmem_cache_free(acl_node_cache, node); + } + INIT_LIST_HEAD(&acl->nodes); + + 
acl->bits = 0; + kfree(ht); +} + +/* inode hash, copy from inode.c */ +static unsigned long inode_hash(struct inode *inode, unsigned long shift) +{ + struct super_block *sb = inode->i_sb; + unsigned long hashval = inode->i_ino; + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> shift); + return tmp; +} + +static struct vkernel_acl_node *vk_acl_node_get(struct vkernel_acl *acl, struct inode *inode) +{ + struct hlist_head *ht = acl->ht; + struct vkernel_acl_node *node; + unsigned long key = inode_hash(inode, acl->bits); + + hlist_for_each_entry(node, &ht[hash_min(key, acl->bits)], hash) { + if (inode->i_ino == node->ino && inode->i_sb == node->sb) + return node; + } + + return NULL; +} + +static int vk_acl_node_del(struct vkernel_acl *acl, struct inode *inode) +{ + struct vkernel_acl_node *node; + + node = vk_acl_node_get(acl, inode); + if (!node) + return -1; + + hlist_del(&node->hash); + node->ino = 0; + node->sb = NULL; + + return 0; +} + +static int vk_acl_node_add(struct vkernel_acl *acl, struct inode *inode, + struct vkernel_acl_node *node) +{ + struct hlist_head *ht = acl->ht; + unsigned long key = inode_hash(inode, acl->bits); + + /* Remove old rule if exists */ + vk_acl_node_del(acl, inode); + + node->ino = inode->i_ino; + node->sb = inode->i_sb; + hlist_add_head(&node->hash, &ht[hash_min(key, acl->bits)]); + + return 0; +} + +/* + * Inode from file->f_inode may be destroyed at following access + * Using kern_path is also unstable, is there a better way? 
+ */ +static struct inode *kern_path_to_inode(const char *filename) +{ + struct path path; + struct inode *inode; + int ret; + + ret = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_OPEN, &path); + if (ret) + return NULL; + + inode = path.dentry->d_inode; + path_put(&path); + + return inode; +} + +static int vk_activate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node) +{ + struct inode *inode; + + inode = kern_path_to_inode(node->path); + if (!inode) { + pr_warn("vkernel: cannot set cal, no such file or directory %s\n", node->path); + return 0; + } + + if (!vk_acl_node_add(acl, inode, node)) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_VKERNEL_DIR; + else + inode->i_opflags |= IOP_VKERNEL_REG; + } + + pr_debug("activate acl, path %s mode 0x%x ino %lu\n", node->path, node->mode, inode->i_ino); + + return 0; +} + +int vk_deactivate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node) +{ + struct inode *inode; + + inode = kern_path_to_inode(node->path); + if (!inode) + return -EINVAL; + + if (!vk_acl_node_del(acl, inode)) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags &= ~IOP_VKERNEL_DIR; + else + inode->i_opflags &= ~IOP_VKERNEL_REG; + } + + return 0; +} + +static void vk_activate_acl_all(struct vkernel_acl *acl) +{ + static DEFINE_MUTEX(vk_activate_lock); + struct vkernel_acl_node *node; + + /* Failure on trylock means someone is doing this job */ + if (!mutex_trylock(&vk_activate_lock)) + return; + + acl->active = true; + list_for_each_entry(node, &acl->nodes, link) { + if (hlist_unhashed(&node->hash)) + vk_activate_acl(acl, node); + } + + mutex_unlock(&vk_activate_lock); +} + +static int vk_permission(struct vkernel *vk, struct inode *inode, int mask) +{ + struct vkernel_acl_node *node; + + node = vk_acl_node_get(&vk->acl, inode); + if (node) { + if ((mask & ~(node->mode) & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) { + pr_err("vkernel: permision denied, pid %d mask 0x%x vmode 0x%x path %s\n", + current->pid, mask, node->mode, 
node->path);
+			return -EACCES;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Note: some filesystems or inodes may define their own permission hook.
+ * In such cases, vkernel permission check will be skipped.
+ */
+int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap,
+			  struct inode *inode, int mask)
+{
+	int ret = 0;
+
+	/* Activate acl at first check */
+	if (unlikely(!vk->acl.active))
+		vk_activate_acl_all(&vk->acl);
+
+	if (inode->i_opflags & (IOP_VKERNEL_REG|IOP_VKERNEL_DIR))
+		ret = vk_permission(vk, inode, mask);
+
+	return ret;
+}
+
+int vkernel_set_acl(struct vkernel_acl *acl, char *path, unsigned short mode)
+{
+	struct vkernel_acl_node *node;
+
+	pr_debug("set acl, path %s mode 0x%x\n", path, mode);
+	node = kmem_cache_alloc(acl_node_cache, GFP_KERNEL_ACCOUNT);
+	if (!node) {
+		pr_err("failed to alloc acl node\n");
+		return -ENOMEM;
+	}
+	INIT_HLIST_NODE(&node->hash);
+	node->ino = 0;
+	node->sb = NULL;
+	strscpy(node->path, path, VKERNEL_PATH_MAX);	/* stop at NUL: path may be shorter than VKERNEL_PATH_MAX */
+	node->mode = mode;
+	list_add_tail(&node->link, &acl->nodes);
+
+	if (acl->active)
+		return vk_activate_acl(acl, node);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_acl);
+
+int vkernel_clear_acl(struct vkernel_acl *acl, char *path)
+{
+	struct vkernel_acl_node *node;
+	bool found = false;
+
+	list_for_each_entry(node, &acl->nodes, link) {
+		if (!strncmp(node->path, path, VKERNEL_PATH_MAX)) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return -EINVAL;
+
+	if (!hlist_unhashed(&node->hash))
+		vk_deactivate_acl(acl, node);
+
+	list_del(&node->link);
+	kmem_cache_free(acl_node_cache, node);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_clear_acl);
+
+int vkernel_set_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set)
+{
+	u64 i;
+	int r;
+
+	for (i = 0; i < set->nr_descs; i++) {
+		r = vkernel_set_acl(acl, set->descs[i].path, set->descs[i].mode);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_acl_set);
+
+int vkernel_clear_acl_set(struct vkernel_acl *acl, struct
vkernel_file_desc_set *set) +{ + u64 i; + int r; + + for (i = 0; i < set->nr_descs; i++) { + r = vkernel_clear_acl(acl, set->descs[i].path); + if (r) + return r; + } + + return 0; +} +EXPORT_SYMBOL(vkernel_clear_acl_set); + +int vkernel_set_default_acl_set(struct vkernel_acl *acl) +{ + u64 i; + int r; + + for (i = 0; i < ARRAY_SIZE(def_path); i++) { + r = vkernel_set_acl(acl, def_path[i], def_mode[i]); + if (r) + return r; + } + + return 0; +} +EXPORT_SYMBOL(vkernel_set_default_acl_set); diff --git a/drivers/vkernel/include/fs.h b/drivers/vkernel/include/fs.h new file mode 100644 index 000000000000..ce94c3274827 --- /dev/null +++ b/drivers/vkernel/include/fs.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_FS_H +#define _VKERNEL_FS_H + +#include + +int vk_acl_init(void); +void vk_acl_uninit(void); + +int vk_init_acl(struct vkernel_acl *acl, unsigned int bits); +void vk_uninit_acl(struct vkernel_acl *acl); +int vkernel_set_default_acl_set(struct vkernel_acl *acl); + +int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap, + struct inode *inode, int mask); + +#endif diff --git a/drivers/vkernel/include/security.h b/drivers/vkernel/include/security.h new file mode 100644 index 000000000000..0eac382ffd1b --- /dev/null +++ b/drivers/vkernel/include/security.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SECURITY_H +#define _VKERNEL_SECURITY_H + +#include + +int vk_cap_init(void); +void vk_cap_uninit(void); + +int vk_cap_capable(struct vkernel *vk, const struct cred *cred, + struct user_namespace *targ_ns, + int cap, unsigned int opts); + +#endif diff --git a/drivers/vkernel/include/syscall.h b/drivers/vkernel/include/syscall.h new file mode 100644 index 000000000000..4bb75703f430 --- /dev/null +++ b/drivers/vkernel/include/syscall.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SYSCALL_H +#define _VKERNEL_SYSCALL_H + +#include + +extern sys_call_vk_t 
*sys_call_table_ptr; + +int vk_syscall_init(void); +void vk_syscall_uninit(void); + +long vk_sys_ni_syscall(const struct pt_regs *regs); +long vk_sys_forbid_syscall(const struct pt_regs *regs); +long vk_sys_ni_cond_syscall(const struct pt_regs *regs); +long vk_sys_forbid_cond_syscall(const struct pt_regs *regs); + +int vk_init_syscall(struct vkernel_syscall *syscall); +void vk_uninit_syscall(struct vkernel_syscall *syscall); +void vk_install_default_syscalls(struct vkernel_syscall *syscall); + +extern struct vkernel_custom_type analysis_custom; + +#endif diff --git a/drivers/vkernel/include/utils.h b/drivers/vkernel/include/utils.h new file mode 100644 index 000000000000..9bcb29e144ca --- /dev/null +++ b/drivers/vkernel/include/utils.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_UTILS_H +#define _VKERNEL_UTILS_H + +int vk_kallsyms_init(void); +void vk_kallsyms_uninit(void); + +unsigned long lookup_name(const char *name); + +#endif diff --git a/drivers/vkernel/security/capability.c b/drivers/vkernel/security/capability.c new file mode 100644 index 000000000000..2b07101f260b --- /dev/null +++ b/drivers/vkernel/security/capability.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen + */ + +#include +#include + +#include "security.h" +#include "utils.h" + +int (*cap_capget_ptr)(struct task_struct *target, kernel_cap_t *effective, + kernel_cap_t *inheritable, kernel_cap_t *permitted); +int (*cap_capset_ptr)(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); +int (*cap_task_prctl_ptr)(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5); + +int vk_cap_init(void) +{ + cap_capget_ptr = (void *)lookup_name("cap_capget"); + cap_capset_ptr = (void *)lookup_name("cap_capset"); + cap_task_prctl_ptr = (void *)lookup_name("cap_task_prctl"); + if (!cap_capget_ptr || !cap_capset_ptr || !cap_task_prctl_ptr) { + pr_err("failed to find cap symbols, get: %p, set: %p, prctl: %p\n", + cap_capget_ptr, cap_capset_ptr, cap_task_prctl_ptr); + return -1; + } + + return 0; +} + +void vk_cap_uninit(void) {} + +int vk_cap_capable(struct vkernel *vk, const struct cred *cred, struct user_namespace *ns, + int cap, unsigned int opts) +{ + /* Check cred and real_cred to allow fs overried_creds */ + if (current_cred() == current_real_cred() && + !cap_issubset(cred->cap_effective, vk->linux_cap.effective)) { + pr_debug("vkernel: cap eff %llx escalated? use vk eff %llx instead\n", + cred->cap_effective.val, vk->linux_cap.effective.val); + for (;;) { + if (ns == cred->user_ns) + return cap_raised(vk->linux_cap.effective, cap) ? 0 : -EPERM; + if (ns->level <= cred->user_ns->level) + return -EPERM; + if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) + return 0; + ns = ns->parent; + } + } + return 0; +} + +/* + * Set cap for `current`, and `current` should be vk->init_process + * + * Note: this operation will take effect immediately. 
+ */
+int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap)
+{
+	kernel_cap_t effective, inheritable, permitted;
+	struct cred *cred;
+	int action, ret, i;
+
+	vk->linux_cap = *cap;
+
+	/* Get current [effective,inheritable,permitted] */
+	cap_capget_ptr(vk->init_process, &effective, &inheritable, &permitted);
+
+	/* Drop bset according to linux_cap, which affects the following capset */
+	if (cap_raised(effective, CAP_SETPCAP)) {
+		for (i = 0; i <= CAP_LAST_CAP; i++) {
+			if (!cap_raised(cap->bset, i)) {
+				ret = cap_task_prctl_ptr(PR_CAPBSET_DROP, i, 0, 0, 0);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	/* Set current [effective,inheritable,permitted], ambient is automatically updated */
+	cred = prepare_creds();
+	if (!cred)
+		return -ENOMEM;
+	ret = cap_capset_ptr(cred, current_cred(), &cap->effective, &cap->inheritable,
+			     &cap->permitted);
+	if (ret) {
+		abort_creds(cred);	/* never committed: release it instead of leaking */
+		return ret;
+	}
+	commit_creds(cred);
+
+	/* Raise or lower ambient according to linux_cap (CAP_LAST_CAP included, as for bset) */
+	for (i = 0; i <= CAP_LAST_CAP; i++) {
+		if (cap_raised(cap->ambient, i))
+			action = PR_CAP_AMBIENT_RAISE;
+		else
+			action = PR_CAP_AMBIENT_LOWER;
+		ret = cap_task_prctl_ptr(PR_CAP_AMBIENT, action, i, 0, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_linux_cap);
diff --git a/drivers/vkernel/syscall.c b/drivers/vkernel/syscall.c
new file mode 100644
index 000000000000..0aa03db71dcc
--- /dev/null
+++ b/drivers/vkernel/syscall.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+#include
+#include
+
+#include "syscall.h"
+#include "utils.h"
+
+sys_call_vk_t *sys_call_table_ptr;
+
+int (*force_sig_seccomp_ptr)(int syscall, int reason, bool force_coredump);
+void (*do_exit_ptr)(long code);
+
+#define NOTIF_SYSCALL_RULE(name) \
+{ \
+	.nr = __NR_##name, \
+	.act = (VKERNEL_SYSCALL_ACT_ERRNO << VKERNEL_SYSCALL_ERRNO_BITS) | ENOSYS, \
+} \
+
+static struct vkernel_syscall_rule_desc def_rules[] = {
+	NOTIF_SYSCALL_RULE(move_pages),
+	NOTIF_SYSCALL_RULE(fsconfig),
+	NOTIF_SYSCALL_RULE(kexec_load),
+	// NOTIF_SYSCALL_RULE(sysfs),
+	NOTIF_SYSCALL_RULE(fsopen),
+	NOTIF_SYSCALL_RULE(pkey_mprotect),
+	// NOTIF_SYSCALL_RULE(ustat),
+	NOTIF_SYSCALL_RULE(pkey_free),
+	NOTIF_SYSCALL_RULE(pkey_alloc),
+	NOTIF_SYSCALL_RULE(userfaultfd),
+	NOTIF_SYSCALL_RULE(migrate_pages),
+	NOTIF_SYSCALL_RULE(add_key),
+	NOTIF_SYSCALL_RULE(keyctl),
+	NOTIF_SYSCALL_RULE(clone3),
+	NOTIF_SYSCALL_RULE(kexec_file_load),
+	NOTIF_SYSCALL_RULE(swapoff),
+	NOTIF_SYSCALL_RULE(fsmount),
+	NOTIF_SYSCALL_RULE(open_tree),
+	// NOTIF_SYSCALL_RULE(_sysctl),
+	NOTIF_SYSCALL_RULE(move_mount),
+	NOTIF_SYSCALL_RULE(swapon),
+	NOTIF_SYSCALL_RULE(pivot_root),
+	NOTIF_SYSCALL_RULE(fspick),
+};
+
+static struct kmem_cache *syscall_rule_cache;
+
+int vk_syscall_init(void)
+{
+	sys_call_table_ptr = (void *)lookup_name("sys_call_table");
+	if (!sys_call_table_ptr) {
+		pr_err("failed to find sys_call_table\n");
+		return -1;
+	}
+
+	force_sig_seccomp_ptr = (void *)lookup_name("force_sig_seccomp");
+	if (!force_sig_seccomp_ptr) {
+		pr_err("failed to find force_sig_seccomp\n");
+		return -1;
+	}
+
+	do_exit_ptr = (void *)lookup_name("do_exit");
+	if (!do_exit_ptr) {	/* check the symbol just looked up, not force_sig_seccomp_ptr */
+		pr_err("failed to find do_exit\n");
+		return -1;
+	}
+
+	syscall_rule_cache = kmem_cache_create("vkernel_syscall_rule",
+			sizeof(struct vkernel_syscall_rule), 0, SLAB_ACCOUNT, NULL);
+	if (!syscall_rule_cache) {
+		pr_err("failed to create slab for syscall rule\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void vk_syscall_uninit(void)
+{
+	kmem_cache_destroy(syscall_rule_cache);
+}
+
+static inline bool check_cond(int op, unsigned long arg,
+			      unsigned long oprand1, unsigned long oprand2)
+{
+	switch (op) {
+	case VKERNEL_SYSCALL_CMP_EQ:
+		return arg == oprand1;
+	case VKERNEL_SYSCALL_CMP_NE:
+		return arg != oprand1;
+	case VKERNEL_SYSCALL_CMP_LT:
+		return arg < oprand1;
+	case VKERNEL_SYSCALL_CMP_LE:
+		return arg <= oprand1;
+	case VKERNEL_SYSCALL_CMP_GT:
+		return arg > oprand1;
+	case VKERNEL_SYSCALL_CMP_GE:
+		return arg >= oprand1;
+	case VKRENEL_SYSCALL_CMP_ME:
+		return (arg & oprand1) == oprand2;
+	}
+
+	return false;
+}
+
+
+static bool check_rule(struct vkernel_syscall_rule *rule, struct pt_regs *regs)
+{
+	struct vkernel_syscall_cond *cond;
+	unsigned long args[6];
+	int i;
+
+	/* Corner case */
+	if (!rule)
+		return true;
+
+	syscall_get_arguments(current, regs, args);
+	for (i = 0; i < 6; i++) {
+		cond = &rule->conds[i];
+		if (cond->op == VKERNEL_SYSCALL_CMP_ED)
+			break;
+		if (!check_cond(cond->op, args[cond->index], cond->oprand1, cond->oprand2))
+			return false;
+	}
+
+	return true;
+}
+
+asmlinkage long vk_sys_act_cond(const struct pt_regs *regs)
+{
+	struct vkernel *vk;
+	struct vkernel_syscall_rule *rule;
+	struct pt_regs *curr_regs;
+	int nr;
+	unsigned int act;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	if (likely(current_vk_task == current))
+		vk = current_vk;
+	else
+		vk = vkernel_find_vk_by_task(current);
+
+	act = vk->syscall.def_act;
+	list_for_each_entry(rule, &vk->syscall.rule_chains[nr], link) {
+		if (check_rule(rule, curr_regs)) {
+			act = rule->act;
+			break;
+		}
+	}
+
+	switch (act >> VKERNEL_SYSCALL_ERRNO_BITS) {
+	case VKERNEL_SYSCALL_ACT_TRAP:
+		pr_info("vkernel: cond trap for syscall %d\n", nr);
+		syscall_rollback(current, curr_regs);
+		force_sig_seccomp_ptr(nr, -EPERM, false);
+		fallthrough;
+	case VKERNEL_SYSCALL_ACT_ERRNO:
+		pr_info("vkernel: cond err for syscall %d\n", nr);
+		return -(act & VKERNEL_SYSCALL_ERRNO_MASK);
+
+	case VKERNEL_SYSCALL_ACT_USER_NOTIF:
+		pr_info("vkernel: cond user notif (nosys) for syscall %d\n", nr);
+		return -ENOSYS;
+
+	case VKERNEL_SYSCALL_ACT_TRACE:
+		pr_info("vkernel: cond trace (nosys) for syscall %d\n", nr);
+		return -ENOSYS;
+
+	case VKERNEL_SYSCALL_ACT_LOG:
+		pr_info("vkernel: cond log for syscall %d\n", nr);
+		fallthrough;
+	case VKERNEL_SYSCALL_ACT_ALLOW:
+		return sys_call_table_ptr[nr](regs);
+
+	case VKERNEL_SYSCALL_ACT_KILL_PROCESS:
+	case VKERNEL_SYSCALL_ACT_KILL_THREAD:
+	default:
+		pr_info("vkernel: cond kill process/thread for syscall %d\n", nr);
+		if ((act >> VKERNEL_SYSCALL_ERRNO_BITS) != VKERNEL_SYSCALL_ACT_KILL_THREAD ||
+		    (atomic_read(&current->signal->live) == 1)) {
+			/* Show the original registers in the dump. */
+			syscall_rollback(current, curr_regs);
+			/* Trigger a coredump with SIGSYS */
+			force_sig_seccomp_ptr(nr, -EPERM, true);
+		} else {
+			/* Call do_exit since there is missing unified pt_reg api */
+			do_exit_ptr(SIGSYS);
+		}
+		return -1;
+	}
+
+	/* We never get here */
+	unreachable();
+
+	return -1;
+}
+
+asmlinkage long vk_sys_act_invalid(const struct pt_regs *regs)
+{
+	pr_info("invalid syscall, never get here\n");
+	return -ENOSYS;
+}
+
+asmlinkage long vk_sys_act_kill_process(const struct pt_regs *regs)
+{
+	struct pt_regs *curr_regs;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	pr_info("vkernel: kill process for syscall %d\n", nr);
+	syscall_rollback(current, curr_regs);
+	force_sig_seccomp_ptr(nr, -EPERM, true);
+
+	return -1;
+}
+
+asmlinkage long vk_sys_act_kill_thread(const struct pt_regs *regs)
+{
+	struct pt_regs *curr_regs;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	pr_info("vkernel: kill thread for syscall %d\n", nr);
+	if ((atomic_read(&current->signal->live) == 1)) {
+		syscall_rollback(current, current_pt_regs());
+		force_sig_seccomp_ptr(nr, -EPERM, true);
+	} else {
+		/* Call
do_exit since there is missing unified pt_reg api */ + do_exit_ptr(SIGSYS); + } + + return -1; +} + +asmlinkage long vk_sys_act_trap(const struct pt_regs *regs) +{ + struct pt_regs *curr_regs; + int nr; + + curr_regs = current_pt_regs(); + nr = syscall_get_nr(current, curr_regs); + pr_info("vkernel: trap for syscall %d\n", nr); + syscall_rollback(current, curr_regs); + force_sig_seccomp_ptr(nr, -EPERM, false); + + return -1; +} + +asmlinkage long vk_sys_act_user_notif(const struct pt_regs *regs) +{ + pr_err("vkernel: user notif for syscall nr %d\n", + syscall_get_nr(current, current_pt_regs())); + return -ENOSYS; +} + +asmlinkage long vk_sys_act_trace(const struct pt_regs *regs) +{ + pr_err("vkernel: trace for syscall nr %d\n", + syscall_get_nr(current, current_pt_regs())); + return -ENOSYS; +} + +asmlinkage long vk_sys_act_errno(const struct pt_regs *regs) +{ + struct vkernel *vk; + struct vkernel_syscall_rule *rule; + struct pt_regs *curr_regs; + int nr; + int errno; + + if (likely(current_vk_task == current)) + vk = current_vk; + else + vk = vkernel_find_vk_by_task(current); + curr_regs = current_pt_regs(); + nr = syscall_get_nr(current, curr_regs); + if (list_empty(&vk->syscall.rule_chains[nr])) + errno = vk->syscall.def_act & 0xffff; + else { + rule = list_first_entry(&vk->syscall.rule_chains[nr], + struct vkernel_syscall_rule, link); + errno = rule->act & VKERNEL_SYSCALL_ERRNO_MASK; + } + + pr_err("vkernel: err for syscall nr %d errno -%d\n", nr, errno); + return -errno; +} + +asmlinkage long vk_sys_act_log(const struct pt_regs *regs) +{ + int nr; + + nr = syscall_get_nr(current, current_pt_regs()); + pr_info("vkernel: log for syscall %d\n", nr); + + return sys_call_table_ptr[nr](regs); +} + +static void clear_syscall_rule_chain(struct list_head *chain) +{ + struct vkernel_syscall_rule *rule; + struct vkernel_syscall_rule *tmp; + + list_for_each_entry_safe(rule, tmp, chain, link) { + list_del(&rule->link); + kmem_cache_free(syscall_rule_cache, rule); + } + 
INIT_LIST_HEAD(chain); +} + +int vk_init_syscall(struct vkernel_syscall *syscall) +{ + int i; + + for (i = 0; i < NR_syscalls; i++) { + syscall->table[i] = sys_call_table_ptr[i]; + INIT_LIST_HEAD(&syscall->rule_chains[i]); + } + syscall->def_act = VKERNEL_SYSCALL_ACT_ALLOW << VKERNEL_SYSCALL_ERRNO_BITS; + + return 0; +} + +void vk_uninit_syscall(struct vkernel_syscall *syscall) +{ + int i; + + for (i = 0; i < NR_syscalls; i++) + clear_syscall_rule_chain(&syscall->rule_chains[i]); +} + +int vkernel_set_syscall(struct vkernel_syscall *syscall, unsigned int nr, + sys_call_vk_t call) +{ + if (unlikely(nr >= NR_syscalls)) + return -EINVAL; + + clear_syscall_rule_chain(&syscall->rule_chains[nr]); + syscall->table[nr] = call; + + return 0; +} +EXPORT_SYMBOL(vkernel_set_syscall); + +static sys_call_vk_t uncond_table[] = { + [VKERNEL_SYSCALL_ACT_INVALID] = vk_sys_act_invalid, + [VKERNEL_SYSCALL_ACT_KILL_PROCESS] = vk_sys_act_kill_process, + [VKERNEL_SYSCALL_ACT_KILL_THREAD] = vk_sys_act_kill_thread, + [VKERNEL_SYSCALL_ACT_TRAP] = vk_sys_act_trap, + [VKERNEL_SYSCALL_ACT_ERRNO] = vk_sys_act_errno, + [VKERNEL_SYSCALL_ACT_USER_NOTIF] = vk_sys_act_user_notif, + [VKERNEL_SYSCALL_ACT_TRACE] = vk_sys_act_trace, + [VKERNEL_SYSCALL_ACT_LOG] = vk_sys_act_log, +}; + +/* + * Call before adding rules + */ +int vkernel_set_default_syscall_rule(struct vkernel_syscall *syscall, u32 act) +{ + unsigned int action; + int i; + + action = act >> VKERNEL_SYSCALL_ERRNO_BITS; + if (action == VKERNEL_SYSCALL_ACT_INVALID || + action > VKERNEL_SYSCALL_ACT_ALLOW || + act == syscall->def_act) { + pr_err("invalid default rule, act 0x%x, old 0x%x\n", act, syscall->def_act); + return -EINVAL; + } + + for (i = 0; i < NR_syscalls; i++) { + clear_syscall_rule_chain(&syscall->rule_chains[i]); + if (action < VKERNEL_SYSCALL_ACT_ALLOW) + syscall->table[i] = uncond_table[action]; + else + syscall->table[i] = sys_call_table_ptr[i]; + } + syscall->def_act = act; + + return 0; +} 
+EXPORT_SYMBOL(vkernel_set_default_syscall_rule);
+
+/*
+ * Add the rule described by @desc in front of the chain of syscall desc->nr
+ * and point the table slot at the matching handler.
+ *
+ * Returns 0 on success, -EINVAL for a bad syscall number, action or
+ * condition, -ENOMEM when the rule cannot be allocated.
+ */
+int vkernel_add_syscall_rule(struct vkernel_syscall *syscall,
+		struct vkernel_syscall_rule_desc *desc)
+{
+	struct vkernel_syscall_rule *rule;
+	unsigned int nr;
+	unsigned int action;
+	int index;
+
+	pr_debug("set syscall rule, nr %u act 0x%x has_cond %d\n",
+		 desc->nr, desc->act, desc->conds[0].op != VKERNEL_SYSCALL_CMP_ED);
+
+	nr = desc->nr;
+	action = (desc->act >> VKERNEL_SYSCALL_ERRNO_BITS);
+	if (nr >= NR_syscalls ||
+	    action == VKERNEL_SYSCALL_ACT_INVALID ||
+	    action > VKERNEL_SYSCALL_ACT_ALLOW ||
+	    (desc->act == syscall->def_act && list_empty(&syscall->rule_chains[nr]))) {
+		pr_err("invalid rule, nr %u act 0x%x def_act 0x%x\n",
+		       desc->nr, desc->act, syscall->def_act);
+		return -EINVAL;
+	}
+
+	/*
+	 * A condition selects one of the six syscall arguments. Reject any
+	 * other index now, it would later be used to index the argument array.
+	 */
+	for (index = 0; index < 6; index++) {
+		if (desc->conds[index].op == VKERNEL_SYSCALL_CMP_ED)
+			break;
+		if ((unsigned long)desc->conds[index].index >= 6) {
+			pr_err("invalid rule, nr %u cond %d has a bad arg index\n",
+			       nr, index);
+			return -EINVAL;
+		}
+	}
+
+	/* Update syscall rule chain */
+	rule = kmem_cache_alloc(syscall_rule_cache, GFP_KERNEL_ACCOUNT);
+	if (!rule) {
+		pr_err("failed to alloc syscall rule\n");
+		return -ENOMEM;
+	}
+
+	rule->act = desc->act;
+	/* Copy up to and including the terminating CMP_ED entry. */
+	for (index = 0; index < 6; index++) {
+		rule->conds[index] = desc->conds[index];
+		if (desc->conds[index].op == VKERNEL_SYSCALL_CMP_ED)
+			break;
+	}
+	list_add(&rule->link, &syscall->rule_chains[nr]);
+
+	/* Update syscall table */
+	if (index > 0)
+		syscall->table[nr] = vk_sys_act_cond;
+	else if (action < VKERNEL_SYSCALL_ACT_ALLOW)
+		syscall->table[nr] = uncond_table[action];
+	else
+		syscall->table[nr] = sys_call_table_ptr[nr];
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_add_syscall_rule);
+
+/* Install every rule of the built-in def_rules[] table. */
+void vk_install_default_syscalls(struct vkernel_syscall *syscall)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(def_rules); i++)
+		vkernel_add_syscall_rule(syscall, &def_rules[i]);
+}
+EXPORT_SYMBOL(vk_install_default_syscalls);
+
+
+/*
+ * Per-vk data of the "analysis" custom type: a hit counter per syscall and
+ * the list of file names passed to execve()/execveat().
+ */
+struct vkernel_analysis {
+	unsigned int syscalls[NR_syscalls + 1];
+	unsigned int exec_count;
+	unsigned int exec_capacity;
+	char *execs[];
+};
+
+/* Number of execs[] slots allocated when an analysis vk is created. */
+#define VK_ANALYSIS_INIT_EXECS 64
+
+/*
+ * Table entry used for every syscall of an analysis vk: count the call,
+ * record the file name of exec calls, then run the native syscall.
+ */
+asmlinkage long vk_sys_act_analysis(const struct pt_regs *regs)
+{
+	struct vkernel *vk;
+	struct vkernel_analysis *data;
+	struct vkernel_analysis *newdata;
+	char __user *uname;
+	char *kname;
+	struct pt_regs *curr_regs;
+	unsigned int new_cap;
+	long len;
+	int nr;
+
+	curr_regs = current_pt_regs();
+	nr = syscall_get_nr(current, curr_regs);
+	if (likely(current_vk_task == current))
+		vk = current_vk;
+	else
+		vk = vkernel_find_vk_by_task(current);
+	/* Nothing to record into: just run the native syscall. */
+	if (unlikely(!vk || !vk->private))
+		return sys_call_table_ptr[nr](regs);
+
+	data = (struct vkernel_analysis *)vk->private;
+	if (data->syscalls[nr] < UINT_MAX)
+		data->syscalls[nr]++;
+	if (nr == __NR_execve || nr == __NR_execveat) {
+		kname = __getname();
+		if (unlikely(!kname)) {
+			pr_err("failed to alloc name\n");
+			return -ENOMEM;
+		}
+		if (nr == __NR_execve)
+			uname = (char __user *)regs_get_kernel_argument(curr_regs, 0);
+		else
+			uname = (char __user *)regs_get_kernel_argument(curr_regs, 1);
+		len = strncpy_from_user(kname, uname, PATH_MAX);
+		if (len < 0) {
+			pr_err("failed to copy user filename\n");
+			__putname(kname);
+			return -EFAULT;
+		}
+		/* A full buffer means no NUL was copied: do not keep it. */
+		if (len >= PATH_MAX) {
+			__putname(kname);
+			return -ENAMETOOLONG;
+		}
+		if (data->exec_count >= data->exec_capacity) {
+			/* Doubling zero would never grow the array. */
+			new_cap = data->exec_capacity ? data->exec_capacity << 1 : 4;
+			newdata = kzalloc(sizeof(*data) + sizeof(char *) * new_cap,
+					  GFP_KERNEL);
+			if (!newdata) {
+				/* The name is not recorded, so release it here. */
+				__putname(kname);
+				return -ENOMEM;
+			}
+			memcpy(newdata, data, sizeof(*data) + sizeof(char *) * data->exec_capacity);
+			newdata->exec_capacity = new_cap;
+
+			vk->private = newdata;
+			/* TODO: fix race window */
+			while (refcount_read(&vk->users_count) > 1)
+				;
+			kfree(data);
+			data = newdata;
+		}
+		data->execs[data->exec_count++] = kname;
+	}
+
+	return sys_call_table_ptr[nr](regs);
+}
+
+/*
+ * debugfs "analysis" file: dump the used syscalls, the recorded exec paths
+ * and the per-syscall counters as a JSON object. Paths are printed as
+ * recorded, without JSON escaping.
+ */
+static int analysis_show(struct seq_file *m, void *v)
+{
+	struct vkernel *vk = m->private;
+	struct vkernel_analysis *data = vk->private;
+	unsigned int i;
+	bool first;
+
+	seq_puts(m, "{\n");
+	seq_puts(m, " \"syscalls\": [");
+	first = true;
+	for (i = 0; i < NR_syscalls; i++) {
+		if (!data->syscalls[i])
+			continue;
+		if (first) {
+			seq_printf(m, "%u", i);
+			first = false;
+		} else
+			seq_printf(m, ", %u", i);
+	}
+	seq_puts(m, "],\n");
+	seq_puts(m, " \"execs\": [\n");
+	first = true;
+	for (i = 0; i < data->exec_count; i++) {
+		if (unlikely(!data->execs[i])) {
+			pr_warn("encounter nil exec path in vkernel_analysis\n");
+			continue;
+		}
+		if (first) {
+			seq_printf(m, " \"%s\"", data->execs[i]);
+			first = false;
+		} else
+			seq_printf(m, ",\n \"%s\"", data->execs[i]);
+	}
+	seq_puts(m, "\n ],\n");
+	seq_puts(m, " \"syscall_details\": [\n");
+	first = true;
+	for (i = 0; i < NR_syscalls; i++) {
+		if (!data->syscalls[i])
+			continue;
+		if (first) {
+			seq_printf(m, " {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]);
+			first = false;
+		} else
+			seq_printf(m, ",\n {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]);
+	}
+	seq_puts(m, "\n ]\n");
+	seq_puts(m, "}\n");
+
+	return 0;
+}
+
+/* Hold a reference on the vk for as long as the debugfs file is open. */
+static int analysis_open(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+	int r;
+
+	if (!vkernel_get_vk_safe(vk))
+		return -ENOENT;
+
+	r = single_open(file, analysis_show, inode->i_private);
+	if (r < 0)
+		vkernel_put_vk(vk);
+
+	return r;
+}
+
+/* Drop the reference taken by analysis_open(). */
+static int analysis_release(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+
+	vkernel_put_vk(vk);
+
+	return single_release(inode, file);
+}
+
+static const struct file_operations analysis_fops = {
+	.open = analysis_open,
+	.release = analysis_release,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+/*
+ * post_create hook of the analysis type: allocate the record, route every
+ * syscall through vk_sys_act_analysis() and expose the debugfs file.
+ */
+static int analysis_post_create(struct vkernel *vk)
+{
+	struct vkernel_analysis *data;
+	struct vkernel_syscall *syscall;
+	int i;
+
+	data = kzalloc(sizeof(*data) + sizeof(char *) * VK_ANALYSIS_INIT_EXECS,
+		       GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	/* Keep the recorded capacity in sync with what was allocated. */
+	data->exec_capacity = VK_ANALYSIS_INIT_EXECS;
+	vk->private = data;
+
+	syscall = &vk->syscall;
+	for (i = 0; i < NR_syscalls; i++)
+		syscall->table[i] = vk_sys_act_analysis;
+
+	debugfs_create_file("analysis", 0444, vk->debugfs_dentry, vk, &analysis_fops);
+
+	return 0;
+}
+
+/* pre_destroy hook: release the recorded exec names and the record itself. */
+static void analysis_pre_destroy(struct vkernel *vk)
+{
+	struct vkernel_analysis *data = (struct vkernel_analysis *)vk->private;
+	unsigned int i;
+
+	if (unlikely(!data)) {
+		pr_warn("detroy an analysis vk without vkernel_analysis data\n");
+		return;
+	}
+
+	/* Every recorded name came from __getname() and is owned by @data. */
+	for (i = 0; i < data->exec_count; i++) {
+		if (data->execs[i])
+			__putname(data->execs[i]);
+	}
+	kfree(data);
+	vk->private = NULL;
+}
+
+struct vkernel_custom_type analysis_custom = {
+	.owner = THIS_MODULE,
+	.name = "analysis",
+	.post_create = analysis_post_create,
+	.pre_destroy = analysis_pre_destroy,
+};
diff --git a/drivers/vkernel/utils/kallsyms.c b/drivers/vkernel/utils/kallsyms.c
new file mode 100644
index 000000000000..613d1f0b28dc
--- /dev/null
+++ b/drivers/vkernel/utils/kallsyms.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Wrapper of lookup_name
+ * Define the wrapper, so other components can include a function not a symbol
+ *
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ */
+
+#include
+#include
+
+#include "utils.h"
+
+/*
+ * There are two ways of preventing vicious recursive loops when hooking:
+ * - detect recursion using function return address (USE_FENTRY_OFFSET = 0)
+ * - avoid recursion by jumping over the ftrace call (USE_FENTRY_OFFSET = 1)
+ */
+#define USE_FENTRY_OFFSET 0
+
+/*
+ * Tail call optimization can interfere with recursion detection based on
+ * return address on the stack. Disable it to avoid machine hangups.
+ */
+#if !USE_FENTRY_OFFSET
+#pragma GCC optimize("-fno-optimize-sibling-calls")
+#endif
+
+unsigned long vk_lookup_name(const char *name)
+{
+	struct kprobe kp = { .symbol_name = name };
+	unsigned long retval;
+
+	if (register_kprobe(&kp) < 0)
+		return 0;
+
+	retval = (unsigned long)kp.addr;
+	unregister_kprobe(&kp);
+
+	return retval;
+}
+
+static unsigned long (*kallsyms_lookup_name_ptr)(const char *name);
+
+int vk_kallsyms_init(void)
+{
+	kallsyms_lookup_name_ptr = (void *)vk_lookup_name("kallsyms_lookup_name");
+	if (!kallsyms_lookup_name_ptr) {
+		pr_err("cannot resolve symbol: kallsyms_lookup_name\n");
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+void vk_kallsyms_uninit(void) {}
+
+unsigned long lookup_name(const char *name)
+{
+	return kallsyms_lookup_name_ptr(name);
+}
+EXPORT_SYMBOL(lookup_name);
diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c
new file mode 100644
index 000000000000..5a57a54508b2
--- /dev/null
+++ b/drivers/vkernel/vkernel_main.c
@@ -0,0 +1,864 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * vkernel core
+ *
+ * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen
+ **/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <linux/sched/task.h>
+
+#include "fs.h"
+#include "security.h"
+#include "syscall.h"
+#include "utils.h"
+
+MODULE_AUTHOR("JYH Lab");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("vkernel core module");
+
+/* Worst case buffer size needed for holding an integer. */
+#define ITOA_MAX_LEN 12
+
+static DEFINE_MUTEX(vk_lock);
+static LIST_HEAD(vk_list);
+
+static DEFINE_MUTEX(custom_lock);
+static DEFINE_HASHTABLE(custom_ht, 6);
+
+struct dentry *vkernel_debugfs_dir;
+EXPORT_SYMBOL_GPL(vkernel_debugfs_dir);
+
+static const struct file_operations vkernel_chardev_ops;
+
+#define CONFIG_VKERNEL_COMPAT
+
+#ifdef CONFIG_VKERNEL_COMPAT
+#define VKERNEL_COMPAT(c)	.compat_ioctl	= (c)
+#else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/vkernel
+ * - If the open has been done by a 64bit task, and the vkernel fd
+ *   passed to a compat task, let the ioctls fail.
+ */
+static long vkernel_no_compat_ioctl(struct file *file, unsigned int ioctl,
+				    unsigned long arg)
+{
+	return -EINVAL;
+}
+
+static int vkernel_no_compat_open(struct inode *inode, struct file *file)
+{
+	return is_compat_task() ? -ENODEV : 0;
+}
+#define VKERNEL_COMPAT(c)	.compat_ioctl	= vkernel_no_compat_ioctl,	\
+				.open		= vkernel_no_compat_open
+#endif
+
+#define VKERNEL_EVENT_CREATE_VK 0
+#define VKERNEL_EVENT_DESTROY_VK 1
+
+static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk);
+static DEFINE_MUTEX(event_lock);
+static unsigned long long vkernel_createvk_count;
+static unsigned long long vkernel_active_vks;
+
+
+static int default_post_create(struct vkernel *vk)
+{
+	/* Set default syscall and acl rules */
+	vk_install_default_syscalls(&vk->syscall);
+	return vkernel_set_default_acl_set(&vk->acl);
+}
+
+static struct vkernel_custom_type default_custom = {
+	.owner = THIS_MODULE,
+	.name = "default",
+	.post_create = default_post_create,
+	.pre_destroy = NULL,
+};
+
+/* Look up a registered custom type by name; returns NULL when absent. */
+struct vkernel_custom_type *vkernel_find_custom(const char *name)
+{
+	struct vkernel_custom_type *custom;
+	unsigned int key;
+
+	key = full_name_hash(NULL, name, strlen(name));
+
+	hash_for_each_possible(custom_ht, custom, hash, key) {
+		if (!strcmp(name, custom->name))
+			return custom;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(vkernel_find_custom);
+
+int vkernel_register_custom(struct vkernel_custom_type *custom)
+{
+	unsigned int key;
+
+	if (!custom->owner) {
+		pr_err("custom type %s has no owner\n", custom->name);
+		return -EINVAL;
+	}
+
+	key = full_name_hash(NULL, custom->name, strlen(custom->name));
+	/* Check and insert under one lock so two registrations cannot race. */
+	mutex_lock(&custom_lock);
+	if (vkernel_find_custom(custom->name)) {
+		mutex_unlock(&custom_lock);
+		pr_err("custom type %s already existed\n", custom->name);
+		return -EEXIST;
+	}
+	hash_add(custom_ht, &custom->hash, key);
+	mutex_unlock(&custom_lock);
+
+	pr_info("register cutom type %s\n", custom->name);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_register_custom);
+
+int vkernel_unregister_custom(struct vkernel_custom_type *custom)
+{
+	pr_info("unregister cutom type %s\n", custom->name);
+
+	mutex_lock(&custom_lock);
+	/* It is also ok to remove an unhashed custom */
+	hash_del(&custom->hash);
+	mutex_unlock(&custom_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_unregister_custom);
+
+
+/* Arch hooks: an ioctl nobody handles must fail instead of reporting 0. */
+__weak int vkernel_arch_vk_ioctl(struct file *filp,
+				 unsigned int ioctl, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+__weak int vkernel_arch_dev_ioctl(struct file *filp,
+				  unsigned int ioctl, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static int vkernel_vk_ioctl_set_def_syscall(struct vkernel *vk, unsigned long arg)
+{
+	return vkernel_set_default_syscall_rule(&vk->syscall, arg);
+}
+
+static int vkernel_vk_ioctl_restrict_syscall(struct vkernel *vk, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_syscall_rule_desc desc;
+
+	if (copy_from_user(&desc, argp, sizeof(desc)))
+		return -EFAULT;
+
+	return vkernel_add_syscall_rule(&vk->syscall, &desc);
+}
+
+static int vkernel_vk_ioctl_restrict_file(struct vkernel *vk, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_file_desc_set header;
+	struct vkernel_file_desc_set *set;
+	unsigned long full_size;
+	int r;
+
+	if (copy_from_user(&header, argp, sizeof(header)))
+		return -EFAULT;
+	if (!header.nr_descs)
+		return -EINVAL;
+	/* nr_descs is user controlled: refuse sizes that would wrap around. */
+	if (header.nr_descs >
+	    (ULONG_MAX - sizeof(header)) / sizeof(struct vkernel_file_desc))
+		return -EINVAL;
+
+	full_size = sizeof(header) + sizeof(struct vkernel_file_desc) * header.nr_descs;
+	set = kmalloc(full_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!set)
+		return -ENOMEM;
+	if (copy_from_user(set, argp, full_size)) {
+		r = -EFAULT;
+		goto out_set;
+	}
+	/*
+	 * The header is fetched twice and userspace may change it in
+	 * between; only the validated count matches the allocation.
+	 */
+	set->nr_descs = header.nr_descs;
+
+	r = vkernel_set_acl_set(&vk->acl, set);
+
+out_set:
+	kfree(set);
+	return r;
+}
+
+static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_linux_cap cap;
+
+	if (copy_from_user(&cap, argp, sizeof(cap)))
+		return -EFAULT;
+
+	return vkernel_set_linux_cap(vk, &cap);
+}
+
+static int stat_show(struct seq_file *m, void *v)
+{
+	struct vkernel *vk = m->private;
+
+	seq_puts(m, "=== BASIC ===\n");
+	seq_printf(m, "Name: %s\n", vk->name);
+	seq_printf(m, "Pid ns: %u\n", vk->pid_ns->ns.inum);
+	seq_printf(m, "Uts ns: %u\n", vk->uts_ns->ns.inum);
+	seq_printf(m, "Init pid: %d\n", vk->init_pid);
+	seq_printf(m, "Users count: %d\n", refcount_read(&vk->users_count));
+	seq_printf(m, "Active: %d\n", vk->active);
+
+	seq_puts(m, "=== SECURITY ===\n");
+	seq_printf(m, "Syscall def act: %d\n", vk->syscall.def_act);
+	seq_printf(m, "Syscall do_futex %p\n", vk->syscall.do_futex);
+	seq_printf(m, "ACL bits: %d\n", vk->acl.bits);
+	seq_printf(m, "ACL active: %d\n", vk->acl.active);
+	seq_printf(m, "Cap inheritable: 0x%llx\n", vk->linux_cap.inheritable.val);
+	seq_printf(m, "Cap permitted: 0x%llx\n", vk->linux_cap.permitted.val);
+	seq_printf(m, "Cap effective: 0x%llx\n", vk->linux_cap.effective.val);
+	seq_printf(m, "Cap bset: 0x%llx\n", vk->linux_cap.bset.val);
+	seq_printf(m, "Cap ambient: 0x%llx\n", vk->linux_cap.ambient.val);
+
+	seq_puts(m, "=== OPERATION ===\n");
+	seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable);
+	seq_printf(m, "Op generic_permission: %p\n", vk->ops.generic_permission);
+
+	seq_puts(m, "=== CUSTOM ===\n");
+	seq_printf(m, "Custom type: %s\n", vk->custom->name);
+	seq_printf(m, "Custom post_create: %p\n", vk->custom->post_create);
+	seq_printf(m, "Custom pre_destroy: %p\n", vk->custom->pre_destroy);
+
+	return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+	int r;
+
+	if (!vkernel_get_vk_safe(vk))
+		return -ENOENT;
+
+	r = single_open(file, stat_show, inode->i_private);
+	if (r < 0)
+		vkernel_put_vk(vk);
+
+	return r;
+}
+
+static int stat_release(struct inode *inode, struct file *file)
+{
+	struct vkernel *vk = inode->i_private;
+
+	vkernel_put_vk(vk);
+
+	return single_release(inode, file);
+}
+
+static const struct file_operations vk_stat_fops = {
+	.open = stat_open,
+	.release = stat_release,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static void vkernel_destroy_vk_debugfs(struct vkernel *vk)
+{
+	if (IS_ERR(vk->debugfs_dentry))
+		return;
+
+	debugfs_remove_recursive(vk->debugfs_dentry);
+}
+
+static int vkernel_create_vk_debugfs(struct vkernel *vk, const char *name)
+{
+	static DEFINE_MUTEX(vkernel_debugfs_lock);
+	struct dentry *dent;
+
+	if (!debugfs_initialized())
+		return 0;
+
+	mutex_lock(&vkernel_debugfs_lock);
+	dent = debugfs_lookup(name, vkernel_debugfs_dir);
+	if (dent) {
+		pr_warn_ratelimited("vkernel: debugfs: duplicate directory %s\n", name);
+		dput(dent);
+		mutex_unlock(&vkernel_debugfs_lock);
+		return 0;
+	}
+
+	dent = debugfs_create_dir(name, vkernel_debugfs_dir);
+	mutex_unlock(&vkernel_debugfs_lock);
+	if (IS_ERR(dent))
+		return 0;
+
+	vk->debugfs_dentry = dent;
+
+	debugfs_create_file("stat", 0444, dent, vk, &vk_stat_fops);
+
+	return 0;
+}
+
+/* Tear down @vk: unregister, unlink, run the custom hook and free it. */
+void vkernel_destroy_vk(struct vkernel *vk)
+{
+	pr_info("vkernel: destroy vk %s\n", vk->name);
+
+	vk->active = false;
+	vkernel_unregister_vk(vk);
+
+	mutex_lock(&vk_lock);
+#ifdef CONFIG_DEBUG_LIST
+	list_del(&vk->link);
+#else
+	if (vk->link.prev)
+		list_del(&vk->link);
+#endif
+	mutex_unlock(&vk_lock);
+
+	if (vk->custom->pre_destroy)
+		vk->custom->pre_destroy(vk);
+	if (vk->custom->owner != vkernel_chardev_ops.owner)
+		module_put(vk->custom->owner);
+
+	vkernel_destroy_vk_debugfs(vk);
+
+	vk_uninit_acl(&vk->acl);
+	vk_uninit_syscall(&vk->syscall);
+	kfree(vk);
+	module_put(vkernel_chardev_ops.owner);
+}
+EXPORT_SYMBOL(vkernel_destroy_vk);
+
+/* Allocate and register a vk for @tsk; returns the vk or an ERR_PTR(). */
+struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name,
+		const char *custom)
+{
+	struct vkernel *vk;
+	int r = -ENOMEM;
+
+	vk = kzalloc(sizeof(struct vkernel), GFP_KERNEL);
+	if (!vk)
+		return ERR_PTR(-ENOMEM);
+
+	__module_get(vkernel_chardev_ops.owner);
+
+	/* Init basic info */
+	strscpy(vk->name, name, VKERNEL_NAME_LEN);
+	INIT_HLIST_NODE(&vk->hash);
+	vk->pid_ns = task_active_pid_ns(tsk);
+	vk->uts_ns = tsk->nsproxy->uts_ns;
+	vk->init_process = tsk;
+	vk->init_pid = tsk->pid;
+	refcount_set(&vk->users_count, 1);
+
+	/*
+	 * Force subsequent debugfs file creations to fail if the vk directory
+	 * is not created (by vkernel_create_vk_debugfs()).
+	 */
+	vk->debugfs_dentry = ERR_PTR(-ENOENT);
+
+	/* Init syscall */
+	r = vk_init_syscall(&vk->syscall);
+	if (r)
+		goto err_vk;
+	/* Init acl */
+	r = vk_init_acl(&vk->acl, VKERNEL_ACL_HASH_BITS);
+	if (r)
+		goto err_syscall;
+	/* Init linux cap */
+	vk->linux_cap.inheritable = tsk->cred->cap_inheritable;
+	vk->linux_cap.permitted = tsk->cred->cap_permitted;
+	vk->linux_cap.effective = tsk->cred->cap_effective;
+	vk->linux_cap.bset = tsk->cred->cap_bset;
+	vk->linux_cap.ambient = tsk->cred->cap_ambient;
+
+	/* Init default operations */
+	vk->ops.cap_capable = vk_cap_capable;
+	vk->ops.generic_permission = vk_generic_permission;
+
+	r = vkernel_create_vk_debugfs(vk, name);
+	if (r)
+		goto err_acl;
+
+	/* Custom initializations */
+	vk->custom = custom ? vkernel_find_custom(custom) : NULL;
+	if (!vk->custom)
+		vk->custom = &default_custom;
+	if (vk->custom->owner != vkernel_chardev_ops.owner)
+		__module_get(vk->custom->owner);
+	if (vk->custom->post_create) {
+		r = vk->custom->post_create(vk);
+		if (r)
+			goto err_custom_debugfs;
+	}
+
+	mutex_lock(&vk_lock);
+	list_add(&vk->link, &vk_list);
+	mutex_unlock(&vk_lock);
+
+	/* Register vk into kernel. It is inactive state. */
+	vkernel_register_vk(vk);
+
+	pr_info("vkernel: create vk %s, init %d, custom %s (expect %s)\n",
+		vk->name, vk->init_pid, vk->custom->name, custom);
+
+	return vk;
+
+err_custom_debugfs:
+	if (vk->custom->owner != vkernel_chardev_ops.owner)
+		module_put(vk->custom->owner);
+
+	vkernel_destroy_vk_debugfs(vk);
+err_acl:
+	vk_uninit_acl(&vk->acl);
+err_syscall:
+	vk_uninit_syscall(&vk->syscall);
+err_vk:
+	kfree(vk);
+	module_put(vkernel_chardev_ops.owner);
+
+	return ERR_PTR(r);
+}
+EXPORT_SYMBOL(vkernel_create_vk);
+
+void vkernel_get_vk(struct vkernel *vk)
+{
+	refcount_inc(&vk->users_count);
+}
+EXPORT_SYMBOL(vkernel_get_vk);
+
+/*
+ * Make sure the vk is not during destruction, which is a safe version of
+ * vkernel_get_vk(). Return true if vk referenced successfully, false otherwise.
+ */
+bool vkernel_get_vk_safe(struct vkernel *vk)
+{
+	return refcount_inc_not_zero(&vk->users_count);
+}
+EXPORT_SYMBOL(vkernel_get_vk_safe);
+
+void vkernel_put_vk(struct vkernel *vk)
+{
+	if (refcount_dec_and_test(&vk->users_count))
+		vkernel_destroy_vk(vk);
+}
+EXPORT_SYMBOL(vkernel_put_vk);
+
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner. In such cases, the caller is still actively using @vk and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void vkernel_put_vk_no_destroy(struct vkernel *vk)
+{
+	WARN_ON(refcount_dec_and_test(&vk->users_count));
+}
+EXPORT_SYMBOL(vkernel_put_vk_no_destroy);
+
+static int vkernel_vk_release(struct inode *inode, struct file *filp)
+{
+	struct vkernel *vk = filp->private_data;
+
+	pr_info("vkernel: release vk fd of %s. Currently, vk is still alive\n", vk->name);
+
+	// vkernel_put_vk(vk);
+	return 0;
+}
+
+/* ioctl handler of the per-vk anonymous file. */
+static long vkernel_vk_ioctl(struct file *filp,
+			     unsigned int ioctl, unsigned long arg)
+{
+	struct vkernel *vk = filp->private_data;
+	int r = 0;
+
+	switch (ioctl) {
+	case VKERNEL_SET_DEF_SYSCALL:
+		r = vkernel_vk_ioctl_set_def_syscall(vk, arg);
+		break;
+	case VKERNEL_RESTRICT_SYSCALL:
+		r = vkernel_vk_ioctl_restrict_syscall(vk, arg);
+		break;
+	case VKERNEL_RESTRICT_FILE:
+		r = vkernel_vk_ioctl_restrict_file(vk, arg);
+		break;
+	case VKERNEL_RESTRICT_LINUX_CAP:
+		r = vkernel_vk_ioctl_restrict_linux_cap(vk, arg);
+		break;
+	case VKERNEL_SET_CPU_PREF:
+	case VKERNEL_SET_MEMORY_PREF:
+	case VKERNEL_SET_SYSCTL_FS:
+	case VKERNEL_SET_SYSCTL_KERNEL:
+	case VKERNEL_SET_SYSCTL_NET:
+	case VKERNEL_SET_SYSCTL_VM:
+	case VKERNEL_CHECK_EXTENSION:
+	case VKERNEL_ENABLE_CAP:
+		r = -EOPNOTSUPP;
+		break;
+	case VKERNEL_REGISTER:
+		pr_warn("vkernel: [deprecated] register vk, init %d id %u ret %d\n",
+			vk->init_process->pid, vk->pid_ns->ns.inum, r);
+		break;
+	case VKERNEL_UNREGISTER:
+		pr_warn("vkernel: [deprecated] unregister vk, init %d id %u ret %d\n",
+			vk->init_process->pid, vk->pid_ns->ns.inum, r);
+		break;
+	case VKERNEL_ACTIVATE:
+		vk->active = true;
+		break;
+	case VKERNEL_DEACTIVATE:
+		vk->active = false;
+		break;
+	default:
+		r = vkernel_arch_vk_ioctl(filp, ioctl, arg);
+	}
+
+	return r;
+}
+
+#ifdef CONFIG_VKERNEL_COMPAT
+long __weak vkernel_arch_vk_compat_ioctl(struct file *filp, unsigned int ioctl,
+					 unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static long vkernel_vk_compat_ioctl(struct file *filp,
+				    unsigned int ioctl, unsigned long arg)
+{
+	int r;
+
+	r = vkernel_arch_vk_compat_ioctl(filp, ioctl, arg);
+	if (r != -ENOTTY)
+		return r;
+
+	return vkernel_vk_ioctl(filp, ioctl, arg);
+}
+#endif
+
+static const struct file_operations vkernel_vk_fops = {
+	.release = vkernel_vk_release,
+	.unlocked_ioctl = vkernel_vk_ioctl,
+	.llseek = noop_llseek,
+	VKERNEL_COMPAT(vkernel_vk_compat_ioctl),
+};
+
+/* VKERNEL_CREATE_VK: build a vk for desc.pid and return an fd bound to it. */
+static int vkernel_dev_ioctl_create_vk(unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_desc desc;
+	struct task_struct *tsk;
+	struct vkernel *vk;
+	struct file *file;
+	char fdname[ITOA_MAX_LEN * 2 + 2];
+	int r, fd;
+
+	if (copy_from_user(&desc, argp, sizeof(desc)))
+		return -EFAULT;
+
+	/*
+	 * pid_task() is only valid under RCU; pin the task so that it cannot
+	 * be freed while the vk is being set up.
+	 */
+	rcu_read_lock();
+	tsk = pid_task(find_pid_ns(desc.pid, &init_pid_ns), PIDTYPE_PID);
+	if (tsk)
+		get_task_struct(tsk);
+	rcu_read_unlock();
+	if (!tsk) {
+		pr_err("cannot find pid %d\n", desc.pid);
+		return -EINVAL;
+	}
+
+	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		pr_err("cannot get unused fd\n");
+		r = fd;
+		goto put_task;
+	}
+
+	snprintf(fdname, sizeof(fdname), "%d-%d", desc.pid, fd);
+
+	vk = vkernel_create_vk(tsk, fdname, desc.custom);
+	if (IS_ERR(vk)) {
+		r = PTR_ERR(vk);
+		goto put_fd;
+	}
+
+	file = anon_inode_getfile("vkernel-vk", &vkernel_vk_fops, vk, O_RDWR);
+	if (IS_ERR(file)) {
+		r = PTR_ERR(file);
+		goto put_kernel;
+	}
+
+	vkernel_uevent_notify_change(VKERNEL_EVENT_CREATE_VK, vk);
+
+	fd_install(fd, file);
+	put_task_struct(tsk);
+	return fd;
+
+put_kernel:
+	vkernel_put_vk(vk);
+put_fd:
+	put_unused_fd(fd);
+put_task:
+	put_task_struct(tsk);
+	return r;
+}
+
+static int vkernel_dev_ioctl_destroy_vk(unsigned long arg)
+{
+	struct vkernel *vk;
+	unsigned int id = (unsigned int)arg;
+
+	pr_info("vkernel: try to destroy vk with id %u\n", id);
+
+	vk = vkernel_find_vk_by_id(id);
+	if (!vk)
+		return -EINVAL;
+
+	vkernel_put_vk(vk);
+	return 0;
+}
+
+static long vkernel_dev_ioctl(struct file *filp,
+			      unsigned int ioctl, unsigned long arg)
+{
+	int r = -EINVAL;
+
+	switch (ioctl) {
+	case VKERNEL_GET_API_VERSION:
+		if (arg)
+			goto out;
+		r = VKERNEL_API_VERSION;
+		break;
+	case VKERNEL_CREATE_VK:
+		r = vkernel_dev_ioctl_create_vk(arg);
+		break;
+	case VKERNEL_DESTROY_VK:
+		r = vkernel_dev_ioctl_destroy_vk(arg);
+		break;
+	case VKERNEL_CHECK_EXTENSION:
+		r = -EOPNOTSUPP;
+		break;
+	case VKERNEL_TRACE_ENABLE:
+	case VKERNEL_TRACE_PAUSE:
+	case VKERNEL_TRACE_DISABLE:
+		r = -EOPNOTSUPP;
+		break;
+	default:
+		r = vkernel_arch_dev_ioctl(filp, ioctl, arg);
+	}
+out:
+	return r;
+}
+
+static const struct file_operations vkernel_chardev_ops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = vkernel_dev_ioctl,
+	.llseek = noop_llseek,
+	VKERNEL_COMPAT(vkernel_dev_ioctl),
+};
+
+static struct miscdevice vkernel_dev = {
+	VKERNEL_MINOR,
+	"vkernel",
+	&vkernel_chardev_ops,
+};
+
+static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk)
+{
+	struct kobj_uevent_env *env;
+	unsigned long long created, active;
+
+	if (!vkernel_dev.this_device || !vk)
+		return;
+
+	mutex_lock(&event_lock);
+	if (type == VKERNEL_EVENT_CREATE_VK) {
+		vkernel_createvk_count++;
+		vkernel_active_vks++;
+	} else if (type == VKERNEL_EVENT_DESTROY_VK) {
+		vkernel_active_vks--;
+	}
+	created = vkernel_createvk_count;
+	active = vkernel_active_vks;
+	mutex_unlock(&event_lock);
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
+	if (!env)
+		return;
+
+	add_uevent_var(env, "CREATED=%llu", created);
+	add_uevent_var(env, "COUNT=%llu", active);
+
+	if (type == VKERNEL_EVENT_CREATE_VK)
+		add_uevent_var(env, "EVENT=create");
+	else if (type == VKERNEL_EVENT_DESTROY_VK)
+		add_uevent_var(env, "EVENT=destroy");
+	/* ns.inum is unsigned; %d would print ids above INT_MAX as negative. */
+	add_uevent_var(env, "VKID=%u", vk->pid_ns->ns.inum);
+
+	if (!IS_ERR(vk->debugfs_dentry)) {
+		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+
+		if (p) {
+			tmp = dentry_path_raw(vk->debugfs_dentry, p, PATH_MAX);
+			if (!IS_ERR(tmp))
+				add_uevent_var(env, "STATS_PATH=%s", tmp);
+			kfree(p);
+		}
+	}
+	/* no need for checks, since we are adding at most only 5 keys */
+	env->envp[env->envp_idx++] = NULL;
+	kobject_uevent_env(&vkernel_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
+	kfree(env);
+}
+
+static int clear_zombie_vks(void)
+{
+	struct vkernel *vk;
+	struct vkernel *tmp;
+	struct task_struct *tsk;
+	int count = 0;
+
+	list_for_each_entry_safe(vk, tmp, &vk_list, link) {
+		/* pid_task() needs RCU; only the pointer value is used below. */
+		rcu_read_lock();
+		tsk = pid_task(find_pid_ns(vk->init_pid, &init_pid_ns), PIDTYPE_PID);
+		rcu_read_unlock();
+		if (tsk != vk->init_process) {
+			if (refcount_read(&vk->users_count) > 1)
+				pr_err("vkernel: BUG! zombie vk %s has other refs, init %d custom %s\n",
+				       vk->name, vk->init_pid, vk->custom->name);
+			vkernel_put_vk(vk);
+			count++;
+		}
+	}
+
+	return count;
+}
+
+static int clear_zombie_set(void *data, u64 val)
+{
+	int count;
+
+	count = clear_zombie_vks();
+	pr_info("cleared %d zombie vks\n", count);
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(clear_zombie_fops, NULL, clear_zombie_set,
+			 "%lld\n");
+
+static void vkernel_init_debug(void)
+{
+	vkernel_debugfs_dir = debugfs_create_dir("vkernel", NULL);
+
+	debugfs_create_file("clear_zombie", 0200, vkernel_debugfs_dir,
+			    NULL, &clear_zombie_fops);
+}
+
+/* Module entry point: bring up the helpers, debugfs and the misc device. */
+int vkernel_init(void)
+{
+	int ret;
+
+	ret = vk_kallsyms_init();
+	if (ret)
+		return ret;
+	ret = vk_cap_init();
+	if (ret)
+		goto err_kallsyms;
+	ret = vk_syscall_init();
+	if (ret)
+		goto err_cap;
+	ret = vk_acl_init();
+	if (ret)
+		goto err_syscall;
+
+	vkernel_init_debug();
+
+	ret = misc_register(&vkernel_dev);
+	if (ret) {
+		pr_err("vkernel: misc device register failed\n");
+		goto err_debug;
+	}
+
+	vkernel_register_custom(&default_custom);
+	vkernel_register_custom(&analysis_custom);
+	pr_info("vkernel: load vkernel\n");
+
+	return 0;
+
+	/* Undo the completed steps in reverse order of initialisation. */
+err_debug:
+	debugfs_remove_recursive(vkernel_debugfs_dir);
+	vk_acl_uninit();
+err_syscall:
+	vk_syscall_uninit();
+err_cap:
+	vk_cap_uninit();
+err_kallsyms:
+	vk_kallsyms_uninit();
+	return ret;
+}
+EXPORT_SYMBOL(vkernel_init);
+
+void vkernel_exit(void)
+{
+	clear_zombie_vks();
+
+	pr_info("vkernel: unlod vkernel\n");
+	vkernel_unregister_custom(&analysis_custom);
+	vkernel_unregister_custom(&default_custom);
+
+	misc_deregister(&vkernel_dev);
+
+	debugfs_remove_recursive(vkernel_debugfs_dir);
+
+	vk_acl_uninit();
+	vk_syscall_uninit();
+	vk_cap_uninit();
+	vk_kallsyms_uninit();
+}
+EXPORT_SYMBOL(vkernel_exit);
+
+module_init(vkernel_init);
+module_exit(vkernel_exit);
diff --git a/init/Kconfig b/init/Kconfig
index 279bee622a22..a89f2136dd99 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1384,6 +1384,16 @@ config VKERNEL
 	  for the containers, such as syscall isolation, file access
 	  protection, capability enhancement, and etc.
 
+config VKERNEL_DRIVER + tristate "Virtual Kernel driver" + depends on VKERNEL + default m + help + Enables vkernel kernel drivers. + This driver provides real policies for vkernel, such as syscall + rules, file rule, capability rule, and etc. Additional custom + module plugins can also be registered to this driver. + endif # NAMESPACES config NVIDIA_SMI_TRAP -- Gitee From c551b458d8b1dd4f03274404d5cd975d7683aeed Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 10/17] vk: implement vkernel data isolation in driver Driver supports kernel log isolation by assigning pid ns id to log ns id. Kmsg logs will be grouped and filtered by pid ns. Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/vkernel_main.c | 60 ++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index 5a57a54508b2..3871d8a926e0 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -72,6 +72,10 @@ static int vkernel_no_compat_open(struct inode *inode, struct file *file) #define VKERNEL_EVENT_CREATE_VK 0 #define VKERNEL_EVENT_DESTROY_VK 1 +#define VKERNEL_CAP_MASK ((1 << VKERNEL_CAP_ISOLATE_ANON) |\ + (1 << VKERNEL_CAP_ISOLATE_ANON_PIPE) | \ + (1 << VKERNEL_CAP_ISOLATE_RAMFS)) + static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk); static DEFINE_MUTEX(event_lock); static unsigned long long vkernel_createvk_count; @@ -222,6 +226,46 @@ static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long return vkernel_set_linux_cap(vk, &cap); } +static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg) +{ + int r = 0; + + switch (arg) { + case VKERNEL_CAP_ISOLATE_LOG: + r = 0; + break; + default: + r = -EOPNOTSUPP; + break; + } + + return r; +} + +static int vkernel_vk_ioctl_enable_cap(struct vkernel *vk, unsigned long arg) +{ + int r = 0; + + 
if (arg >= VKERNEL_CAP_NUM) + return -EINVAL; + + if (vk->caps & (arg << 1)) + return 0; + + switch (arg) { + case VKERNEL_CAP_ISOLATE_LOG: + vk->log_ns = vk->pid_ns->ns.inum; + break; + default: + r = -EOPNOTSUPP; + } + + if (!r) + vk->caps |= (1 << arg); + + return r; +} + static int stat_show(struct seq_file *m, void *v) { struct vkernel *vk = m->private; @@ -245,6 +289,10 @@ static int stat_show(struct seq_file *m, void *v) seq_printf(m, "Cap bset: 0x%llx\n", vk->linux_cap.bset.val); seq_printf(m, "Cap ambient: 0x%llx\n", vk->linux_cap.ambient.val); + seq_puts(m, "EXTENSION CAP\n"); + seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps); + seq_printf(m, "Log ns: %u\n", vk->log_ns); + seq_puts(m, "=== OPERATION ===\n"); seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); seq_printf(m, "Op generic_permission: %p\n", vk->ops.generic_permission); @@ -397,6 +445,10 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, vk->linux_cap.bset = tsk->cred->cap_bset; vk->linux_cap.ambient = tsk->cred->cap_ambient; + /* Init extension cap */ + vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); + vk->log_ns = vk->pid_ns->ns.inum; + /* Init default operations */ vk->ops.cap_capable = vk_cap_capable; vk->ops.generic_permission = vk_generic_permission; @@ -517,9 +569,13 @@ static long vkernel_vk_ioctl(struct file *filp, case VKERNEL_SET_SYSCTL_KERNEL: case VKERNEL_SET_SYSCTL_NET: case VKERNEL_SET_SYSCTL_VM: + r = -EOPNOTSUPP; + break; case VKERNEL_CHECK_EXTENSION: + r = vkernel_vk_ioctl_check_extension(vk, arg); + break; case VKERNEL_ENABLE_CAP: - r = -EOPNOTSUPP; + r = vkernel_vk_ioctl_enable_cap(vk, arg); break; case VKERNEL_REGISTER: pr_warn("vkernel: [deprecated] register vk, init %d id %u ret %d\n", @@ -655,7 +711,7 @@ static long vkernel_dev_ioctl(struct file *filp, r = vkernel_dev_ioctl_destroy_vk(arg); break; case VKERNEL_CHECK_EXTENSION: - r = -EOPNOTSUPP; + r = vkernel_vk_ioctl_check_extension(NULL, arg); break; case VKERNEL_TRACE_ENABLE: case 
VKERNEL_TRACE_PAUSE: -- Gitee From 34797720a4f421a29274240333674a5e4db66744 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 11/17] vk: implement fs sysctl customization The following fs sysctl items are supported: file-max file-nr inode-nr inode-state mount-max nr_open Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 1 + drivers/vkernel/include/sysctl.h | 11 +++++ drivers/vkernel/sysctl/fs.c | 73 ++++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 33 ++++++++++++++- 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 drivers/vkernel/include/sysctl.h create mode 100644 drivers/vkernel/sysctl/fs.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index fba34460dfb7..952c1827c149 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,4 +5,5 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o +vkernel-y += sysctl/fs.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h new file mode 100644 index 000000000000..942b580b4574 --- /dev/null +++ b/drivers/vkernel/include/sysctl.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SYSCTL_H +#define _VKERNEL_SYSCTL_H + +#include + +int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs); +void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs); + +#endif diff --git a/drivers/vkernel/sysctl/fs.c b/drivers/vkernel/sysctl/fs.c new file mode 100644 index 000000000000..d57ebae8cfb3 --- /dev/null +++ b/drivers/vkernel/sysctl/fs.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen + */ + +#include +#include + +#include "sysctl.h" + +int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs) +{ + unsigned long n; + unsigned long nr_pages = totalram_pages(); + unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, nr_pages - 1); + n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + fs->files_stat.max_files = max_t(unsigned long, n, NR_FILE); + fs->nr_open = 1024 * 1024; + if (percpu_counter_init(&fs->nr_files, 0, GFP_KERNEL)) { + pr_err("vkernel: failed to init sysctl_fs nr_files\n"); + return -ENOMEM; + } + + fs->nr_inodes = alloc_percpu_gfp(unsigned long, GFP_KERNEL); + if (!fs->nr_inodes) { + pr_err("vkernel: failed to alloc sysctl_fs nr_inodes\n"); + return -ENOMEM; + } + fs->nr_unused = alloc_percpu_gfp(unsigned long, GFP_KERNEL); + if (!fs->nr_unused) { + pr_err("vkernel: failed to alloc sysctl_fs nr_unused\n"); + return -ENOMEM; + } + + fs->leases_enable = 1; + fs->lease_break_time = 45; + + fs->mount_max = 100000; + + return 0; +} + +void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs) +{ + if (fs->nr_inodes) + free_percpu(fs->nr_inodes); + if (fs->nr_unused) + free_percpu(fs->nr_unused); + + percpu_counter_destroy(&fs->nr_files); +} + +int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc) +{ + if (desc->file_max) + fs->files_stat.max_files = desc->file_max; + if (desc->nr_open) + fs->nr_open = desc->nr_open; + + if (desc->leases_enable == 0 || desc->leases_enable == 1) + fs->leases_enable = desc->leases_enable; + if (desc->lease_break_time > 0) + fs->lease_break_time = desc->lease_break_time; + + if (desc->mount_max) + fs->mount_max = desc->mount_max; + + return 0; +} +EXPORT_SYMBOL(vkernel_set_sysctl_fs); diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index 3871d8a926e0..4afcdb844c99 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -23,6 +23,7 @@ #include "fs.h" 
#include "security.h" #include "syscall.h" +#include "sysctl.h" #include "utils.h" MODULE_AUTHOR("JYH Lab"); @@ -226,6 +227,17 @@ static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long return vkernel_set_linux_cap(vk, &cap); } +static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_fs_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_sysctl_fs(&vk->sysctl_fs, &desc); +} + static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg) { int r = 0; @@ -293,6 +305,13 @@ static int stat_show(struct seq_file *m, void *v) seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps); seq_printf(m, "Log ns: %u\n", vk->log_ns); + seq_puts(m, "=== SYSCTL ===\n"); + seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files); + seq_printf(m, "fs.nr_open=%u\n", vk->sysctl_fs.nr_open); + seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); + seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); + seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); + seq_puts(m, "=== OPERATION ===\n"); seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); seq_printf(m, "Op generic_permission: %p\n", vk->ops.generic_permission); @@ -396,6 +415,7 @@ void vkernel_destroy_vk(struct vkernel *vk) vkernel_destroy_vk_debugfs(vk); + vk_uninit_sysctl_fs(&vk->sysctl_fs); vk_uninit_acl(&vk->acl); vk_uninit_syscall(&vk->syscall); kfree(vk); @@ -449,13 +469,18 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); vk->log_ns = vk->pid_ns->ns.inum; + /* Init sysctl */ + r = vk_init_sysctl_fs(&vk->sysctl_fs); + if (r) + goto err_acl; + /* Init default operations */ vk->ops.cap_capable = vk_cap_capable; vk->ops.generic_permission = vk_generic_permission; r = vkernel_create_vk_debugfs(vk, name); if (r) - 
goto err_acl; + goto err_fs; /* Custom initializations */ vk->custom = vkernel_find_custom(custom); @@ -486,6 +511,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, module_put(vk->custom->owner); vkernel_destroy_vk_debugfs(vk); +err_fs: + vk_uninit_sysctl_fs(&vk->sysctl_fs); err_acl: vk_uninit_acl(&vk->acl); err_syscall: @@ -565,7 +592,11 @@ static long vkernel_vk_ioctl(struct file *filp, break; case VKERNEL_SET_CPU_PREF: case VKERNEL_SET_MEMORY_PREF: + r = -EOPNOTSUPP; + break; case VKERNEL_SET_SYSCTL_FS: + r = vkernel_vk_ioctl_set_sysctl_fs(vk, arg); + break; case VKERNEL_SET_SYSCTL_KERNEL: case VKERNEL_SET_SYSCTL_NET: case VKERNEL_SET_SYSCTL_VM: -- Gitee From 263c865479591af0802e11efc157cd36c2c77dd7 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 12/17] vk: implement kernel sysctl customization The following kernel sysctl items are supported: threads-max pty/{max, nr, reserve} The following are supported by ipc ns: msgmax, msgmnb, msgmni, msg_next_id sem, sem_next_id shmall, shmmax, shmmni, shm_next_id, shm_rmid_forced Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 2 +- drivers/vkernel/include/sysctl.h | 12 ++++ drivers/vkernel/sysctl/kernel.c | 112 +++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 92 ++++++++++++++++++++++++- 4 files changed, 216 insertions(+), 2 deletions(-) create mode 100644 drivers/vkernel/sysctl/kernel.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index 952c1827c149..31a5a77181d0 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,5 +5,5 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o -vkernel-y += sysctl/fs.o +vkernel-y += sysctl/fs.o sysctl/kernel.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h index 
942b580b4574..4a25e022022c 100644 --- a/drivers/vkernel/include/sysctl.h +++ b/drivers/vkernel/include/sysctl.h @@ -4,8 +4,20 @@ #define _VKERNEL_SYSCTL_H #include +#include + +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + +/* defined at kernel/fork.c */ +#define MIN_THREADS 20 +#define MAX_THREADS FUTEX_TID_MASK int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs); void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs); +int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k); +void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k); + #endif diff --git a/drivers/vkernel/sysctl/kernel.c b/drivers/vkernel/sysctl/kernel.c new file mode 100644 index 000000000000..5690e565a5b7 --- /dev/null +++ b/drivers/vkernel/sysctl/kernel.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include +#include +#include + +#include "sysctl.h" + +int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k) +{ + u64 threads; + unsigned long nr_pages = totalram_pages(); + + k->nb_mode = NUMA_BALANCING_DISABLED; + k->nb_promote_rate_limit = 65536; + + k->sched_cfs_bandwidth_slice = 5000UL; + k->sched_child_runs_first = 0; + + k->sched_dl_period_max = 1 << 22; /* ~4 seconds */ + k->sched_dl_period_min = 100; /* 100 us */ + + k->sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; + k->sched_rt_period = 1000000; + k->sched_rt_runtime = 950000; + + /* + * The number of threads shall be limited such that the thread + * structures may only consume a small part of the available memory. 
+ */ + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) + threads = MAX_THREADS; + else + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, + (u64) THREAD_SIZE * 8UL); + if (threads > MAX_THREADS) + threads = MAX_THREADS; + k->nr_threads = 0; + k->max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); + + k->key_gc_delay = 5 * 60; + k->persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ + k->key_quota_root_maxbytes = 25000000; + k->key_quota_root_maxkeys = 1000000; + k->key_quota_maxbytes = 20000; + k->key_quota_maxkeys = 200; + + k->pty_limit = NR_UNIX98_PTY_DEFAULT; + k->pty_reserve = NR_UNIX98_PTY_RESERVE; + k->pty_count = (atomic_t)ATOMIC_INIT(0); + + return 0; +} + +void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k) +{ + +} + +int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, + struct vkernel_sysctl_kernel_desc *desc) +{ + if (desc->numa_balancing >= 0) + k->nb_mode = desc->numa_balancing; + if (desc->numa_balancing_promote_rate_limit > 0) + k->nb_promote_rate_limit = desc->numa_balancing_promote_rate_limit; + + if (desc->sched_cfs_bandwidth_slice) + k->sched_cfs_bandwidth_slice = desc->sched_cfs_bandwidth_slice; + if (desc->sched_child_runs_first == 0 || desc->sched_child_runs_first == 1) + k->sched_child_runs_first = desc->sched_child_runs_first; + + if (desc->sched_dl_period_max) + k->sched_dl_period_max = desc->sched_dl_period_max; + if (desc->sched_dl_period_min) + k->sched_dl_period_min = desc->sched_dl_period_min; + + if (desc->sched_rr_timeslice > 0) + k->sched_rr_timeslice = desc->sched_rr_timeslice; + if (desc->sched_rt_period > 0) + k->sched_rt_period = desc->sched_rt_period; + if (desc->sched_rt_runtime > 0) + k->sched_rt_runtime = desc->sched_rt_runtime; + + if (desc->max_threads > 0) + k->max_threads = clamp_t(u64, desc->max_threads, MIN_THREADS, MAX_THREADS); + + if (desc->key_gc_delay) + k->key_gc_delay = desc->key_gc_delay; + if (desc->key_persistent_keyring_expiry) + 
k->persistent_keyring_expiry = desc->key_persistent_keyring_expiry; + if (desc->key_quota_root_maxbytes) + k->key_quota_root_maxbytes = desc->key_quota_root_maxbytes; + if (desc->key_quota_root_maxkeys) + k->key_quota_root_maxkeys = desc->key_quota_root_maxkeys; + if (desc->key_quota_maxbytes) + k->key_quota_maxbytes = desc->key_quota_maxbytes; + if (desc->key_quota_maxkeys) + k->key_quota_maxkeys = desc->key_quota_maxkeys; + + if (desc->pty_limit > 0) + k->pty_limit = desc->pty_limit; + if (desc->pty_reserve > 0) + k->pty_reserve = desc->pty_reserve; + + return 0; +} +EXPORT_SYMBOL(vkernel_set_sysctl_kernel); diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index 4afcdb844c99..c495e375f6f1 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -238,6 +238,58 @@ static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg) return vkernel_set_sysctl_fs(&vk->sysctl_fs, &desc); } +static int vkernel_vk_ioctl_set_sysctl_kernel(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_sysctl_kernel_desc desc; + struct ipc_namespace *ipc_ns; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + /* Handle namespace fields */ + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + if (likely(ipc_ns)) { + if (desc.msgmax) + ipc_ns->msg_ctlmax = desc.msgmax; + if (desc.msgmnb) + ipc_ns->msg_ctlmnb = desc.msgmnb; + if (desc.msgmni) + ipc_ns->msg_ctlmni = desc.msgmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.msg_next_id >= -1) + ipc_ns->ids[IPC_MSG_IDS].next_id = desc.msg_next_id; +#endif + if (desc.semmsl > 0) + ipc_ns->sem_ctls[0] = desc.semmsl; + if (desc.semmns > 0) + ipc_ns->sem_ctls[1] = desc.semmns; + if (desc.semopm > 0) + ipc_ns->sem_ctls[2] = desc.semopm; + if (desc.semmni > 0) + ipc_ns->sem_ctls[3] = desc.semmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.sem_next_id >= -1) + 
ipc_ns->ids[IPC_SEM_IDS].next_id = desc.sem_next_id; +#endif + if (desc.shmall) + ipc_ns->shm_ctlall = desc.shmall; + if (desc.shmmax) + ipc_ns->shm_ctlmax = desc.shmmax; + if (desc.shmmni) + ipc_ns->shm_ctlmni = desc.shmmni; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (desc.shm_next_id) + ipc_ns->ids[IPC_SHM_IDS].next_id = desc.shm_next_id; +#endif + if (desc.shm_rmid_forced == 0 || desc.shm_rmid_forced == 1) + ipc_ns->shm_rmid_forced = desc.shm_rmid_forced; + } + + return vkernel_set_sysctl_kernel(&vk->sysctl_kernel, &desc); +} + static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg) { int r = 0; @@ -311,6 +363,35 @@ static int stat_show(struct seq_file *m, void *v) seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); + seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode); + seq_printf(m, "kernel.numa_balancing_promote_rate_limit_MBps=%d\n", + vk->sysctl_kernel.nb_promote_rate_limit); + seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n", + vk->sysctl_kernel.sched_cfs_bandwidth_slice); + seq_printf(m, "kernel.sched_child_runs_first=%u\n", + vk->sysctl_kernel.sched_child_runs_first); + seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n", + vk->sysctl_kernel.sched_dl_period_max); + seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n", + vk->sysctl_kernel.sched_dl_period_min); + seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n", + vk->sysctl_kernel.sched_rr_timeslice); + seq_printf(m, "kernel.sched_rt_period_us=%d\n", + vk->sysctl_kernel.sched_rt_period); + seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", + vk->sysctl_kernel.sched_rt_runtime); + seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); + seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); + seq_printf(m, "kernel.keys.maxbytes=%u\n", 
vk->sysctl_kernel.key_quota_maxbytes); + seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); + seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", + vk->sysctl_kernel.persistent_keyring_expiry); + seq_printf(m, "kernel.keys.root_maxbytes=%u\n", + vk->sysctl_kernel.key_quota_root_maxbytes); + seq_printf(m, "kernel.keys.root_maxkeys=%u\n", + vk->sysctl_kernel.key_quota_root_maxkeys); + seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); + seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); seq_puts(m, "=== OPERATION ===\n"); seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); @@ -415,6 +496,7 @@ void vkernel_destroy_vk(struct vkernel *vk) vkernel_destroy_vk_debugfs(vk); + vk_uninit_sysctl_kernel(&vk->sysctl_kernel); vk_uninit_sysctl_fs(&vk->sysctl_fs); vk_uninit_acl(&vk->acl); vk_uninit_syscall(&vk->syscall); @@ -473,6 +555,10 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vk_init_sysctl_fs(&vk->sysctl_fs); if (r) goto err_acl; + r = vk_init_sysctl_kernel(&vk->sysctl_kernel); + if (r) + goto err_fs; + /* Init default operations */ vk->ops.cap_capable = vk_cap_capable; @@ -480,7 +566,7 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vkernel_create_vk_debugfs(vk, name); if (r) - goto err_fs; + goto err_kernel; /* Custom initializations */ vk->custom = vkernel_find_custom(custom); @@ -511,6 +597,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, module_put(vk->custom->owner); vkernel_destroy_vk_debugfs(vk); +err_kernel: + vk_uninit_sysctl_kernel(&vk->sysctl_kernel); err_fs: vk_uninit_sysctl_fs(&vk->sysctl_fs); err_acl: @@ -598,6 +686,8 @@ static long vkernel_vk_ioctl(struct file *filp, r = vkernel_vk_ioctl_set_sysctl_fs(vk, arg); break; case VKERNEL_SET_SYSCTL_KERNEL: + r = vkernel_vk_ioctl_set_sysctl_kernel(vk, arg); + break; case VKERNEL_SET_SYSCTL_NET: case VKERNEL_SET_SYSCTL_VM: r = 
-EOPNOTSUPP; -- Gitee From db83b6addce18ed608eef9c908c2b174ae0dd3c3 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 13/17] vk: implement net sysctl customization The kernel sysctl items defined in net ns are supported by net ns. Currently, other items are not supported yet. Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 2 +- drivers/vkernel/include/sysctl.h | 11 + drivers/vkernel/sysctl/net.c | 416 +++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 30 ++- 4 files changed, 456 insertions(+), 3 deletions(-) create mode 100644 drivers/vkernel/sysctl/net.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index 31a5a77181d0..e5aad2d7d09f 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,5 +5,5 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o -vkernel-y += sysctl/fs.o sysctl/kernel.o +vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h index 4a25e022022c..88cb25f38de9 100644 --- a/drivers/vkernel/include/sysctl.h +++ b/drivers/vkernel/include/sysctl.h @@ -20,4 +20,15 @@ void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs); int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k); void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k); +int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk); +void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net); + +extern int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name); + +int devconf_proc(struct net *net, struct ipv4_devconf *conf, + int val, int i, int type); +int devconf_forward(struct net *net, struct ipv4_devconf *conf, + int val, int i, int type); +int devconf_flush(struct net *net, struct ipv4_devconf *conf, + int 
val, int i, int type); #endif diff --git a/drivers/vkernel/sysctl/net.c b/drivers/vkernel/sysctl/net.c new file mode 100644 index 000000000000..deaff3d13d16 --- /dev/null +++ b/drivers/vkernel/sysctl/net.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include +#include + +#include "sysctl.h" +#include "utils.h" + +int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name); +void (*rt_cache_flush_ptr)(struct net *net); +void (*inet_netconf_notify_devconf_ptr)(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf); + +// extern unsigned int nf_conntrack_max; + +int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk) +{ + tcp_set_default_congestion_control_ptr = + (void *)lookup_name("tcp_set_default_congestion_control"); + rt_cache_flush_ptr = (void *)lookup_name("rt_cache_flush"); + inet_netconf_notify_devconf_ptr = + (void *)lookup_name("inet_netconf_notify_devconf"); + + /* congestion_control can be null */ + if (!rt_cache_flush_ptr || !inet_netconf_notify_devconf_ptr) { + pr_err("failed to find net symbols, flush: %p, notify: %p\n", + rt_cache_flush_ptr, inet_netconf_notify_devconf_ptr); + return -1; + } + + if (!tsk) { + pr_err("failed to init sysctl net with invalid task\n"); + return -1; + } + + // net->nf_conntrack_max = nf_conntrack_max; + net->nf_conntrack_max = 1572864; + + net->net_busy_poll = 0; + net->net_busy_read = 0; + + net->weight_p = 64; + net->dev_weight_rx_bias = 1; + net->dev_weight_tx_bias = 1; + net->dev_rx_weight = 64; + net->dev_tx_weight = 64; + + net->netdev_budget = 300; + net->netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; + net->netdev_max_backlog = 1000; + + net->optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV+512); + net->wmem_max = SK_WMEM_MAX; + net->rmem_max = SK_RMEM_MAX; + net->wmem_default = SK_WMEM_MAX; + net->rmem_default = SK_RMEM_MAX; + + net->net = ERR_PTR(-ESRCH); + 
rcu_read_lock(); + task_lock(tsk); + if (tsk->nsproxy) + net->net = get_net(tsk->nsproxy->net_ns); + task_unlock(tsk); + rcu_read_unlock(); + if (IS_ERR(net->net)) { + pr_err("failed to get net ns, error %ld\n", PTR_ERR(net->net)); + return -1; + } + + return 0; +} + +void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net) +{ + if (!IS_ERR(net->net)) + put_net(net->net); +} + +enum { + DEVCONF_ALL, + DEVCONF_DFLT, + DEVCONF_OTHER +}; + +#define IPV4_DEVCONF_DFLT(net, attr) \ + IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) + +static void devinet_copy_dflt_conf(struct net *net, int i) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct in_device *in_dev; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev && !test_bit(i, in_dev->cnf.state)) + in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; + } + rcu_read_unlock(); +} + +static void inet_forward_change(struct net *net) +{ + struct net_device *dev; + int on = IPV4_DEVCONF_ALL(net, FORWARDING); + + IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; + IPV4_DEVCONF_DFLT(net, FORWARDING) = on; + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all); + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt); + + for_each_netdev(net, dev) { + struct in_device *in_dev; + + if (on) + dev_disable_lro(dev); + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { + IN_DEV_CONF_SET(in_dev, FORWARDING, on); + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, + dev->ifindex, &in_dev->cnf); + } + } +} + +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ + struct in_device *idev; + + if (cnf == net->ipv4.devconf_dflt) + return NETCONFA_IFINDEX_DEFAULT; + else if (cnf == net->ipv4.devconf_all) + return NETCONFA_IFINDEX_ALL; + + idev = container_of(cnf, struct in_device, cnf); + return 
idev->dev->ifindex; +} + +int devconf_proc(struct net *net, struct ipv4_devconf *conf, + int val, int i, int type) +{ + int old_val; + int ifindex; + + old_val = conf->data[i - 1]; + conf->data[i - 1] = val; + + set_bit(i - 1, conf->state); + + if (type == DEVCONF_DFLT) + devinet_copy_dflt_conf(net, i - 1); // inline + if (i == IPV4_DEVCONF_ACCEPT_LOCAL || i == IPV4_DEVCONF_ROUTE_LOCALNET) + if (conf->data[i - 1] == 0 && old_val != 0) + rt_cache_flush_ptr(net); + + if (i == IPV4_DEVCONF_BC_FORWARDING && conf->data[i - 1] != old_val) + rt_cache_flush_ptr(net); + + if (i == IPV4_DEVCONF_RP_FILTER && conf->data[i - 1] != old_val) { + ifindex = devinet_conf_ifindex(net, conf); // inline + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_RP_FILTER, + ifindex, conf); + } + if (i == IPV4_DEVCONF_PROXY_ARP && conf->data[i - 1] != old_val) { + ifindex = devinet_conf_ifindex(net, conf); + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, + ifindex, conf); + } + if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN && conf->data[i - 1] != old_val) { + ifindex = devinet_conf_ifindex(net, conf); + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + ifindex, conf); + } + + return 0; +} + +int devconf_forward(struct net *net, struct ipv4_devconf *conf, + int val, int i, int type) +{ + int old_val; + + old_val = conf->data[i - 1]; + conf->data[i - 1] = val; + if (conf->data[i - 1] != old_val) { + if (type != DEVCONF_DFLT) { + if (!rtnl_trylock()) { + conf->data[i - 1] = old_val; + return -EBUSY; + } + if (type == DEVCONF_ALL) + inet_forward_change(net); // inline + else { + struct in_device *idev = + container_of(conf, struct in_device, cnf); + dev_disable_lro(idev->dev); + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, + idev->dev->ifindex, + conf); + } + } else + inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + 
conf);
+	}
+
+	return 0;
+}
+
+/*
+ * Store @val in devconf slot @i (1-based IPV4_DEVCONF_* index) of @conf and
+ * call rt_cache_flush_ptr() for @net when the stored value changed.
+ * @type is accepted for signature parity with the other devconf helpers and
+ * is not used here. Always returns 0.
+ */
+int devconf_flush(struct net *net, struct ipv4_devconf *conf,
+		  int val, int i, int type)
+{
+	int old_val;
+
+	old_val = conf->data[i - 1];
+	conf->data[i - 1] = val;
+	if (conf->data[i - 1] != old_val)
+		rt_cache_flush_ptr(net);
+
+	return 0;
+}
+
+/*
+ * Apply a user supplied network sysctl descriptor to a vkernel instance.
+ *
+ * @net:  per-instance network sysctl state; net->net is the network
+ *        namespace that receives the namespace-scoped values.
+ * @desc: requested values. Every field is validated on its own and a field
+ *        that fails its check is silently skipped, so one bad value does not
+ *        prevent the others from being applied. busy_poll/busy_read are the
+ *        exception: they are always copied, 0 included.
+ *
+ * Always returns 0.
+ */
+int vkernel_set_sysctl_net(struct vkernel_sysctl_net *net, struct vkernel_sysctl_net_desc *desc)
+{
+	struct net *n = net->net;
+	size_t ca_len;
+	int weight;
+	int val;
+	int i;
+
+	/* netns specific */
+	if (desc->nf_conntrack_max)
+		net->nf_conntrack_max = desc->nf_conntrack_max;
+
+	/* core, poll/select specific */
+	net->net_busy_poll = desc->core_busy_poll;
+	net->net_busy_read = desc->core_busy_read;
+
+	/* napi_struct specific */
+	if (desc->core_dev_weight > 0) {
+		net->weight_p = desc->core_dev_weight;
+		weight = READ_ONCE(net->weight_p);
+		WRITE_ONCE(net->dev_rx_weight, weight * net->dev_weight_rx_bias);
+		WRITE_ONCE(net->dev_tx_weight, weight * net->dev_weight_tx_bias);
+	}
+
+	/* softnet_data specific */
+	if (desc->core_netdev_budget > 0)
+		net->netdev_budget = desc->core_netdev_budget;
+	if (desc->core_netdev_budget_us > 0)
+		net->netdev_budget_usecs = desc->core_netdev_budget_us;
+	if (desc->core_netdev_max_backlog > 0)
+		net->netdev_max_backlog = desc->core_netdev_max_backlog;
+
+	/* sock specific (netns specific) */
+	if (desc->core_optmem_max > 0)
+		net->optmem_max = desc->core_optmem_max;
+	if (desc->core_wmem_max)
+		net->wmem_max = desc->core_wmem_max;
+	if (desc->core_rmem_max)
+		net->rmem_max = desc->core_rmem_max;
+	if (desc->core_wmem_default)
+		net->wmem_default = desc->core_wmem_default;
+	if (desc->core_rmem_default)
+		net->rmem_default = desc->core_rmem_default;
+
+	/* net ns specific */
+
+	/* core */
+	if (desc->core_somaxconn)
+		n->core.sysctl_somaxconn = desc->core_somaxconn;
+
+	/* ipv4 */
+	if (desc->ipv4_icmp_echo_ignore_broadcasts == 0 ||
+	    desc->ipv4_icmp_echo_ignore_broadcasts == 1)
+		n->ipv4.sysctl_icmp_echo_ignore_broadcasts = desc->ipv4_icmp_echo_ignore_broadcasts;
+	/* A usable range needs 0 < low <= high <= 65535 (largest port number). */
+	if (desc->ipv4_ip_local_port_range[0] > 0 &&
+	    desc->ipv4_ip_local_port_range[1] >= desc->ipv4_ip_local_port_range[0] &&
+	    desc->ipv4_ip_local_port_range[1] <= 65535) {
+		n->ipv4.ip_local_ports.range[0] = desc->ipv4_ip_local_port_range[0];
+		n->ipv4.ip_local_ports.range[1] = desc->ipv4_ip_local_port_range[1];
+	}
+	if (desc->ipv4_max_tw_buckets > 0)
+		n->ipv4.tcp_death_row.sysctl_max_tw_buckets = desc->ipv4_max_tw_buckets;
+	if (desc->ipv4_tcp_ecn <= 2)
+		n->ipv4.sysctl_tcp_ecn = desc->ipv4_tcp_ecn;
+	if (desc->ipv4_ip_default_ttl >= 1 && desc->ipv4_ip_default_ttl <= 255)
+		n->ipv4.sysctl_ip_default_ttl = desc->ipv4_ip_default_ttl;
+	if (desc->ipv4_ip_no_pmtu_disc == 0 || desc->ipv4_ip_no_pmtu_disc == 1)
+		n->ipv4.sysctl_ip_no_pmtu_disc = desc->ipv4_ip_no_pmtu_disc;
+	/* keepalive_time, keepalive_intvl and fin_timeout arrive in seconds, stored in jiffies */
+	if (desc->ipv4_tcp_keepalive_time > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, desc->ipv4_tcp_keepalive_time * HZ);
+	if (desc->ipv4_tcp_keepalive_intvl > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, desc->ipv4_tcp_keepalive_intvl * HZ);
+	if (desc->ipv4_tcp_keepalive_probes)
+		n->ipv4.sysctl_tcp_keepalive_probes = desc->ipv4_tcp_keepalive_probes;
+	if (desc->ipv4_tcp_syn_retries >= 1 && desc->ipv4_tcp_syn_retries <= MAX_TCP_SYNCNT)
+		n->ipv4.sysctl_tcp_syn_retries = desc->ipv4_tcp_syn_retries;
+	if (desc->ipv4_tcp_synack_retries)
+		n->ipv4.sysctl_tcp_synack_retries = desc->ipv4_tcp_synack_retries;
+	if (desc->ipv4_tcp_syncookies >= 0 && desc->ipv4_tcp_syncookies <= 2)
+		n->ipv4.sysctl_tcp_syncookies = desc->ipv4_tcp_syncookies;
+	if (desc->ipv4_tcp_reordering > 0)
+		n->ipv4.sysctl_tcp_reordering = desc->ipv4_tcp_reordering;
+	if (desc->ipv4_tcp_retries1 && desc->ipv4_tcp_retries1 <= 255)
+		n->ipv4.sysctl_tcp_retries1 = desc->ipv4_tcp_retries1;
+	if (desc->ipv4_tcp_retries2)
+		n->ipv4.sysctl_tcp_retries2 = desc->ipv4_tcp_retries2;
+	if (desc->ipv4_tcp_orphan_retries)
+		n->ipv4.sysctl_tcp_orphan_retries = desc->ipv4_tcp_orphan_retries;
+	if (desc->ipv4_tcp_tw_reuse >= 0 && desc->ipv4_tcp_tw_reuse <= 2)
+		n->ipv4.sysctl_tcp_tw_reuse = desc->ipv4_tcp_tw_reuse;
+	if (desc->ipv4_tcp_fin_timeout > 0)
+		WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, desc->ipv4_tcp_fin_timeout * HZ);
+	if (desc->ipv4_tcp_sack == 0 || desc->ipv4_tcp_sack == 1)
+		n->ipv4.sysctl_tcp_sack = desc->ipv4_tcp_sack;
+	if (desc->ipv4_tcp_window_scaling == 0 ||
+	    desc->ipv4_tcp_window_scaling == 1)
+		n->ipv4.sysctl_tcp_window_scaling = desc->ipv4_tcp_window_scaling;
+	if (desc->ipv4_tcp_timestamps == 0 || desc->ipv4_tcp_timestamps == 1)
+		n->ipv4.sysctl_tcp_timestamps = desc->ipv4_tcp_timestamps;
+	if (desc->ipv4_tcp_thin_linear_timeouts == 0 ||
+	    desc->ipv4_tcp_thin_linear_timeouts == 1)
+		n->ipv4.sysctl_tcp_thin_linear_timeouts = desc->ipv4_tcp_thin_linear_timeouts;
+	if (desc->ipv4_tcp_retrans_collapse == 0 ||
+	    desc->ipv4_tcp_retrans_collapse == 1)
+		n->ipv4.sysctl_tcp_retrans_collapse = desc->ipv4_tcp_retrans_collapse;
+	if (desc->ipv4_tcp_fack == 0 || desc->ipv4_tcp_fack == 1)
+		n->ipv4.sysctl_tcp_fack = desc->ipv4_tcp_fack;
+	if (desc->ipv4_tcp_adv_win_scale >= 0 && desc->ipv4_tcp_adv_win_scale <= 4)
+		n->ipv4.sysctl_tcp_adv_win_scale = desc->ipv4_tcp_adv_win_scale;
+	if (desc->ipv4_tcp_dsack == 0 || desc->ipv4_tcp_dsack == 1)
+		n->ipv4.sysctl_tcp_dsack = desc->ipv4_tcp_dsack;
+	if (desc->ipv4_tcp_nometrics_save == 0 || desc->ipv4_tcp_nometrics_save == 1)
+		n->ipv4.sysctl_tcp_nometrics_save = desc->ipv4_tcp_nometrics_save;
+	if (desc->ipv4_tcp_moderate_rcvbuf == 0 || desc->ipv4_tcp_moderate_rcvbuf == 1)
+		n->ipv4.sysctl_tcp_moderate_rcvbuf = desc->ipv4_tcp_moderate_rcvbuf;
+	if (desc->ipv4_tcp_min_tso_segs)
+		n->ipv4.sysctl_tcp_min_tso_segs = desc->ipv4_tcp_min_tso_segs;
+	if (desc->ipv4_tcp_wmem[0] > 0 && desc->ipv4_tcp_wmem[1] > 0 &&
+	    desc->ipv4_tcp_wmem[2] > 0) {
+		n->ipv4.sysctl_tcp_wmem[0] = desc->ipv4_tcp_wmem[0];
+		n->ipv4.sysctl_tcp_wmem[1] = desc->ipv4_tcp_wmem[1];
+		n->ipv4.sysctl_tcp_wmem[2] = desc->ipv4_tcp_wmem[2];
+	}
+	if (desc->ipv4_tcp_rmem[0] > 0 && desc->ipv4_tcp_rmem[1] > 0 &&
+	    desc->ipv4_tcp_rmem[2] > 0) {
+		n->ipv4.sysctl_tcp_rmem[0] = desc->ipv4_tcp_rmem[0];
+		n->ipv4.sysctl_tcp_rmem[1] = desc->ipv4_tcp_rmem[1];
+		n->ipv4.sysctl_tcp_rmem[2] = desc->ipv4_tcp_rmem[2];
+	}
+	if (desc->ipv4_max_syn_backlog > 0)
+		n->ipv4.sysctl_max_syn_backlog = desc->ipv4_max_syn_backlog;
+	if (desc->ipv4_tcp_fastopen == 1 || desc->ipv4_tcp_fastopen == 2 ||
+	    desc->ipv4_tcp_fastopen == 4)
+		n->ipv4.sysctl_tcp_fastopen = desc->ipv4_tcp_fastopen;
+	/*
+	 * The descriptor is filled by copy_from_user(), so the name is not
+	 * guaranteed to be NUL terminated: measure it with a bound and only
+	 * hand it on when a terminator was found inside the buffer.
+	 * NOTE(review): assumes ipv4_tcp_congestion_control is a char array
+	 * embedded in the descriptor (sizeof() is the buffer size) - confirm.
+	 */
+	ca_len = strnlen(desc->ipv4_tcp_congestion_control,
+			 sizeof(desc->ipv4_tcp_congestion_control));
+	if (tcp_set_default_congestion_control_ptr && ca_len > 1 &&
+	    ca_len < sizeof(desc->ipv4_tcp_congestion_control))
+		tcp_set_default_congestion_control_ptr(n, desc->ipv4_tcp_congestion_control);
+
+	/* ipv4 conf: a negative entry means "leave this knob unchanged" */
+	for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) {
+		val = desc->ipv4_conf_all[i - 1];
+		if (val < 0)
+			continue;
+
+		if (i == IPV4_DEVCONF_FORWARDING)
+			devconf_forward(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL);
+		else if (i == IPV4_DEVCONF_NOXFRM ||
+			 i == IPV4_DEVCONF_NOPOLICY ||
+			 i == IPV4_DEVCONF_PROMOTE_SECONDARIES ||
+			 i == IPV4_DEVCONF_ROUTE_LOCALNET ||
+			 i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
+			devconf_flush(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL);
+		else
+			devconf_proc(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL);
+	}
+	/*
+	 * ipv4 conf default: same "negative means unset" rule as conf all, so
+	 * that knobs taking values above 1 can be set here as well.
+	 */
+	for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) {
+		val = desc->ipv4_conf_default[i - 1];
+		if (val < 0)
+			continue;
+
+		if (i == IPV4_DEVCONF_FORWARDING)
+			devconf_forward(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+		else if (i == IPV4_DEVCONF_NOXFRM ||
+			 i == IPV4_DEVCONF_NOPOLICY ||
+			 i == IPV4_DEVCONF_PROMOTE_SECONDARIES ||
+			 i == IPV4_DEVCONF_ROUTE_LOCALNET ||
+			 i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
+			devconf_flush(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+		else
+			devconf_proc(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT);
+	}
+
+	/* unix */
+	if (desc->unix_max_dgram_qlen > 0)
+		n->unx.sysctl_max_dgram_qlen = desc->unix_max_dgram_qlen;
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_sysctl_net);
diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c
index c495e375f6f1..9b109ac0a482 100644
--- a/drivers/vkernel/vkernel_main.c
+++ b/drivers/vkernel/vkernel_main.c
@@ -290,6 +290,17 @@ static int vkernel_vk_ioctl_set_sysctl_kernel(struct vkernel *vk, unsigned long
 	return vkernel_set_sysctl_kernel(&vk->sysctl_kernel, &desc);
 }
 
+static int vkernel_vk_ioctl_set_sysctl_net(struct vkernel *vk, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_sysctl_net_desc desc;
+
+	if (copy_from_user(&desc, argp, sizeof(desc)))
+		return -EFAULT;
+
+	return vkernel_set_sysctl_net(&vk->sysctl_net, &desc);
+}
+
 static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg)
 {
 	int r = 0;
@@ -392,6 +403,14 @@ static int stat_show(struct seq_file *m, void *v)
 		   vk->sysctl_kernel.key_quota_root_maxkeys);
 	seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit);
 	seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve);
+	seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max);
+	seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll);
+	seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read);
+	seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max);
+	seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max);
+	seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max);
+	seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default);
+	seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default);
 
 	seq_puts(m, "=== OPERATION ===\n");
 	seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable);
@@ -496,6 +515,7 @@ void vkernel_destroy_vk(struct vkernel *vk)
 
 	vkernel_destroy_vk_debugfs(vk);
 
+	vk_uninit_sysctl_net(&vk->sysctl_net);
 	vk_uninit_sysctl_kernel(&vk->sysctl_kernel);
 	vk_uninit_sysctl_fs(&vk->sysctl_fs);
 	vk_uninit_acl(&vk->acl);
@@ -558,7 +578,9 @@ struct
vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vk_init_sysctl_kernel(&vk->sysctl_kernel); if (r) goto err_fs; - + r = vk_init_sysctl_net(&vk->sysctl_net, tsk); + if (r) + goto err_kernel; /* Init default operations */ vk->ops.cap_capable = vk_cap_capable; @@ -566,7 +588,7 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vkernel_create_vk_debugfs(vk, name); if (r) - goto err_kernel; + goto err_net; /* Custom initializations */ vk->custom = vkernel_find_custom(custom); @@ -597,6 +619,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, module_put(vk->custom->owner); vkernel_destroy_vk_debugfs(vk); +err_net: + vk_uninit_sysctl_net(&vk->sysctl_net); err_kernel: vk_uninit_sysctl_kernel(&vk->sysctl_kernel); err_fs: @@ -689,6 +713,8 @@ static long vkernel_vk_ioctl(struct file *filp, r = vkernel_vk_ioctl_set_sysctl_kernel(vk, arg); break; case VKERNEL_SET_SYSCTL_NET: + r = vkernel_vk_ioctl_set_sysctl_net(vk, arg); + break; case VKERNEL_SET_SYSCTL_VM: r = -EOPNOTSUPP; break; -- Gitee From 82c9a3dea98a781db7c8980dc5716654bb22c459 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 14/17] vk: implement vm sysctl customization The following vm sysctl items are supported: max_map_count mmap_min_addr overcommit_kbytes overcommit_memory overcommit_ratio Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 2 +- drivers/vkernel/include/sysctl.h | 6 ++ drivers/vkernel/sysctl/vm.c | 102 +++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 30 ++++++++- 4 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 drivers/vkernel/sysctl/vm.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index e5aad2d7d09f..e1d2463d3e2b 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,5 +5,5 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o 
syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o -vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o +vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h index 88cb25f38de9..af820bd4f17d 100644 --- a/drivers/vkernel/include/sysctl.h +++ b/drivers/vkernel/include/sysctl.h @@ -31,4 +31,10 @@ int devconf_forward(struct net *net, struct ipv4_devconf *conf, int val, int i, int type); int devconf_flush(struct net *net, struct ipv4_devconf *conf, int val, int i, int type); + +int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm); +void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm); + +void vk_sync_overcommit_as(struct vkernel *vk); + #endif diff --git a/drivers/vkernel/sysctl/vm.c b/drivers/vkernel/sysctl/vm.c new file mode 100644 index 000000000000..4b322b455da2 --- /dev/null +++ b/drivers/vkernel/sysctl/vm.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. 
+ * Author: Joy Allen
+ */
+
+#include
+#include
+#include <linux/mman.h>
+
+#include "sysctl.h"
+
+/*
+ * Batch size for the per-instance committed-memory counter: the larger of
+ * max(2 * present CPUs, 32) and 1/256 of the per-CPU share of RAM pages,
+ * the latter capped at INT_MAX.
+ */
+static s32 vk_mm_compute_batch(void)
+{
+	u64 memsized_batch;
+	s32 nr = num_present_cpus();
+	s32 batch = max_t(s32, nr * 2, 32);
+
+	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+	memsized_batch = min_t(u64, (totalram_pages() / nr) / 256, INT_MAX);
+
+	return max_t(s32, memsized_batch, batch);
+}
+
+/*
+ * Fold the per-CPU deltas of vk->sysctl_vm.vm_committed_as into the central
+ * count and zero the per-CPU slots, for every CPU that is online or dying.
+ * Runs under the counter's lock with interrupts disabled.
+ *
+ * NOTE(review): the slots of other CPUs are rewritten from the calling CPU;
+ * whether that can race with updates done on those CPUs depends on how the
+ * counter is modified elsewhere - confirm.
+ */
+void vk_sync_overcommit_as(struct vkernel *vk)
+{
+	struct percpu_counter *fbc = &vk->sysctl_vm.vm_committed_as;
+	unsigned long flags;
+	int cpu;
+	s32 *pcount;
+	s32 count;
+
+	raw_spin_lock_irqsave(&fbc->lock, flags);
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
+		pcount = per_cpu_ptr(fbc->counters, cpu);
+		count = *pcount;
+		fbc->count += count;
+		*pcount -= count;
+	}
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+
+/*
+ * Derive the effective mmap_min_addr from dac_mmap_min_addr: the larger of
+ * the two when CONFIG_LSM_MMAP_MIN_ADDR is configured, otherwise the DAC
+ * value itself.
+ */
+static void vk_update_mmap_min_addr(struct vkernel_sysctl_vm *vm)
+{
+#ifdef CONFIG_LSM_MMAP_MIN_ADDR
+	if (vm->dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR)
+		vm->mmap_min_addr = vm->dac_mmap_min_addr;
+	else
+		vm->mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR;
+#else
+	vm->mmap_min_addr = vm->dac_mmap_min_addr;
+#endif
+}
+
+/*
+ * Initialise the per-instance vm sysctl state with its defaults and allocate
+ * the committed-memory per-CPU counter.
+ *
+ * Returns 0 on success or -ENOMEM when the counter cannot be allocated.
+ */
+int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm)
+{
+	vm->max_map_count = DEFAULT_MAX_MAP_COUNT;
+	vm->dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+	vk_update_mmap_min_addr(vm);
+
+	vm->overcommit_memory = 0;
+	vm->overcommit_ratio = 50;
+	vm->overcommit_kbytes = 0;
+	vm->as_batch = vk_mm_compute_batch();
+	if (percpu_counter_init(&vm->vm_committed_as, 0, GFP_KERNEL)) {
+		pr_err("vkernel: failed to init sysctl_vm vm_committed_as\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Release the per-CPU counter allocated by vk_init_sysctl_vm(). */
+void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm)
+{
+	percpu_counter_destroy(&vm->vm_committed_as);
+}
+
+/*
+ * Apply a user supplied vm sysctl descriptor to a vkernel instance.
+ *
+ * A field that is zero (or otherwise fails its check) leaves the current
+ * setting untouched. overcommit_ratio and overcommit_kbytes are only looked
+ * at when a valid overcommit_memory mode is supplied; they are mutually
+ * exclusive, ratio taking precedence, and setting one clears the other.
+ *
+ * Always returns 0.
+ */
+int vkernel_set_sysctl_vm(struct vkernel_sysctl_vm *vm, struct vkernel_sysctl_vm_desc *desc)
+{
+	if (desc->max_map_count > 0)
+		vm->max_map_count = desc->max_map_count;
+
+	if (desc->mmap_min_addr) {
+		vm->dac_mmap_min_addr = desc->mmap_min_addr;
+		vk_update_mmap_min_addr(vm);
+	}
+
+	/* Reject modes above OVERCOMMIT_NEVER instead of storing them as-is. */
+	if (desc->overcommit_memory > 0 &&
+	    desc->overcommit_memory <= OVERCOMMIT_NEVER) {
+		vm->overcommit_memory = desc->overcommit_memory;
+		if (desc->overcommit_ratio > 0) {
+			vm->overcommit_ratio = desc->overcommit_ratio;
+			vm->overcommit_kbytes = 0;
+		} else if (desc->overcommit_kbytes) {
+			vm->overcommit_ratio = 0;
+			vm->overcommit_kbytes = desc->overcommit_kbytes;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vkernel_set_sysctl_vm);
diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c
index 9b109ac0a482..9f9babbf9ad0 100644
--- a/drivers/vkernel/vkernel_main.c
+++ b/drivers/vkernel/vkernel_main.c
@@ -301,6 +301,20 @@ static int vkernel_vk_ioctl_set_sysctl_net(struct vkernel *vk, unsigned long arg
 	return vkernel_set_sysctl_net(&vk->sysctl_net, &desc);
 }
 
+static int vkernel_vk_ioctl_set_sysctl_vm(struct vkernel *vk, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct vkernel_sysctl_vm_desc desc;
+
+	if (copy_from_user(&desc, argp, sizeof(desc)))
+		return -EFAULT;
+
+	if (desc.overcommit_memory == OVERCOMMIT_NEVER)
+		vk_sync_overcommit_as(vk);
+
+	return vkernel_set_sysctl_vm(&vk->sysctl_vm, &desc);
+}
+
 static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg)
 {
 	int r = 0;
@@ -411,6 +425,12 @@ static int stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max);
 	seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default);
 	seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default);
+	seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count);
+	seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr);
+	seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr);
+	seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes);
+	seq_printf(m,
"vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); + seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); seq_puts(m, "=== OPERATION ===\n"); seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); @@ -515,6 +535,7 @@ void vkernel_destroy_vk(struct vkernel *vk) vkernel_destroy_vk_debugfs(vk); + vk_uninit_sysctl_vm(&vk->sysctl_vm); vk_uninit_sysctl_net(&vk->sysctl_net); vk_uninit_sysctl_kernel(&vk->sysctl_kernel); vk_uninit_sysctl_fs(&vk->sysctl_fs); @@ -581,6 +602,9 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vk_init_sysctl_net(&vk->sysctl_net, tsk); if (r) goto err_kernel; + r = vk_init_sysctl_vm(&vk->sysctl_vm); + if (r) + goto err_net; /* Init default operations */ vk->ops.cap_capable = vk_cap_capable; @@ -588,7 +612,7 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vkernel_create_vk_debugfs(vk, name); if (r) - goto err_net; + goto err_vm; /* Custom initializations */ vk->custom = vkernel_find_custom(custom); @@ -619,6 +643,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, module_put(vk->custom->owner); vkernel_destroy_vk_debugfs(vk); +err_vm: + vk_uninit_sysctl_vm(&vk->sysctl_vm); err_net: vk_uninit_sysctl_net(&vk->sysctl_net); err_kernel: @@ -716,7 +742,7 @@ static long vkernel_vk_ioctl(struct file *filp, r = vkernel_vk_ioctl_set_sysctl_net(vk, arg); break; case VKERNEL_SET_SYSCTL_VM: - r = -EOPNOTSUPP; + r = vkernel_vk_ioctl_set_sysctl_vm(vk, arg); break; case VKERNEL_CHECK_EXTENSION: r = vkernel_vk_ioctl_check_extension(vk, arg); -- Gitee From 685d72eb6f00fde4623eec0d03e2af16301b06f0 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 15/17] vk: introduce raw sysctl interface Tuning sysctl items is inconvenient, since `/proc/sys/*` is mounted read-only in container. Thus, vkernel driver exports an interface named `sysctl` in debugfs for each container. 
Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 2 +- drivers/vkernel/include/sysctl.h | 13 + drivers/vkernel/sysctl/raw.c | 689 +++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 343 +++++++++++++++ 4 files changed, 1046 insertions(+), 1 deletion(-) create mode 100644 drivers/vkernel/sysctl/raw.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index e1d2463d3e2b..36c8de5e459b 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,5 +5,5 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o -vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o +vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o sysctl/raw.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h index af820bd4f17d..7520ee7cbf74 100644 --- a/drivers/vkernel/include/sysctl.h +++ b/drivers/vkernel/include/sysctl.h @@ -37,4 +37,17 @@ void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm); void vk_sync_overcommit_as(struct vkernel *vk); +int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf); + +/* Adapted from ipc/util.h; the upper bound is hard-coded to 1 << 15 here */ +static inline int sem_check_semmni(struct ipc_namespace *ns) +{ + /* + * Check that semmni is within [0, 1 << 15]; -ERANGE is returned otherwise. + * semmni is the last element of the sem_ctls[4] array + */ + return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > (1<<15))) + ? -ERANGE : 0; +} + #endif diff --git a/drivers/vkernel/sysctl/raw.c b/drivers/vkernel/sysctl/raw.c new file mode 100644 index 000000000000..2b67fa89370e --- /dev/null +++ b/drivers/vkernel/sysctl/raw.c @@ -0,0 +1,689 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd.
+ * Author: Joy Allen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sysctl.h" +#include "utils.h" + +enum { + DEVCONF_ALL, + DEVCONF_DFLT, + DEVCONF_OTHER +}; + +int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf) +{ + struct ipc_namespace *ipc_ns = NULL; + struct net *n; + char *name; + char *val; + char *p; + u64 uval; + s64 sval, old_sval, third_sval; + bool has_uval = false, has_sval = false; + + val = strchr(buf, '='); + if (!val) + return -EINVAL; + *val++ = 0; + name = strstrip(buf); + val = strstrip(val); + + if (!kstrtou64(val, 10, &uval)) + has_uval = true; + else + pr_warn("failed to parse raw sysctl val %s to u64\n", val); + if (!kstrtos64(val, 10, &sval)) + has_sval = true; + else + pr_warn("failed to parse raw sysctl val %s to s64\n", val); + + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + + n = vk->sysctl_net.net; + + if (!strcmp(name, "fs.file-max")) { + if (has_uval && uval) + vk->sysctl_fs.files_stat.max_files = uval; + } else if (!strcmp(name, "fs.nr_open")) { + if (has_uval && uval) + vk->sysctl_fs.nr_open = uval; + } else if (!strcmp(name, "fs.lease-break-time")) { + if (has_uval && sval > 0) + vk->sysctl_fs.leases_enable = sval; + } else if (!strcmp(name, "fs.leases-enable")) { + if (has_sval && (sval == 0 || sval == 1)) + vk->sysctl_fs.lease_break_time = sval; + } else if (!strcmp(name, "fs.mount-max")) { + if (has_uval && uval) + vk->sysctl_fs.mount_max = uval; + } else if (!strcmp(name, "kernel.msgmax")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmax = uval; + } else if (!strcmp(name, "kernel.msgmnb")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmnb = uval; + } else if (!strcmp(name, "kernel.msgmni")) { + if (has_uval && ipc_ns && uval) + ipc_ns->msg_ctlmni = uval; + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.msg_next_id")) { + if (has_sval && ipc_ns && sval >= -1) + 
ipc_ns->ids[IPC_MSG_IDS].next_id = sval; + } +#endif + else if (!strcmp(name, "kernel.sem")) { + if (ipc_ns) { + old_sval = ipc_ns->sem_ctls[3]; + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 4) { + if (!*p) + continue; + if (!kstrtos64(p, 10, &sval) && sval > 0) + ipc_ns->sem_ctls[uval] = sval; + uval++; + } + if (sem_check_semmni(ipc_ns)) + ipc_ns->sem_ctls[3] = old_sval; + } + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.sem_next_id")) { + if (has_sval && ipc_ns && sval >= -1) + ipc_ns->ids[IPC_SEM_IDS].next_id = sval; + } +#endif + else if (!strcmp(name, "kernel.shmall")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlall = uval; + } else if (!strcmp(name, "kernel.shmmax")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlmax = uval; + } else if (!strcmp(name, "kernel.shmmni")) { + if (has_uval && ipc_ns && uval) + ipc_ns->shm_ctlmni = uval; + } +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (!strcmp(name, "kernel.shm_next_id")) { + if (has_uval && ipc_ns && uval) + ipc_ns->ids[IPC_SHM_IDS].next_id = uval; + } +#endif + else if (!strcmp(name, "kernel.shm_rmid_forced")) { + if (has_sval && ipc_ns && (sval == 0 || sval == 1)) + ipc_ns->shm_rmid_forced = sval; + } else if (!strcmp(name, "kernel.numa_balancing")) { + /* inactive */ + if (has_sval && sval >= 0) + vk->sysctl_kernel.nb_mode = sval; + } else if (!strcmp(name, "kernel.numa_balancing_promote_rate_limit_MBps")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.nb_promote_rate_limit = sval; + } else if (!strcmp(name, "kernel.sched_cfs_bandwidth_slice_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_cfs_bandwidth_slice = uval; + } else if (!strcmp(name, "kernel.sched_child_runs_first")) { + if (has_uval && (uval == 0 || uval == 1)) + vk->sysctl_kernel.sched_child_runs_first = uval; + } else if (!strcmp(name, "kernel.sched_deadline_period_max_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_dl_period_max = uval; + } else if 
(!strcmp(name, "kernel.sched_deadline_period_min_us")) { + if (has_uval && uval) + vk->sysctl_kernel.sched_dl_period_min = uval; + } else if (!strcmp(name, "kernel.sched_rr_timeslice_ms")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rr_timeslice = sval; + } else if (!strcmp(name, "kernel.sched_rt_period_us")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rt_period = sval; + } else if (!strcmp(name, "kernel.sched_rt_runtime_us")) { + /* inactive */ + if (has_sval && sval > 0) + vk->sysctl_kernel.sched_rt_runtime = sval; + } else if (!strcmp(name, "kernel.threads-max")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.max_threads = clamp_t(u64, sval, + MIN_THREADS, MAX_THREADS); + } else if (!strcmp(name, "kernel.keys.gc_delay")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_gc_delay = uval; + } else if (!strcmp(name, "kernel.keys.maxbytes")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_maxbytes = uval; + } else if (!strcmp(name, "kernel.keys.maxkeys")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_maxkeys = uval; + } else if (!strcmp(name, "kernel.keys.persistent_keyring_expiry")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.persistent_keyring_expiry = uval; + } else if (!strcmp(name, "kernel.keys.root_maxbytes")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_root_maxbytes = uval; + } else if (!strcmp(name, "kernel.keys.root_maxkeys")) { + if (has_uval && uval > 0) + vk->sysctl_kernel.key_quota_root_maxkeys = uval; + } else if (!strcmp(name, "kernel.pty.max")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.pty_limit = sval; + } else if (!strcmp(name, "kernel.pty.reserve")) { + if (has_sval && sval > 0) + vk->sysctl_kernel.pty_reserve = sval; + } else if (!strcmp(name, "net.nf_conntrack_max")) { + if (has_uval && uval > 0) + vk->sysctl_net.nf_conntrack_max = uval; + } else if (!strcmp(name, "net.core.busy_poll")) { + if (has_uval) + 
vk->sysctl_net.net_busy_poll = uval; + } else if (!strcmp(name, "net.core.busy_read")) { + if (has_uval) + vk->sysctl_net.net_busy_read = uval; + } else if (!strcmp(name, "net.core.optmem_max")) { + if (has_sval && sval > 0) + vk->sysctl_net.optmem_max = sval; + } else if (!strcmp(name, "net.core.wmem_max")) { + if (has_uval && uval) + vk->sysctl_net.wmem_max = uval; + } else if (!strcmp(name, "net.core.rmem_max")) { + if (has_uval && uval) + vk->sysctl_net.rmem_max = uval; + } else if (!strcmp(name, "net.core.wmem_default")) { + if (has_uval && uval) + vk->sysctl_net.wmem_default = uval; + } else if (!strcmp(name, "net.core.rmem_default")) { + if (has_uval && uval) + vk->sysctl_net.rmem_default = uval; + } else if (!strcmp(name, "net.core.somaxconn")) { + if (has_uval && uval) + n->core.sysctl_somaxconn = uval; + } else if (!strcmp(name, "net.ipv4.icmp_echo_ignore_broadcasts")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_icmp_echo_ignore_broadcasts = uval; + } else if (!strcmp(name, "net.ipv4.ip_local_port_range")) { + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 2) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else { + if (kstrtos64(p, 10, &old_sval)) + old_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0) { + n->ipv4.ip_local_ports.range[0] = sval; + n->ipv4.ip_local_ports.range[1] = old_sval; + } + } else if (!strcmp(name, "net.ipv4.tcp_max_tw_buckets")) { + if (has_sval && sval > 0) + n->ipv4.tcp_death_row.sysctl_max_tw_buckets = sval; + } else if (!strcmp(name, "net.ipv4.tcp_ecn")) { + if (has_uval && uval <= 2) + n->ipv4.sysctl_tcp_ecn = uval; + } else if (!strcmp(name, "net.ipv4.ip_default_ttl")) { + if (has_uval && (uval >= 1 && uval <= 255)) + n->ipv4.sysctl_ip_default_ttl = uval; + } else if (!strcmp(name, "net.ipv4.ip_no_pmtu_disc")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_ip_no_pmtu_disc = uval; + } else if (!strcmp(name, 
"net.ipv4.tcp_keepalive_time")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_keepalive_intvl")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_keepalive_probes")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_keepalive_probes = uval; + } else if (!strcmp(name, "net.ipv4.tcp_syn_retries")) { + if (has_uval && uval >= 1 && uval <= MAX_TCP_SYNCNT) + n->ipv4.sysctl_tcp_syn_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_synack_retries")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_synack_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_syncookies")) { + if (has_uval && uval >= 0 && uval <= 2) + n->ipv4.sysctl_tcp_syncookies = uval; + } else if (!strcmp(name, "net.ipv4.tcp_reordering")) { + if (has_sval && sval > 0) + n->ipv4.sysctl_tcp_reordering = sval; + } else if (!strcmp(name, "net.ipv4.tcp_retries1")) { + if (has_uval && uval && uval <= 255) + n->ipv4.sysctl_tcp_retries1 = uval; + } else if (!strcmp(name, "net.ipv4.tcp_retries2")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_retries2 = uval; + } else if (!strcmp(name, "net.ipv4.tcp_orphan_retries")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_orphan_retries = uval; + } else if (!strcmp(name, "net.ipv4.tcp_tw_reuse")) { + if (has_uval && uval >= 0 && uval <= 2) + n->ipv4.sysctl_tcp_tw_reuse = uval; + } else if (!strcmp(name, "net.ipv4.tcp_fin_timeout")) { + if (has_sval && sval > 0) + WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, sval * HZ); + } else if (!strcmp(name, "net.ipv4.tcp_sack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_sack = uval; + } else if (!strcmp(name, "net.ipv4.tcp_window_scaling")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_window_scaling = uval; + } else if (!strcmp(name, "net.ipv4.tcp_timestamps")) { + if (has_uval && (uval == 0 || uval == 1)) + 
n->ipv4.sysctl_tcp_timestamps = uval; + } else if (!strcmp(name, "net.ipv4.tcp_thin_linear_timeouts")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_thin_linear_timeouts = uval; + } else if (!strcmp(name, "net.ipv4.tcp_retrans_collapse")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_retrans_collapse = uval; + } else if (!strcmp(name, "net.ipv4.tcp_fack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_fack = uval; + } else if (!strcmp(name, "net.ipv4.tcp_adv_win_scale")) { + if (has_sval && sval >= 0 && sval <= 4) + n->ipv4.sysctl_tcp_adv_win_scale = sval; + } else if (!strcmp(name, "net.ipv4.tcp_dsack")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_dsack = uval; // ? + } else if (!strcmp(name, "net.ipv4.tcp_nometrics_save")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_nometrics_save = uval; + } else if (!strcmp(name, "net.ipv4.tcp_moderate_rcvbuf")) { + if (has_uval && (uval == 0 || uval == 1)) + n->ipv4.sysctl_tcp_moderate_rcvbuf = uval; + } else if (!strcmp(name, "net.ipv4.tcp_min_tso_segs")) { + if (has_uval && uval) + n->ipv4.sysctl_tcp_min_tso_segs = uval; + } else if (!strcmp(name, "net.ipv4.tcp_wmem")) { + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 3) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else if (uval == 1) { + if (kstrtos64(p, 10, &old_sval)) + old_sval = 0; + } else { + if (kstrtos64(p, 10, &third_sval)) + third_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0 && third_sval > 0) { + n->ipv4.sysctl_tcp_wmem[0] = sval; + n->ipv4.sysctl_tcp_wmem[1] = old_sval; + n->ipv4.sysctl_tcp_wmem[2] = third_sval; + } + } else if (!strcmp(name, "net.ipv4.tcp_rmem")) { + uval = 0; + while ((p = strsep(&val, " \t")) != NULL && uval < 3) { + if (!*p) + continue; + if (uval == 0) { + if (kstrtos64(p, 10, &sval)) + sval = 0; + } else if (uval == 1) { + if (kstrtos64(p, 10, &old_sval)) + 
old_sval = 0; + } else { + if (kstrtos64(p, 10, &third_sval)) + third_sval = 0; + } + uval++; + } + if (sval > 0 && old_sval > 0 && third_sval > 0) { + n->ipv4.sysctl_tcp_rmem[0] = sval; + n->ipv4.sysctl_tcp_rmem[1] = old_sval; + n->ipv4.sysctl_tcp_rmem[2] = third_sval; + } + } else if (!strcmp(name, "net.ipv4.max_syn_backlog")) { + if (has_sval && sval > 0) + n->ipv4.sysctl_max_syn_backlog = sval; + } else if (!strcmp(name, "net.ipv4.tcp_fastopen")) { + if (has_sval && (sval == 1 || sval == 2 || sval == 4)) + n->ipv4.sysctl_tcp_fastopen = sval; + } else if (!strcmp(name, "net.ipv4.tcp_congestion_control")) { + if (strlen(val) > 1) + tcp_set_default_congestion_control_ptr(n, val); + } else if (!strcmp(name, "net.ipv4.conf.all.forwarding")) { + if (has_sval && sval >= 0) + devconf_forward(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.mc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_MC_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.proxy_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROXY_ARP, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.secure_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.send_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.shared_media")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_ALL); + } else if (!strcmp(name, 
"net.ipv4.conf.all.rp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_RP_FILTER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_source_route")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.bootp_relay")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.log_martians")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.tag")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_TAG, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARPFILTER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.medium_id")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_MEDIUM_ID, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.disable_xfrm")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_NOXFRM, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.disable_policy")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_NOPOLICY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.force_igmp_version")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_announce")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_ALL); + } else if (!strcmp(name, 
"net.ipv4.conf.all.arp_ignore")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_IGNORE, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.promote_secondaries")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_accept")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_notify")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.accept_local")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.src_valid_mark")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_SRC_VMARK, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.proxy_arp_pvlan")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.route_localnet")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.ignore_routes_with_linkdown")) { + if (has_sval && sval >= 0) + 
devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.drop_unicast_in_l2_multicast")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.drop_gratuitous_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.bc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_BC_FORWARDING, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.all.arp_evict_nocarrier")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_all, sval, + IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_ALL); + } else if (!strcmp(name, "net.ipv4.conf.default.forwarding")) { + if (has_sval && sval >= 0) + devconf_forward(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.mc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_MC_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROXY_ARP, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.secure_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.send_redirects")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + 
IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.shared_media")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.rp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_RP_FILTER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_source_route")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.bootp_relay")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.log_martians")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.tag")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_TAG, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_filter")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARPFILTER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.medium_id")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_MEDIUM_ID, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.disable_xfrm")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_NOXFRM, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.disable_policy")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_NOPOLICY, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.force_igmp_version")) { + if (has_sval && sval >= 0) + 
devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_announce")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_ignore")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_IGNORE, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.promote_secondaries")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_accept")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_notify")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.accept_local")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.src_valid_mark")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_SRC_VMARK, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp_pvlan")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.route_localnet")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + 
IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.igmpv3_unsolicited_report_interval")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.ignore_routes_with_linkdown")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.drop_unicast_in_l2_multicast")) { + if (has_sval && sval >= 0) + devconf_flush(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.drop_gratuitous_arp")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.bc_forwarding")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_BC_FORWARDING, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.conf.default.arp_evict_nocarrier")) { + if (has_sval && sval >= 0) + devconf_proc(n, n->ipv4.devconf_dflt, sval, + IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_DFLT); + } else if (!strcmp(name, "net.ipv4.unix_max_dgram_qlen")) { + if (has_sval && sval > 0) + n->unx.sysctl_max_dgram_qlen = sval; + } else if (!strcmp(name, "vm.max_map_count")) { + if (has_sval && sval > 0) + vk->sysctl_vm.max_map_count = sval; + } else if (!strcmp(name, "vm.mmap_min_addr")) { + if (!has_uval && kstrtou64(val, 16, &uval)) { + pr_warn("failed to parse raw sysctl val %s to u64\n", val); + return -EINVAL; + } + if (uval) { + vk->sysctl_vm.dac_mmap_min_addr = uval; +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (vk->sysctl_vm.dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr; + else + 
vk->sysctl_vm.mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr; +#endif + } + } else if (!strcmp(name, "vm.overcommit_kbytes")) { + if (has_uval && uval) { + vk->sysctl_vm.overcommit_kbytes = uval; + vk->sysctl_vm.overcommit_ratio = 0; + } + } else if (!strcmp(name, "vm.overcommit_memory")) { + if (has_sval && sval > 0) { + if (sval == OVERCOMMIT_NEVER) + vk_sync_overcommit_as(vk); + vk->sysctl_vm.overcommit_memory = sval; + } + } else if (!strcmp(name, "vm.overcommit_ratio")) { + if (has_sval && sval) { + vk->sysctl_vm.overcommit_ratio = sval; + vk->sysctl_vm.overcommit_kbytes = 0; + } + } else { + pr_err("vkernel: unsupported sysctl %s\n", name); + return -EINVAL; + } + + pr_debug("handled sysctl %s\n", name); + return 0; +} diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index 9f9babbf9ad0..bb99876c229d 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -475,6 +475,348 @@ static const struct file_operations vk_stat_fops = { .llseek = seq_lseek, }; +static int sysctl_show(struct seq_file *m, void *v) +{ + struct vkernel *vk = m->private; + struct ipc_namespace *ipc_ns = NULL; + struct net *n; + /* seq_file show callback: prints this instance's sysctl values as "name=value" lines under fs, kernel, net and vm headings and returns 0. The IPC entries are read from the init process's ipc namespace and are left out when that process has no nsproxy. */ + if (vk->init_process->nsproxy) + ipc_ns = vk->init_process->nsproxy->ipc_ns; + /* NOTE(review): n is dereferenced without a NULL check further down; presumably sysctl_net.net is always populated for a live instance - confirm. */ + n = vk->sysctl_net.net; + + seq_puts(m, "=== fs ===\n"); + seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files); + seq_printf(m, "fs.nr_open=%u\n", vk->sysctl_fs.nr_open); + seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); + seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); + seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); + + seq_puts(m, "=== kernel ===\n"); + if (ipc_ns) { + seq_printf(m, "kernel.msgmax=%u\n", ipc_ns->msg_ctlmax); + seq_printf(m, "kernel.msgmnb=%u\n", ipc_ns->msg_ctlmnb); + seq_printf(m, "kernel.msgmni=%u\n", ipc_ns->msg_ctlmni); +#ifdef CONFIG_CHECKPOINT_RESTORE +
seq_printf(m, "kernel.msg_next_id=%d\n", ipc_ns->ids[IPC_MSG_IDS].next_id); +#endif + seq_printf(m, "kernel.sem=%d %d %d\n", + ipc_ns->sem_ctls[0], ipc_ns->sem_ctls[1], ipc_ns->sem_ctls[2]); +#ifdef CONFIG_CHECKPOINT_RESTORE + seq_printf(m, "kernel.sem_next_id=%d\n", ipc_ns->ids[IPC_SEM_IDS].next_id); +#endif + seq_printf(m, "kernel.shmall=%lu\n", ipc_ns->shm_ctlall); + seq_printf(m, "kernel.shmmax=%lu\n", ipc_ns->shm_ctlmax); + seq_printf(m, "kernel.shmmni=%d\n", ipc_ns->shm_ctlmni); +#ifdef CONFIG_CHECKPOINT_RESTORE + seq_printf(m, "kernel.shm_next_id=%d\n", ipc_ns->ids[IPC_SHM_IDS].next_id); +#endif + seq_printf(m, "kernel.shm_rmid_forced=%d\n", ipc_ns->shm_rmid_forced); + } + seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode); + seq_printf(m, "kernel.numa_balancing_promote_rate_limit_MBps=%d\n", + vk->sysctl_kernel.nb_promote_rate_limit); + seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n", + vk->sysctl_kernel.sched_cfs_bandwidth_slice); + seq_printf(m, "kernel.sched_child_runs_first=%u\n", + vk->sysctl_kernel.sched_child_runs_first); + seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n", + vk->sysctl_kernel.sched_dl_period_max); + seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n", + vk->sysctl_kernel.sched_dl_period_min); + seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n", + vk->sysctl_kernel.sched_rr_timeslice); + seq_printf(m, "kernel.sched_rt_period_us=%d\n", + vk->sysctl_kernel.sched_rt_period); + seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", + vk->sysctl_kernel.sched_rt_runtime); + seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); + seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); + seq_printf(m, "kernel.keys.maxbytes=%u\n", vk->sysctl_kernel.key_quota_maxbytes); + seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); + seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", + vk->sysctl_kernel.persistent_keyring_expiry); + 
seq_printf(m, "kernel.keys.root_maxbytes=%u\n", + vk->sysctl_kernel.key_quota_root_maxbytes); + seq_printf(m, "kernel.keys.root_maxkeys=%u\n", + vk->sysctl_kernel.key_quota_root_maxkeys); + seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); + seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); + + seq_puts(m, "=== net ===\n"); + seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max); + seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll); + seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read); + seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max); + seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max); + seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max); + seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default); + seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default); + + seq_printf(m, "net.core.somaxconn=%d\n", n->core.sysctl_somaxconn); + seq_printf(m, "net.ipv4.icmp_echo_ignore_broadcasts=%u\n", + n->ipv4.sysctl_icmp_echo_ignore_broadcasts); + seq_printf(m, "net.ipv4.ip_local_port_range=%d %d\n", + n->ipv4.ip_local_ports.range[0], n->ipv4.ip_local_ports.range[1]); + seq_printf(m, "net.ipv4.tcp_max_tw_buckets=%d\n", + n->ipv4.tcp_death_row.sysctl_max_tw_buckets); + seq_printf(m, "net.ipv4.tcp_ecn=%u\n", n->ipv4.sysctl_tcp_ecn); + seq_printf(m, "net.ipv4.ip_default_ttl=%u\n", n->ipv4.sysctl_ip_default_ttl); + seq_printf(m, "net.ipv4.ip_no_pmtu_disc=%u\n", n->ipv4.sysctl_ip_no_pmtu_disc); + seq_printf(m, "net.ipv4.tcp_keepalive_time=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_keepalive_time) / HZ); + seq_printf(m, "net.ipv4.tcp_keepalive_intvl=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl) / HZ); + seq_printf(m, "net.ipv4.tcp_keepalive_probes=%u\n", + n->ipv4.sysctl_tcp_keepalive_probes); + seq_printf(m, "net.ipv4.tcp_syn_retries=%u\n", n->ipv4.sysctl_tcp_syn_retries); + seq_printf(m, 
"net.ipv4.tcp_synack_retries=%u\n", n->ipv4.sysctl_tcp_synack_retries); + seq_printf(m, "net.ipv4.tcp_syncookies=%u\n", n->ipv4.sysctl_tcp_syncookies); + seq_printf(m, "net.ipv4.tcp_reordering=%d\n", n->ipv4.sysctl_tcp_reordering); + seq_printf(m, "net.ipv4.tcp_retries1=%u\n", n->ipv4.sysctl_tcp_retries1); + seq_printf(m, "net.ipv4.tcp_retries2=%u\n", n->ipv4.sysctl_tcp_retries2); + seq_printf(m, "net.ipv4.tcp_orphan_retries=%u\n", n->ipv4.sysctl_tcp_orphan_retries); + seq_printf(m, "net.ipv4.tcp_tw_reuse=%u\n", n->ipv4.sysctl_tcp_tw_reuse); + seq_printf(m, "net.ipv4.tcp_fin_timeout=%d\n", + READ_ONCE(n->ipv4.sysctl_tcp_fin_timeout) / HZ); + seq_printf(m, "net.ipv4.tcp_sack=%u\n", n->ipv4.sysctl_tcp_sack); + seq_printf(m, "net.ipv4.tcp_window_scaling=%u\n", n->ipv4.sysctl_tcp_window_scaling); + seq_printf(m, "net.ipv4.tcp_timestamps=%u\n", n->ipv4.sysctl_tcp_timestamps); + seq_printf(m, "net.ipv4.tcp_thin_linear_timeouts=%u\n", + n->ipv4.sysctl_tcp_thin_linear_timeouts); + seq_printf(m, "net.ipv4.tcp_retrans_collapse=%u\n", n->ipv4.sysctl_tcp_retrans_collapse); + seq_printf(m, "net.ipv4.tcp_fack=%u\n", n->ipv4.sysctl_tcp_fack); + seq_printf(m, "net.ipv4.tcp_adv_win_scale=%d\n", n->ipv4.sysctl_tcp_adv_win_scale); + seq_printf(m, "net.ipv4.tcp_dsack=%u\n", n->ipv4.sysctl_tcp_dsack); + seq_printf(m, "net.ipv4.tcp_nometrics_save=%u\n", n->ipv4.sysctl_tcp_nometrics_save); + seq_printf(m, "net.ipv4.tcp_moderate_rcvbuf=%u\n", n->ipv4.sysctl_tcp_moderate_rcvbuf); + seq_printf(m, "net.ipv4.tcp_min_tso_segs=%u\n", n->ipv4.sysctl_tcp_min_tso_segs); + seq_printf(m, "net.ipv4.tcp_wmem=%d %d %d\n", + n->ipv4.sysctl_tcp_wmem[0], n->ipv4.sysctl_tcp_wmem[1], + n->ipv4.sysctl_tcp_wmem[2]); + seq_printf(m, "net.ipv4.tcp_rmem=%d %d %d\n", + n->ipv4.sysctl_tcp_rmem[0], n->ipv4.sysctl_tcp_rmem[1], + n->ipv4.sysctl_tcp_rmem[2]); + seq_printf(m, "net.ipv4.max_syn_backlog=%d\n", n->ipv4.sysctl_max_syn_backlog); + seq_printf(m, "net.ipv4.tcp_fastopen=%u\n", n->ipv4.sysctl_tcp_fastopen); + 
seq_printf(m, "net.ipv4.tcp_congestion_control=%s\n", + n->ipv4.tcp_congestion_control->name); + + seq_printf(m, "net.ipv4.conf.all.forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.all.mc_forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_MC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.all.proxy_arp=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_redirects=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.secure_redirects=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.send_redirects=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.all.shared_media=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); + seq_printf(m, "net.ipv4.conf.all.rp_filter=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_RP_FILTER - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_source_route=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); + seq_printf(m, "net.ipv4.conf.all.bootp_relay=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); + seq_printf(m, "net.ipv4.conf.all.log_martians=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); + seq_printf(m, "net.ipv4.conf.all.tag=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_TAG - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_filter=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARPFILTER - 1]); + seq_printf(m, "net.ipv4.conf.all.medium_id=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_MEDIUM_ID - 1]); + seq_printf(m, "net.ipv4.conf.all.disable_xfrm=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_NOXFRM - 1]); + seq_printf(m, "net.ipv4.conf.all.disable_policy=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_NOPOLICY - 1]); + seq_printf(m, "net.ipv4.conf.all.force_igmp_version=%d\n", + 
n->ipv4.devconf_all->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_announce=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_ignore=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_IGNORE - 1]); + seq_printf(m, "net.ipv4.conf.all.promote_secondaries=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_accept=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); + seq_printf(m, "net.ipv4.conf.all.arp_notify=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); + seq_printf(m, "net.ipv4.conf.all.accept_local=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); + seq_printf(m, "net.ipv4.conf.all.src_valid_mark=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_SRC_VMARK - 1]); + seq_printf(m, "net.ipv4.conf.all.proxy_arp_pvlan=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); + seq_printf(m, "net.ipv4.conf.all.route_localnet=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); + seq_printf(m, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.all.ignore_routes_with_linkdown=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); + seq_printf(m, "net.ipv4.conf.all.drop_unicast_in_l2_multicast=%d\n", + n->ipv4.devconf_all->data[ + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); + seq_printf(m, "net.ipv4.conf.all.drop_gratuitous_arp=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); + seq_printf(m, "net.ipv4.conf.all.bc_forwarding=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_BC_FORWARDING - 1]); + 
seq_printf(m, "net.ipv4.conf.all.arp_evict_nocarrier=%d\n", + n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); + + seq_printf(m, "net.ipv4.conf.default.forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.mc_forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.proxy_arp=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_redirects=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.secure_redirects=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.send_redirects=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); + seq_printf(m, "net.ipv4.conf.default.shared_media=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); + seq_printf(m, "net.ipv4.conf.default.rp_filter=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_RP_FILTER - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_source_route=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); + seq_printf(m, "net.ipv4.conf.default.bootp_relay=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); + seq_printf(m, "net.ipv4.conf.default.log_martians=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); + seq_printf(m, "net.ipv4.conf.default.tag=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_TAG - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_filter=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARPFILTER - 1]); + seq_printf(m, "net.ipv4.conf.default.medium_id=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MEDIUM_ID - 1]); + seq_printf(m, "net.ipv4.conf.default.disable_xfrm=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOXFRM - 1]); + seq_printf(m, "net.ipv4.conf.default.disable_policy=%d\n", + 
n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOPOLICY - 1]); + seq_printf(m, "net.ipv4.conf.default.force_igmp_version=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_announce=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_ignore=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_IGNORE - 1]); + seq_printf(m, "net.ipv4.conf.default.promote_secondaries=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_accept=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_notify=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); + seq_printf(m, "net.ipv4.conf.default.accept_local=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); + seq_printf(m, "net.ipv4.conf.default.src_valid_mark=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SRC_VMARK - 1]); + seq_printf(m, "net.ipv4.conf.default.proxy_arp_pvlan=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); + seq_printf(m, "net.ipv4.conf.default.route_localnet=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); + seq_printf(m, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.default.igmpv3_unsolicited_report_interval=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); + seq_printf(m, "net.ipv4.conf.default.ignore_routes_with_linkdown=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); + seq_printf(m, "net.ipv4.conf.default.drop_unicast_in_l2_multicast=%d\n", + n->ipv4.devconf_dflt->data[ + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); + seq_printf(m, 
"net.ipv4.conf.default.drop_gratuitous_arp=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); + seq_printf(m, "net.ipv4.conf.default.bc_forwarding=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BC_FORWARDING - 1]); + seq_printf(m, "net.ipv4.conf.default.arp_evict_nocarrier=%d\n", + n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); + + seq_puts(m, "=== vm ===\n"); + seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count); + seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr); + seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr); + seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes); + seq_printf(m, "vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); + seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); + + return 0; +} +/* Open handler: takes a reference on the vkernel instance stored in the inode (-ENOENT if that fails), then sets up the single-record seq_file; the reference is dropped again when single_open() fails. */ +static int sysctl_open(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + int r; + + if (!vkernel_get_vk_safe(vk)) + return -ENOENT; + + r = single_open(file, sysctl_show, inode->i_private); + if (r < 0) + vkernel_put_vk(vk); + + return r; +} +/* Release handler: drops the reference taken in sysctl_open() and tears down the seq_file. */ +static int sysctl_release(struct inode *inode, struct file *file) +{ + struct vkernel *vk = inode->i_private; + + vkernel_put_vk(vk); + + return single_release(inode, file); +} +/* Write handler: copies at most sizeof(buf) - 1 bytes of a raw sysctl string from user space, NUL-terminates it and passes it to vkernel_set_sysctl_raw(). Returns the number of bytes consumed, -EFAULT on a failed user copy, or the non-zero status of vkernel_set_sysctl_raw(). ppos is not used. */ +static ssize_t +sysctl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode; + struct vkernel *vk; + char buf[256]; + int ret; /* signed on purpose: holds 0 or a negative status, which an unsigned size_t cannot represent */ + + inode = file_inode(filp); + vk = inode->i_private; + /* Clamp the length so one byte always remains for the terminator; longer input is truncated, as before. */ + if (cnt > sizeof(buf) - 1) + cnt = sizeof(buf) - 1; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + pr_debug("sysctl write, vk %s, buf %s\n", vk->name, buf); + + ret = vkernel_set_sysctl_raw(vk, buf); + if (ret) + return ret; + + return cnt; +} + +static const struct file_operations vk_sysctl_fops = { + .open = sysctl_open, + .release = sysctl_release, + .read = seq_read, + .write =
sysctl_write, + .llseek = seq_lseek, +}; + static void vkernel_destroy_vk_debugfs(struct vkernel *vk) { if (IS_ERR(vk->debugfs_dentry)) @@ -508,6 +850,7 @@ static int vkernel_create_vk_debugfs(struct vkernel *vk, const char *name) vk->debugfs_dentry = dent; debugfs_create_file("stat", 0444, dent, vk, &vk_stat_fops); + debugfs_create_file("sysctl", 0644, dent, vk, &vk_sysctl_fops); return 0; } -- Gitee From 7b6fd2c219024309d0eb04d849e1ad0456736458 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 16/17] vk: implement cpu policy customization Currently, the cpu policy does not take effect because of a lack of target real-world workloads. Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 1 + drivers/vkernel/include/sched.h | 11 ++++++++++ drivers/vkernel/sched/cpu.c | 36 +++++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 29 +++++++++++++++++++++++++- 4 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 drivers/vkernel/include/sched.h create mode 100644 drivers/vkernel/sched/cpu.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index 36c8de5e459b..76dacd808640 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -5,5 +5,6 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o vkernel-y += security/capability.o +vkernel-y += sched/cpu.o vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o sysctl/raw.o vkernel-y += utils/kallsyms.o diff --git a/drivers/vkernel/include/sched.h b/drivers/vkernel/include/sched.h new file mode 100644 index 000000000000..c273e3dea619 --- /dev/null +++ b/drivers/vkernel/include/sched.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_SCHED_H +#define _VKERNEL_SCHED_H + +#include + +int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu); +void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu); + +#endif diff --git
a/drivers/vkernel/sched/cpu.c b/drivers/vkernel/sched/cpu.c new file mode 100644 index 000000000000..16af7da43480 --- /dev/null +++ b/drivers/vkernel/sched/cpu.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include "sched.h" +/* Reset a CPU preference to its defaults: policy SCHED_NORMAL, rr_timeslice_us and wakeup_gran_us both 0. Always returns 0. */ +int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu) +{ + cpu->policy = SCHED_NORMAL; + cpu->rr_timeslice_us = 0; + cpu->wakeup_gran_us = 0; + + return 0; +} +/* Counterpart of vk_init_cpu_pref(); currently a no-op. */ +void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu) +{ +/* Nothing to undo: vk_init_cpu_pref() only assigns plain fields. */ +} +/* Apply a CPU descriptor to vk->cpu_pref. A field is copied only when it passes its sign test (policy >= 0, rr_timeslice_us > 0, wakeup_gran_us > 0); otherwise the stored value is kept. Always returns 0. NOTE(review): policy has no upper-bound check here - confirm whether callers validate it. */ +int vkernel_set_cpu_pref(struct vkernel *vk, struct vkernel_cpu_desc *desc) +{ + if (desc->policy >= 0) + vk->cpu_pref.policy = desc->policy; + + if (desc->rr_timeslice_us > 0) + vk->cpu_pref.rr_timeslice_us = desc->rr_timeslice_us; + + if (desc->wakeup_gran_us > 0) + vk->cpu_pref.wakeup_gran_us = desc->wakeup_gran_us; + + return 0; +} +EXPORT_SYMBOL(vkernel_set_cpu_pref); diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index bb99876c229d..1a2f8588233b 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -21,6 +21,7 @@ #include #include "fs.h" +#include "sched.h" #include "security.h" #include "syscall.h" #include "sysctl.h" @@ -227,6 +228,17 @@ static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long return vkernel_set_linux_cap(vk, &cap); } +static int vkernel_vk_ioctl_set_cpu(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct vkernel_cpu_desc desc; + /* ioctl helper: copy the descriptor from user space (-EFAULT if the copy fails), then hand it to vkernel_set_cpu_pref() and return its result. */ + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; + + return vkernel_set_cpu_pref(vk, &desc); +} + static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -378,6 +390,11 @@ static int stat_show(struct seq_file *m, void *v) seq_printf(m, "Cap bset: 0x%llx\n", vk->linux_cap.bset.val); seq_printf(m, "Cap ambient: 0x%llx\n", vk->linux_cap.ambient.val); + seq_puts(m, "=== RESOURCE ===\n"); +
seq_printf(m, "Cpu policy: %d\n", vk->cpu_pref.policy); + seq_printf(m, "Cpu rr timeslice: %lu\n", vk->cpu_pref.rr_timeslice_us); + seq_printf(m, "Cpu wakeup gran: %lu\n", vk->cpu_pref.wakeup_gran_us); + seq_puts(m, "EXTENSION CAP\n"); seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps); seq_printf(m, "Log ns: %u\n", vk->log_ns); @@ -882,6 +899,7 @@ void vkernel_destroy_vk(struct vkernel *vk) vk_uninit_sysctl_net(&vk->sysctl_net); vk_uninit_sysctl_kernel(&vk->sysctl_kernel); vk_uninit_sysctl_fs(&vk->sysctl_fs); + vk_uninit_cpu_pref(&vk->cpu_pref); vk_uninit_acl(&vk->acl); vk_uninit_syscall(&vk->syscall); kfree(vk); @@ -931,6 +949,11 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, vk->linux_cap.bset = tsk->cred->cap_bset; vk->linux_cap.ambient = tsk->cred->cap_ambient; + /* Init cpu preference */ + r = vk_init_cpu_pref(&vk->cpu_pref); + if (r) + goto err_acl; + /* Init extension cap */ vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); vk->log_ns = vk->pid_ns->ns.inum; @@ -938,7 +961,7 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, /* Init sysctl */ r = vk_init_sysctl_fs(&vk->sysctl_fs); if (r) - goto err_acl; + goto err_cpu; r = vk_init_sysctl_kernel(&vk->sysctl_kernel); if (r) goto err_fs; @@ -994,6 +1017,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, vk_uninit_sysctl_kernel(&vk->sysctl_kernel); err_fs: vk_uninit_sysctl_fs(&vk->sysctl_fs); +err_cpu: + vk_uninit_cpu_pref(&vk->cpu_pref); err_acl: vk_uninit_acl(&vk->acl); err_syscall: @@ -1072,6 +1097,8 @@ static long vkernel_vk_ioctl(struct file *filp, r = vkernel_vk_ioctl_restrict_linux_cap(vk, arg); break; case VKERNEL_SET_CPU_PREF: + r = vkernel_vk_ioctl_set_cpu(vk, arg); + break; case VKERNEL_SET_MEMORY_PREF: r = -EOPNOTSUPP; break; -- Gitee From c353897a5615b97fb2be9a97d87ae4d15555b274 Mon Sep 17 00:00:00 2001 From: Hang Huang Date: Sun, 28 Jun 2026 20:13:55 +0800 Subject: [PATCH 17/17] vk: implement mem policy 
customization The following sysfs items in `/sys/kernel/mm/` are supported: transparent_hugepage/{enabled, defrag, use_zero_page} Signed-off-by: jiangnan Signed-off-by: Hang Huang --- drivers/vkernel/Makefile | 1 + drivers/vkernel/include/mm.h | 31 +++++++++++++++ drivers/vkernel/mm/mm.c | 72 ++++++++++++++++++++++++++++++++++ drivers/vkernel/vkernel_main.c | 26 +++++++++++- 4 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 drivers/vkernel/include/mm.h create mode 100644 drivers/vkernel/mm/mm.c diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile index 76dacd808640..1fd838c21967 100644 --- a/drivers/vkernel/Makefile +++ b/drivers/vkernel/Makefile @@ -4,6 +4,7 @@ ccflags-y := -I$(srctree)/drivers/vkernel/include vkernel-y := vkernel_main.o syscall.o vkernel-y += fs/acl.o +vkernel-y += mm/mm.o vkernel-y += security/capability.o vkernel-y += sched/cpu.o vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o sysctl/raw.o diff --git a/drivers/vkernel/include/mm.h b/drivers/vkernel/include/mm.h new file mode 100644 index 000000000000..c2fdf89ba8eb --- /dev/null +++ b/drivers/vkernel/include/mm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _VKERNEL_MM_H +#define _VKERNEL_MM_H + +#include + +/* Copied from mm/shmem.c */ + +#define SHMEM_HUGE_NEVER 0 +#define SHMEM_HUGE_ALWAYS 1 +#define SHMEM_HUGE_WITHIN_SIZE 2 +#define SHMEM_HUGE_ADVISE 3 + +/* + * Special values. 
+ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +int vk_init_memory_pref(struct vkernel_mem_pref *mem); +void vk_uninit_memory_pref(struct vkernel_mem_pref *mem); + +#endif diff --git a/drivers/vkernel/mm/mm.c b/drivers/vkernel/mm/mm.c new file mode 100644 index 000000000000..6eee69ccf25a --- /dev/null +++ b/drivers/vkernel/mm/mm.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Ltd. + * Author: Joy Allen + */ + +#include "mm.h" + +int vk_init_memory_pref(struct vkernel_mem_pref *mem) +{ + mem->default_policy.refcnt = (atomic_t)ATOMIC_INIT(1); + mem->default_policy.mode = MPOL_LOCAL; + + mem->shmem_huge = SHMEM_HUGE_NEVER; + mem->thp_flags = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<thp_flags; + + if (desc->numa_mode >= 0 && desc->numa_mode < MPOL_MAX) { /* NOTE(review): a valid mode is only logged; nothing is stored */ + /* TODO: Setup all fields */ + // mem->default_policy.mode = desc->numa_mode; + pr_info("set default numa policy is not supported yet\n"); + } + + if (desc->shmem_enabled >= SHMEM_HUGE_FORCE && + desc->shmem_enabled <= SHMEM_HUGE_ADVISE) /* SHMEM_HUGE_FORCE (-2) .. SHMEM_HUGE_ADVISE (3); other values are ignored */ + mem->shmem_huge = desc->shmem_enabled; + + if (desc->thp_enabled > -1 && + desc->thp_enabled < TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flags); /* drop both "enabled" bits, then set at most one */ + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags); + if (desc->thp_enabled > TRANSPARENT_HUGEPAGE_UNSUPPORTED) + set_bit(desc->thp_enabled, flags); + } + + if (desc->thp_defrag > -1 && + desc->thp_defrag <= TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) { /* stop at the last defrag mode bit cleared below so thp_defrag cannot set unrelated flag bits */ + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags); + 
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags); + if (desc->thp_defrag > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) /* lower values leave all defrag mode bits cleared */ + set_bit(desc->thp_defrag, flags); + } + + if (desc->thp_use_zero_page == 0) /* 0 = off, 1 = on, any other value leaves the bit untouched */ + clear_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); + else if (desc->thp_use_zero_page == 1) + set_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); + + return 0; /* skipped fields are not reported as errors */ +} diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c index 1a2f8588233b..32e6c0a428bb 100644 --- a/drivers/vkernel/vkernel_main.c +++ b/drivers/vkernel/vkernel_main.c @@ -21,6 +21,7 @@ #include #include "fs.h" +#include "mm.h" #include "sched.h" #include "security.h" #include "syscall.h" @@ -239,6 +240,17 @@ static int vkernel_vk_ioctl_set_cpu(struct vkernel *vk, unsigned long arg) return vkernel_set_cpu_pref(vk, &desc); } +static int vkernel_vk_ioctl_set_memory(struct vkernel *vk, unsigned long arg) +{ + void __user *argp = (void __user *)arg; /* ioctl arg is a user pointer to the descriptor */ + struct vkernel_mem_desc desc; + + if (copy_from_user(&desc, argp, sizeof(desc))) + return -EFAULT; /* descriptor could not be copied in */ + + return vkernel_set_memory_pref(&vk->mem_pref, &desc); +} + static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -394,6 +406,9 @@ static int stat_show(struct seq_file *m, void *v) seq_printf(m, "Cpu policy: %d\n", vk->cpu_pref.policy); seq_printf(m, "Cpu rr timeslice: %lu\n", vk->cpu_pref.rr_timeslice_us); seq_printf(m, "Cpu wakeup gran: %lu\n", vk->cpu_pref.wakeup_gran_us); + seq_printf(m, "Mem def policy: %u\n", vk->mem_pref.default_policy.mode); + seq_printf(m, "Mem shmem huge: %d\n", vk->mem_pref.shmem_huge); + seq_printf(m, "Mem thp flags: 0x%lx\n", vk->mem_pref.thp_flags); seq_puts(m, "EXTENSION CAP\n"); seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps); @@ -899,6 +914,7 @@ void vkernel_destroy_vk(struct vkernel *vk) vk_uninit_sysctl_net(&vk->sysctl_net); vk_uninit_sysctl_kernel(&vk->sysctl_kernel); vk_uninit_sysctl_fs(&vk->sysctl_fs); + 
vk_uninit_memory_pref(&vk->mem_pref); vk_uninit_cpu_pref(&vk->cpu_pref); vk_uninit_acl(&vk->acl); vk_uninit_syscall(&vk->syscall); @@ -953,6 +969,10 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, r = vk_init_cpu_pref(&vk->cpu_pref); if (r) goto err_acl; + /* Init memory preference */ + r = vk_init_memory_pref(&vk->mem_pref); + if (r) + goto err_cpu; /* Init extension cap */ vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); @@ -961,7 +981,7 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, /* Init sysctl */ r = vk_init_sysctl_fs(&vk->sysctl_fs); if (r) - goto err_cpu; + goto err_mem; r = vk_init_sysctl_kernel(&vk->sysctl_kernel); if (r) goto err_fs; @@ -1017,6 +1037,8 @@ struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, vk_uninit_sysctl_kernel(&vk->sysctl_kernel); err_fs: vk_uninit_sysctl_fs(&vk->sysctl_fs); +err_mem: + vk_uninit_memory_pref(&vk->mem_pref); err_cpu: vk_uninit_cpu_pref(&vk->cpu_pref); err_acl: @@ -1100,7 +1122,7 @@ static long vkernel_vk_ioctl(struct file *filp, r = vkernel_vk_ioctl_set_cpu(vk, arg); break; case VKERNEL_SET_MEMORY_PREF: - r = -EOPNOTSUPP; + r = vkernel_vk_ioctl_set_memory(vk, arg); break; case VKERNEL_SET_SYSCTL_FS: r = vkernel_vk_ioctl_set_sysctl_fs(vk, arg); -- Gitee