diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cf3807641d89f18b821223067e4031c56f1cbca9..fe53741cde9f6f5aa7413f243d1e92b750f7fb99 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7595,6 +7595,14 @@ Kernel parameters
 			See Documentation/admin-guide/mm/transhuge.rst
 			for more details.
 
+	thp_file=	[KNL]
+			Format: <size>[KMG]:always|always+exec|never
+			Can be used to control the default behavior of the
+			system with respect to file-backed transparent
+			hugepages. Can be used multiple times for multiple
+			file-backed THP sizes. See
+			Documentation/admin-guide/mm/transhuge.rst for more
+			details.
+
 	threadirqs	[KNL,EARLY]
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5fbc3d89bb0731996df610554a7c0d08b8bf2554..2c6e6305161d09c317ed51f138487d12de7b45c9 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -337,6 +337,33 @@ that THP is shared. Exceeding the number would block the collapse::
 
 A higher value may increase memory footprint for some workloads.
 
+File-Backed Hugepages
+---------------------
+
+The kernel will automatically select an appropriate THP size for file-backed
+memory from a set of allowed sizes. By default all THP sizes that the page
+cache supports are allowed, but this set can be modified with one of::
+
+  echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled
+  echo always+exec >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled
+  echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled
+
+where ``<size>`` is the hugepage size being addressed, the available sizes for
+which vary by system. ``always`` adds the hugepage size to the set of allowed
+sizes, and ``never`` removes the hugepage size from the set of allowed sizes.
+
+``always+exec`` acts like ``always`` but additionally marks the hugepage size as
+the preferred hugepage size for sections of any file mapped executable. At most
+one hugepage size can be marked ``always+exec`` at a time, so applying it to a
+new size implicitly removes it from the size it was previously set for.
+
+In some situations, constraining the allowed sizes can reduce memory
+fragmentation, resulting in fewer allocation fallbacks and improved system
+performance.
+
+Note that any changes to the allowed set of sizes apply only to future
+file-backed THP allocations.
+
 Boot parameters
 ===============
 
@@ -403,6 +430,19 @@ user, the PMD_ORDER hugepage policy will be overridden. If the policy for
 PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will default
 to ``never``.
 
+Each supported file-backed THP size can be controlled by passing
+``thp_file=<size>[KMG]:<policy>``, where ``<size>`` is the THP size and
+``<policy>`` is one of ``always``, ``always+exec`` or ``never``.
+
+For example, the following will set 64K THP to ``always+exec``::
+
+	thp_file=64K:always+exec
+
+``thp_file=`` may be specified multiple times to configure all THP sizes as
+required. If ``thp_file=`` is specified at least once, any file-backed THP
+sizes not explicitly configured on the command line are implicitly set to
+``never``.
+
 Hugepages in tmpfs/shmem
 ========================
 
@@ -706,6 +746,10 @@ nr_anon_partially_mapped
 	an anonymous THP as "partially mapped" and count it here, even though it
 	is not actually partially mapped anymore.
 
+file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
 As the system ages, allocating huge pages may be expensive as the system
 uses memory compaction to copy data around memory to free a huge
 page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 16a56b6b3f6ca199dca3af5d23d7ec3d9eb9712c..4717f75416b3d9ba0b6b94d084e14c08f95d8c3d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1045,6 +1045,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		unsigned long k, vaddr;
 		unsigned long total_size = 0;
 		unsigned long alignment;
+		int exec_order = file_exec_order();
 
 		if (elf_ppnt->p_type != PT_LOAD)
 			continue;
@@ -1183,6 +1184,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			 * is then page aligned.
 			 */
 			load_bias = ELF_PAGESTART(load_bias - vaddr);
+			if (exec_order > 0 && interpreter &&
+			    total_size >= (PAGE_SIZE << exec_order))
+				load_bias &= ~((PAGE_SIZE << exec_order) - 1);
 		}
 
 		error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff351bdb6b5747ad6e60459916dfd94a..4050b8bfaf2a412cecbaefba75482d8d70b5f5d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -104,6 +104,24 @@ enum tva_type {
 #define thp_vma_allowable_order(vma, vm_flags, type, order) \
 	(!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order)))
 
+static inline int lowest_order(unsigned long orders)
+{
+	if (orders)
+		return __ffs(orders);
+	return -1;
+}
+
+static inline int highest_order(unsigned long orders)
+{
+	return fls_long(orders) - 1;
+}
+
+static inline int next_order(unsigned long *orders, int prev)
+{
+	*orders &= ~BIT(prev);
+	return highest_order(*orders);
+}
+
 #define split_folio(f) split_folio_to_list(f, NULL)
 
 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
@@ -142,6 +160,7 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
 	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+	MTHP_STAT_FILE_ALLOC,
 	__MTHP_STAT_COUNT
 };
 
@@ -181,6 +200,18 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_file_orders_always;
+extern int huge_file_exec_order;
+
+static inline unsigned long file_orders_always(void)
+{
+	return READ_ONCE(huge_file_orders_always);
+}
+
+static inline int file_exec_order(void)
+{
+	return READ_ONCE(huge_file_exec_order);
+}
 
 static inline bool hugepage_global_enabled(void)
 {
@@ -195,17 +226,6 @@ static inline bool hugepage_global_always(void)
 {
 	return transparent_hugepage_flags &
 			(1<<TRANSPARENT_HUGEPAGE_FLAG);
 }
 
-static inline int highest_order(unsigned long orders)
-{
-	return fls_long(orders) - 1;
-}
-
-static inline int next_order(unsigned long *orders, int prev)
-{
-	*orders &= ~BIT(prev);
-	return highest_order(*orders);
-}
-
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ ... @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
 		if (order > mapping_max_folio_order(mapping))
 			order = mapping_max_folio_order(mapping);
+
+		orders = file_orders_always() | BIT(0);
+		orders &= BIT(order + 1) - 1;
 		/* If we're not aligned, allocate a smaller folio */
 		if (index & ((1UL << order) - 1))
-			order = __ffs(index);
+			orders &= BIT(__ffs(index) + 1) - 1;
+		order = highest_order(orders);
 
-		do {
+		while (orders) {
 			gfp_t alloc_gfp = gfp;
 
 			err = -ENOMEM;
@@ -2011,7 +2026,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
 			alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
 			folio = filemap_alloc_folio(alloc_gfp, order, policy);
 			if (!folio)
-				continue;
+				goto try_next;
 
 			/* Init accessed so avoid atomic mark_page_accessed later */
 			if (fgp_flags & FGP_ACCESSED)
@@ -2024,7 +2039,12 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
 				break;
 			folio_put(folio);
 			folio = NULL;
-		} while (order-- > min_order);
+
+try_next:
+			if (order <= min_order)
+				break;
+			order = next_order(&orders, order);
+		}
 
 		if (err == -EEXIST)
 			goto repeat;
@@ -3382,9 +3402,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long start = vma->vm_pgoff;
 	unsigned long end = start + vma_pages(vma);
+	int exec_order = file_exec_order();
 	unsigned long ra_end;
 
-	ra->order = exec_folio_order();
+	/* If an explicit order is set for exec mappings, use it. */
+	ra->order = exec_order >= 0 ? exec_order : exec_folio_order();
 	ra->start = round_down(vmf->pgoff, 1UL << ra->order);
 	ra->start = max(ra->start, start);
 	ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019b75a4ad140c6ac16d993544fc85298..94c69255bea35e82dba23ea204b7c5fccb4e98c7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -80,7 +80,10 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_file_orders_always __read_mostly;
+int huge_file_exec_order __read_mostly = -1;
 static bool anon_orders_configured __initdata;
+static bool file_orders_configured;
 
 static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
@@ -550,6 +553,7 @@ static const struct attribute_group hugepage_attr_group = {
 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
 static void thpsize_release(struct kobject *kobj);
 static DEFINE_SPINLOCK(huge_anon_orders_lock);
+static DEFINE_SPINLOCK(huge_file_orders_lock);
 static LIST_HEAD(thpsize_list);
 
 static ssize_t anon_enabled_show(struct kobject *kobj,
@@ -620,6 +624,51 @@ static ssize_t anon_enabled_store(struct kobject *kobj,
 	return count;
 }
 
+static ssize_t file_enabled_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+	const char *output;
+	bool exec;
+
+	if (test_bit(order, &huge_file_orders_always)) {
+		exec = READ_ONCE(huge_file_exec_order) == order;
+		output = exec ? "always [always+exec] never" :
"always [always+exec] never" : + "[always] always+exec never"; + } else { + output = "always always+exec [never]"; + } + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t file_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + spin_lock(&huge_file_orders_lock); + + if (sysfs_streq(buf, "always")) { + set_bit(order, &huge_file_orders_always); + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else if (sysfs_streq(buf, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + } else if (sysfs_streq(buf, "never")) { + clear_bit(order, &huge_file_orders_always); + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else { + ret = -EINVAL; + } + + spin_unlock(&huge_file_orders_lock); + return ret; +} + static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); @@ -632,7 +681,11 @@ static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; +static struct kobj_attribute file_enabled_attr = + __ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store); + static struct attribute *file_ctrl_attrs[] = { + &file_enabled_attr.attr, #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif @@ -701,6 +754,7 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -726,6 +780,7 @@ static struct attribute_group anon_stats_attr_grp = { }; static struct attribute *file_stats_attrs[] = { + &file_alloc_attr.attr, #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, @@ -851,6 +906,16 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); + /* + * For pagecache, default to enabling all orders. powerpc's PMD_ORDER + * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time + * constant so we have to do this here. 
+	 */
+	if (!file_orders_configured) {
+		huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT;
+		file_orders_configured = true;
+	}
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -1134,6 +1199,87 @@ static int __init setup_thp_anon(char *str)
 }
 __setup("thp_anon=", setup_thp_anon);
 
+static int __init setup_thp_file(char *str)
+{
+	char *token, *range, *policy, *subtoken;
+	unsigned long always;
+	char *start_size, *end_size;
+	int start, end, nr, exec;
+	char *p;
+
+	if (!str || strlen(str) + 1 > PAGE_SIZE)
+		goto err;
+	strcpy(str_dup, str);
+
+	always = huge_file_orders_always;
+	exec = huge_file_exec_order;
+	p = str_dup;
+	while ((token = strsep(&p, ";")) != NULL) {
+		range = strsep(&token, ":");
+		policy = token;
+
+		if (!policy)
+			goto err;
+
+		while ((subtoken = strsep(&range, ",")) != NULL) {
+			if (strchr(subtoken, '-')) {
+				start_size = strsep(&subtoken, "-");
+				end_size = subtoken;
+
+				start = get_order_from_str(start_size,
+						THP_ORDERS_ALL_FILE_DEFAULT);
+				end = get_order_from_str(end_size,
+						THP_ORDERS_ALL_FILE_DEFAULT);
+			} else {
+				start_size = end_size = subtoken;
+				start = end = get_order_from_str(subtoken,
+						THP_ORDERS_ALL_FILE_DEFAULT);
+			}
+
+			if (start == -EINVAL) {
+				pr_err("invalid size %s in thp_file boot parameter\n",
+				       start_size);
+				goto err;
+			}
+
+			if (end == -EINVAL) {
+				pr_err("invalid size %s in thp_file boot parameter\n",
+				       end_size);
+				goto err;
+			}
+
+			if (start < 0 || end < 0 || start > end)
+				goto err;
+
+			nr = end - start + 1;
+			if (!strcmp(policy, "always")) {
+				bitmap_set(&always, start, nr);
+			} else if (!strcmp(policy, "always+exec")) {
+				if (nr != 1)
+					goto err;
+				bitmap_set(&always, start, nr);
+				exec = start;
+			} else if (!strcmp(policy, "never")) {
+				bitmap_clear(&always, start, nr);
+				if (exec != -1 && !test_bit(exec, &always))
+					exec = -1;
+			} else {
+				pr_err("invalid policy %s in thp_file boot parameter\n", policy);
+				goto err;
+			}
+		}
+	}
+
+	huge_file_orders_always = always;
+	huge_file_exec_order = exec;
+	file_orders_configured = true;
+	return 1;
+err:
+	pr_warn("thp_file=%s: cannot parse, ignored\n", str);
+	return 0;
+}
+__setup("thp_file=", setup_thp_file);
+
 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
diff --git a/mm/readahead.c b/mm/readahead.c
index 7b05082c89ea2b2e8408a3c9a10b0f2ba850255f..5cff8dd48969f4522b7bb2600d34706af244495c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -464,6 +464,15 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 	return 0;
 }
 
+static int select_new_order(int old_order, int max_order, unsigned long orders)
+{
+	orders &= BIT(max_order + 1) - 1;
+	VM_WARN_ON(!orders);
+
+	orders &= BIT(old_order + 1) - 1;
+	return highest_order(orders);
+}
+
 void page_cache_ra_order(struct readahead_control *ractl,
 		struct file_ra_state *ra)
 {
@@ -477,6 +486,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	int err = 0;
 	gfp_t gfp = readahead_gfp_mask(mapping);
 	unsigned int new_order = ra->order;
+	unsigned long orders;
 
 	trace_page_cache_ra_order(mapping->host, start, ra);
 	if (!mapping_large_folio_support(mapping)) {
@@ -486,8 +496,9 @@ void page_cache_ra_order(struct readahead_control *ractl,
 
 	limit = min(limit, index + ra->size - 1);
 
+	orders = file_orders_always() | BIT(0);
+	new_order = select_new_order(new_order, ilog2(ra->size), orders);
 	new_order = min(mapping_max_folio_order(mapping), new_order);
-	new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 	new_order = max(new_order, min_order);
 
 	ra->order = new_order;
@@ -508,9 +519,10 @@ void page_cache_ra_order(struct readahead_control *ractl,
 
 		/* Align with smaller pages if needed */
 		if (index & ((1UL << order) - 1))
-			order = __ffs(index);
+			order = select_new_order(order, __ffs(index), orders);
 		/* Don't allocate pages past EOF */
-		while (order > min_order && index + (1UL << order) - 1 > limit)
+		while (order > min_order && (index + (1UL << order) - 1 > limit ||
+					     (BIT(order) & orders) == 0))
 			order--;
 		err = ra_alloc_folio(ractl, index, mark, order, gfp);
 		if (err)
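
A few illustrative notes for review follow. First, the size-to-order mapping
behind ``thp_file=`` and the per-size sysfs directories: with 4K base pages,
64K covers 2^4 pages, so ``thp_file=64K:always+exec`` sets bit 4 of
``huge_file_orders_always`` and makes 4 the exec order. The userspace sketch
below models that mapping; ``order_from_str()`` is a simplified stand-in for
the patch's ``get_order_from_str()`` (whose exact behavior may differ), and
``PAGE_SHIFT`` is assumed to be 12::

	/* Userspace model: map "<size>[KMG]" to a folio order and record it
	 * in variables shaped like huge_file_orders_always and
	 * huge_file_exec_order. Assumes 4K pages and a 64-bit long.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* Return the folio order for "64K", "2M", ..., or -1 if invalid. */
	static int order_from_str(const char *str)
	{
		char *end;
		unsigned long size = strtoul(str, &end, 10);

		switch (*end) {
		case 'K': case 'k': size <<= 10; break;
		case 'M': case 'm': size <<= 20; break;
		case 'G': case 'g': size <<= 30; break;
		case '\0': break;
		default: return -1;
		}

		/* Must be a power-of-2 multiple of the base page size. */
		if (size < PAGE_SIZE || (size & (size - 1)))
			return -1;
		return __builtin_ctzl(size >> PAGE_SHIFT);
	}

	int main(void)
	{
		unsigned long always = 0;	/* like huge_file_orders_always */
		int exec_order;			/* like huge_file_exec_order */

		/* Models: thp_file=64K:always+exec thp_file=2M:always */
		exec_order = order_from_str("64K");	/* order 4 */
		always |= 1UL << exec_order;
		always |= 1UL << order_from_str("2M");	/* order 9 */

		printf("always=%#lx exec_order=%d\n", always, exec_order);
		return 0;	/* prints always=0x210 exec_order=4 */
	}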
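
Second, the allocation fallback in ``__filemap_get_folio_mpol`` now walks the
allowed-order bitmap from highest to lowest via ``highest_order()`` and
``next_order()`` instead of decrementing ``order`` one step at a time. A
standalone model of that walk (the helpers are re-expressed with GCC builtins
assuming a 64-bit long, and the allocation is faked as always failing so the
whole walk is visible)::

	/* Model of the allowed-order fallback: try the largest allowed order
	 * first, then fall back through the remaining allowed orders, ending
	 * at BIT(0) (a single page), which file_orders_always() | BIT(0)
	 * guarantees is always present.
	 */
	#include <stdio.h>

	static int highest_order(unsigned long orders)
	{
		/* fls_long(orders) - 1 in the kernel */
		return orders ? 63 - __builtin_clzl(orders) : -1;
	}

	static int next_order(unsigned long *orders, int prev)
	{
		*orders &= ~(1UL << prev);
		return highest_order(*orders);
	}

	int main(void)
	{
		/* Allowed orders 9 and 4 (2M and 64K with 4K pages) plus 0. */
		unsigned long orders = (1UL << 9) | (1UL << 4) | (1UL << 0);
		int order = highest_order(orders);

		while (orders) {
			printf("trying order %d\n", order);
			/* Pretend every allocation fails to show the walk. */
			if (order == 0)
				break;
			order = next_order(&orders, order);
		}
		return 0;	/* prints orders 9, 4, 0 */
	}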
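
Third, ``select_new_order()`` in mm/readahead.c picks the largest allowed
order that is no bigger than both the current order and a cap such as
``ilog2(ra->size)`` or ``__ffs(index)``. A quick model under the same
assumptions (64-bit long, the same example bitmap as above)::

	#include <stdio.h>

	static int highest_order(unsigned long orders)
	{
		return orders ? 63 - __builtin_clzl(orders) : -1;
	}

	/* Largest allowed order <= old_order and <= max_order. */
	static int select_new_order(int old_order, int max_order,
				    unsigned long orders)
	{
		orders &= (1UL << (max_order + 1)) - 1;
		orders &= (1UL << (old_order + 1)) - 1;
		return highest_order(orders);
	}

	int main(void)
	{
		unsigned long orders = (1UL << 9) | (1UL << 4) | (1UL << 0);

		/* ra->order wants 9 but ra->size only covers 2^6 pages:
		 * fall back to 4, the largest allowed order <= 6.
		 */
		printf("%d\n", select_new_order(9, 6, orders));	/* 4 */

		/* Misaligned index with __ffs(index) == 2: only order 0. */
		printf("%d\n", select_new_order(4, 2, orders));	/* 0 */
		return 0;
	}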
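
Finally, the ``load_bias`` change in fs/binfmt_elf.c is a plain power-of-2
round-down, so that a PIE's text ends up naturally aligned to the preferred
exec THP size and the page cache can map it with folios of that order. A
sketch of the arithmetic, using an arbitrary example address and again
assuming 4K pages::

	#include <stdio.h>

	#define PAGE_SIZE	4096UL

	int main(void)
	{
		int exec_order = 4;		/* 64K preferred exec THP */
		unsigned long load_bias = 0x7f1234569000UL;

		/* Round down to a multiple of PAGE_SIZE << exec_order. */
		load_bias &= ~((PAGE_SIZE << exec_order) - 1);

		printf("%#lx\n", load_bias);	/* 0x7f1234560000 */
		return 0;
	}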