From 107bdaacdf96b3b1b7c1c2969a20cd05d79bdff5 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 15:05:46 +0800 Subject: [PATCH 1/8] mm: mTHP user controls to configure pagecache large folio sizes ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#m25b51aa890b123202cda93fa0e67340b3e4b26b6 Add mTHP controls to sysfs to allow user space to configure the folio sizes that can be considered for allocation of file-backed memory: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled For now, the control can be set to either `always` or `never` to enable or disable that size. More options may be added in future. By default, at boot, all folio sizes are enabled, and the algorithm used to select a folio size remains conceptually unchanged; increase by 2 enabled orders each time a readahead marker is hit then reduce to the closest enabled order to fit within bounds of ra size, index alignment and EOF. So when all folio sizes are enabled, behavior should be unchanged. When folio sizes are disabled, the algorithm will never select them. Systems such as Android are always under extreme memory pressure and as a result fragmentation often causes attempts to allocate large folios to fail and fallback to smaller folios. By fixing the pagecache to one large folio size (e.g. 64K) plus fallback to small folios, a large source of this fragmentation can be removed and 64K mTHP allocations succeed more often, allowing the system to benefit from improved performance on arm64 and other arches that support "contpte". 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 21 +++++++++++ include/linux/huge_mm.h | 42 +++++++++++++--------- mm/filemap.c | 15 ++++++-- mm/huge_memory.c | 42 ++++++++++++++++++++++ mm/readahead.c | 18 ++++++++-- 5 files changed, 115 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5fbc3d89bb07..ec1627f1e7c4 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -337,6 +337,27 @@ that THP is shared. Exceeding the number would block the collapse:: A higher value may increase memory footprint for some workloads. +File-Backed Hugepages +--------------------- + +The kernel will automatically select an appropriate THP size for file-backed +memory from a set of allowed sizes. By default all THP sizes that the page cache +supports are allowed, but this set can be modified with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + +where is the hugepage size being addressed, the available sizes for which +vary by system. ``always`` adds the hugepage size to the set of allowed sizes, +and ``never`` removes the hugepage size from the set of allowed sizes. + +In some situations, constraining the allowed sizes can reduce memory +fragmentation, resulting in fewer allocation fallbacks and improved system +performance. + +Note that any changes to the allowed set of sizes only applies to future +file-backed THP allocations. 
+ Boot parameters =============== diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..1163152ffd6b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,24 @@ enum tva_type { #define thp_vma_allowable_order(vma, vm_flags, type, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) +static inline int lowest_order(unsigned long orders) +{ + if (orders) + return __ffs(orders); + return -1; +} + +static inline int highest_order(unsigned long orders) +{ + return fls_long(orders) - 1; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + *orders &= ~BIT(prev); + return highest_order(*orders); +} + #define split_folio(f) split_folio_to_list(f, NULL) #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES @@ -181,6 +199,12 @@ extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; +extern unsigned long huge_file_orders_always; + +static inline unsigned long file_orders_always(void) +{ + return READ_ONCE(huge_file_orders_always); +} static inline bool hugepage_global_enabled(void) { @@ -195,17 +219,6 @@ static inline bool hugepage_global_always(void) (1< mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); + + orders = file_orders_always() | BIT(0); + orders &= BIT(order + 1) - 1; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + orders &= BIT(__ffs(index) + 1) - 1; + order = highest_order(orders); - do { + while (orders) { gfp_t alloc_gfp = gfp; err = -ENOMEM; @@ -2024,7 +2029,11 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, break; folio_put(folio); folio = NULL; - } while (order-- > min_order); + + if (order <= min_order) + break; + order = next_order(&orders, order); + }; if (err == -EEXIST) goto repeat; diff --git a/mm/huge_memory.c b/mm/huge_memory.c 
index 970e077019b7..d7e71e21fde9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -80,6 +80,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +unsigned long huge_file_orders_always __read_mostly; static bool anon_orders_configured __initdata; static inline bool file_thp_enabled(struct vm_area_struct *vma) @@ -620,6 +621,36 @@ static ssize_t anon_enabled_store(struct kobject *kobj, return count; } +static ssize_t file_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_file_orders_always)) + output = "[always] never"; + else + output = "always [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t file_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) + set_bit(order, &huge_file_orders_always); + else if (sysfs_streq(buf, "never")) + clear_bit(order, &huge_file_orders_always); + else + ret = -EINVAL; + + return ret; +} + static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); @@ -632,7 +663,11 @@ static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; +static struct kobj_attribute file_enabled_attr = + __ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store); + static struct attribute *file_ctrl_attrs[] = { + &file_enabled_attr.attr, #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif @@ -851,6 +886,13 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); + /* + * For pagecache, default to enabling all orders. 
powerpc's PMD_ORDER + * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time + * constant so we have to do this here. + */ + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); diff --git a/mm/readahead.c b/mm/readahead.c index 7b05082c89ea..90fcc6ec9557 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -464,6 +464,15 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, return 0; } +static int select_new_order(int old_order, int max_order, unsigned long orders) +{ + orders &= BIT(max_order + 1) - 1; + VM_WARN_ON(!orders); + + orders &= (BIT(old_order + 1) - 1); + return highest_order(orders); +} + void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra) { @@ -477,6 +486,7 @@ void page_cache_ra_order(struct readahead_control *ractl, int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; + unsigned long orders; trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { @@ -486,8 +496,9 @@ void page_cache_ra_order(struct readahead_control *ractl, limit = min(limit, index + ra->size - 1); + orders = file_orders_always() | BIT(0); + new_order = select_new_order(new_order, ilog2(ra->size), orders); new_order = min(mapping_max_folio_order(mapping), new_order); - new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); ra->order = new_order; @@ -508,9 +519,10 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (order > min_order && index + (1UL << order) - 1 > limit) + while (order > min_order && index + (1UL << 
order) - 1 > limit && + (BIT(order) & orders) == 0) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) -- Gitee From ec66b31a6fb10653bb012335055e8c95d5fe0f91 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 15:56:21 +0800 Subject: [PATCH 2/8] mm: Introduce "always+exec" for mTHP file_enabled control ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#md06a4a7a606cb90824f322fec868ee0d7620a876 In addition to `always` and `never`, add `always+exec` as an option for: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled `always+exec` acts like `always` but additionally marks the hugepage size as the preferred hugepage size for sections of any file mapped with execute permission. A maximum of one hugepage size can be marked as `exec` at a time, so applying it to a new size implicitly removes it from any size it was previously set for. Change readahead to use this flagged exec size; when a request is made for an executable mapping, do a synchronous read of the size in a naturally aligned manner. On arm64 if memory is physically contiguous and naturally aligned to the "contpte" size, we can use contpte mappings, which improves utilization of the TLB. When paired with the "multi-size THP" changes, this works well to reduce dTLB pressure. However iTLB pressure is still high due to executable mappings having a low likelihood of being in the required folio size and mapping alignment, even when the filesystem supports readahead into large folios (e.g. XFS). The reason for the low likelihood is that the current readahead algorithm starts with an order-2 folio and increases the folio order by 2 every time the readahead mark is hit. But most executable memory is faulted in fairly randomly and so the readahead mark is rarely hit and most executable folios remain order-2. 
This is observed empirically and confirmed from discussion with a gnu linker expert; in general, the linker does nothing to group temporally accessed text together spatially. Additionally, with the current read-around approach there are no alignment guarantees between the file and folio. This is insufficient for arm64's contpte mapping requirement (order-4 for 4K base pages). So it seems reasonable to special-case the read(ahead) logic for executable mappings. The trade-off is performance improvement (due to more efficient storage of the translations in iTLB) vs potential read amplification (due to reading too much data around the fault which won't be used), and the latter is independent of base page size. Of course if no hugepage size is marked as `always+exec` the old behaviour is maintained. Performance Benchmarking ------------------------ The below shows kernel compilation and speedometer javascript benchmarks on Ampere Altra arm64 system. When the patch is applied, `always+exec` is set for 64K folios. 
First, confirmation that this patch causes more memory to be contained in 64K folios (this is for all file-backed memory so includes non-executable too): | File-backed folios | Speedometer | Kernel Compile | | by size as percentage |-----------------|-----------------| | of all mapped file mem | before | after | before | after | |=========================|========|========|========|========| |file-thp-aligned-16kB | 45% | 9% | 46% | 7% | |file-thp-aligned-32kB | 2% | 0% | 3% | 1% | |file-thp-aligned-64kB | 3% | 63% | 5% | 80% | |file-thp-aligned-128kB | 11% | 11% | 0% | 0% | |file-thp-unaligned-16kB | 1% | 0% | 3% | 1% | |file-thp-unaligned-128kB | 1% | 0% | 0% | 0% | |file-thp-partial | 0% | 0% | 0% | 0% | |-------------------------|--------|--------|--------|--------| |file-cont-aligned-64kB | 16% | 75% | 5% | 80% | The above shows that for both use cases, the amount of file memory backed by 16K folios reduces and the amount backed by 64K folios increases significantly. And the amount of memory that is contpte-mapped significantly increases (last line). And this is reflected in performance improvement: Kernel Compilation (smaller is faster): | kernel | real-time | kern-time | user-time | peak memory | |----------|-------------|-------------|-------------|---------------| | before | 0.0% | 0.0% | 0.0% | 0.0% | | after | -1.6% | -2.1% | -1.7% | 0.0% | Speedometer (bigger is faster): | kernel | runs_per_min | peak memory | |----------|----------------|---------------| | before | 0.0% | 0.0% | | after | 1.3% | 1.0% | Both benchmarks show a ~1.5% improvement once the patch is applied. 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 6 +++++ include/linux/huge_mm.h | 11 ++++++++ mm/filemap.c | 4 ++- mm/huge_memory.c | 31 +++++++++++++++++----- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index ec1627f1e7c4..fa71b7a98db6 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -345,12 +345,18 @@ memory from a set of allowed sizes. By default all THP sizes that the page cache supports are allowed, but this set can be modified with one of:: echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + echo always+exec >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled where is the hugepage size being addressed, the available sizes for which vary by system. ``always`` adds the hugepage size to the set of allowed sizes, and ``never`` removes the hugepage size from the set of allowed sizes. +``always+exec`` acts like ``always`` but additionally marks the hugepage size as +the preferred hugepage size for sections of any file mapped executable. A +maximum of one hugepage size can be marked as ``exec`` at a time, so applying it +to a new size implicitly removes it from any size it was previously set for. + In some situations, constraining the allowed sizes can reduce memory fragmentation, resulting in fewer allocation fallbacks and improved system performance. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1163152ffd6b..830648694a50 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -200,12 +200,18 @@ extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; extern unsigned long huge_file_orders_always; +extern int huge_file_exec_order; static inline unsigned long file_orders_always(void) { return READ_ONCE(huge_file_orders_always); } +static inline int file_exec_order(void) +{ + return READ_ONCE(huge_file_exec_order); +} + static inline bool hugepage_global_enabled(void) { return transparent_hugepage_flags & @@ -755,6 +761,11 @@ static inline unsigned long file_orders_always(void) return 0; } +static inline int file_exec_order(void) +{ + return -1; +} + static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { diff --git a/mm/filemap.c b/mm/filemap.c index 90479f8dbce4..7873bab137da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3391,9 +3391,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; unsigned long start = vma->vm_pgoff; unsigned long end = start + vma_pages(vma); + int exec_order = file_exec_order(); unsigned long ra_end; - ra->order = exec_folio_order(); + /* If explicit order is set for exec mappings, use it. */ + ra->order = exec_order >= 0 ? 
exec_order : exec_folio_order(); ra->start = round_down(vmf->pgoff, 1UL << ra->order); ra->start = max(ra->start, start); ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d7e71e21fde9..ef4b537eb251 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -81,6 +81,7 @@ unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long huge_file_orders_always __read_mostly; +int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; static inline bool file_thp_enabled(struct vm_area_struct *vma) @@ -551,6 +552,7 @@ static const struct attribute_group hugepage_attr_group = { static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); +static DEFINE_SPINLOCK(huge_file_orders_lock); static LIST_HEAD(thpsize_list); static ssize_t anon_enabled_show(struct kobject *kobj, @@ -626,11 +628,15 @@ static ssize_t file_enabled_show(struct kobject *kobj, { int order = to_thpsize(kobj)->order; const char *output; + bool exec; - if (test_bit(order, &huge_file_orders_always)) - output = "[always] never"; - else - output = "always [never]"; + if (test_bit(order, &huge_file_orders_always)) { + exec = READ_ONCE(huge_file_exec_order) == order; + output = exec ? 
"always [always+exec] never" : + "[always] always+exec never"; + } else { + output = "always always+exec [never]"; + } return sysfs_emit(buf, "%s\n", output); } @@ -641,13 +647,24 @@ static ssize_t file_enabled_store(struct kobject *kobj, int order = to_thpsize(kobj)->order; ssize_t ret = count; - if (sysfs_streq(buf, "always")) + spin_lock(&huge_file_orders_lock); + + if (sysfs_streq(buf, "always")) { set_bit(order, &huge_file_orders_always); - else if (sysfs_streq(buf, "never")) + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else if (sysfs_streq(buf, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + } else if (sysfs_streq(buf, "never")) { clear_bit(order, &huge_file_orders_always); - else + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else { ret = -EINVAL; + } + spin_unlock(&huge_file_orders_lock); return ret; } -- Gitee From 3ec230d32420937650ae877c994a0be9794abf14 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 16:55:37 +0800 Subject: [PATCH 3/8] mm: Override mTHP "file_enabled" defaults at kernel cmdline ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#mb70537979115e89c8398c6f2b3d3e70ec438c8d0 Add thp_file= cmdline parameter to allow specifying the default enablement of each supported file-backed THP size. The parameter accepts the following format and can be provided multiple times to configure each size: thp_file=[KMG]: See Documentation/admin-guide/mm/transhuge.rst for more details. Configuring the defaults at boot time is often necessary because its not always possible to drop active executable pages from the page cache, especially if they are well used like libc. The command line parameter allows configuring the values before the first page is installed in the page cache. 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- .../admin-guide/kernel-parameters.txt | 8 ++++ Documentation/admin-guide/mm/transhuge.rst | 13 ++++++ mm/huge_memory.c | 45 ++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cf3807641d89..fe53741cde9f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7595,6 +7595,14 @@ Kernel parameters See Documentation/admin-guide/mm/transhuge.rst for more details. + thp_file= [KNL] + Format: [KMG]:always|always+exec|never + Can be used to control the default behavior of the + system with respect to file-backed transparent hugepages. + Can be used multiple times for multiple file-backed THP + sizes. See Documentation/admin-guide/mm/transhuge.rst + for more details. + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index fa71b7a98db6..64ff27e37082 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -430,6 +430,19 @@ user, the PMD_ORDER hugepage policy will be overridden. If the policy for PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will default to ``never``. +Each supported file-backed THP size can be controlled by passing +``thp_file=[KMG]:``, where ```` is the THP size and +```` is one of ``always``, ``always+exec`` or ``never``. + +For example, the following will set 64K THP to ``always+exec``:: + + thp_file=64K:always+exec + +``thp_file=`` may be specified multiple times to configure all THP sizes as +required. 
If ``thp_file=`` is specified at least once, any file-backed THP +sizes not explicitly configured on the command line are implicitly set to +``never``. + Hugepages in tmpfs/shmem ======================== diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ef4b537eb251..135beeca6f2d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -83,6 +83,7 @@ unsigned long huge_anon_orders_inherit __read_mostly; unsigned long huge_file_orders_always __read_mostly; int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; +static bool file_orders_configured; static inline bool file_thp_enabled(struct vm_area_struct *vma) { @@ -908,7 +909,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time * constant so we have to do this here. */ - huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + if (!file_orders_configured) { + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + file_orders_configured = true; + } *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -1193,6 +1197,45 @@ static int __init setup_thp_anon(char *str) } __setup("thp_anon=", setup_thp_anon); +static int __init setup_thp_file(char *str) +{ + unsigned long size; + char *state; + int order; + int ret = 0; + + if (!str) + goto out; + + size = (unsigned long)memparse(str, &state); + order = ilog2(size >> PAGE_SHIFT); + if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || + !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) + goto out; + + state++; + + if (!strcmp(state, "always")) { + set_bit(order, &huge_file_orders_always); + ret = 1; + } else if (!strcmp(state, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + ret = 1; + } else if (!strcmp(state, "never")) { + clear_bit(order, &huge_file_orders_always); + ret = 1; + } + + if (ret) + file_orders_configured = true; +out: 
+ if (!ret) + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return ret; +} +__setup("thp_file=", setup_thp_file); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) -- Gitee From 8fffbf92e2a8b98d6bc1a0019a6f08bc8ee9b497 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Dec 2024 15:49:48 +0800 Subject: [PATCH 4/8] anolis: mm: optimize the 'thp_file' cmdline format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #9728 Similar to the ‘thp_anon’ parameter, change the 'thp_file' to support the setting of policies with multiple sizes. Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- mm/huge_memory.c | 98 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 135beeca6f2d..323871694b91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1199,40 +1199,82 @@ __setup("thp_anon=", setup_thp_anon); static int __init setup_thp_file(char *str) { - unsigned long size; - char *state; - int order; - int ret = 0; + char *token, *range, *policy, *subtoken; + unsigned long always; + char *start_size, *end_size; + int start, end, nr, exec; + char *p; - if (!str) - goto out; + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); - size = (unsigned long)memparse(str, &state); - order = ilog2(size >> PAGE_SHIFT); - if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || - !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) - goto out; + always = huge_file_orders_always; + exec = huge_file_exec_order; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; - state++; + if (!policy) + goto err; - if (!strcmp(state, "always")) { - set_bit(order, &huge_file_orders_always); - ret = 1; - } else if (!strcmp(state, "always+exec")) { - 
set_bit(order, &huge_file_orders_always); - huge_file_exec_order = order; - ret = 1; - } else if (!strcmp(state, "never")) { - clear_bit(order, &huge_file_orders_always); - ret = 1; + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + } else if (!strcmp(policy, "always+exec")) { + if (nr != 1) + goto err; + bitmap_set(&always, start, nr); + exec = start; + } else if (!strcmp(policy, "never")) { + bitmap_clear(&always, start, nr); + if (exec != -1 && !test_bit(exec, &always)) + exec = -1; + } else { + pr_err("invalid policy %s in thp_file boot parameter\n", policy); + goto err; + } + } } - if (ret) - file_orders_configured = true; -out: - if (!ret) - pr_warn("thp_file=%s: cannot parse, ignored\n", str); - return ret; + huge_file_orders_always = always; + huge_file_exec_order = exec; + file_orders_configured = true; + return 1; +err: + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return 0; } __setup("thp_file=", setup_thp_file); -- Gitee From cbe31e378a1f218a5c011c9f48f8f8647835d802 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 25 Dec 2024 17:13:51 +0800 Subject: [PATCH 5/8] anolis: mm: add mTHP counters for file folios ANBZ: #9728 Add mTHP counters for file folios. 
Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 4 ++++ include/linux/huge_mm.h | 1 + mm/filemap.c | 16 +++++++++++++--- mm/huge_memory.c | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 64ff27e37082..2c6e6305161d 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -746,6 +746,10 @@ nr_anon_partially_mapped an anonymous THP as "partially mapped" and count it here, even though it is not actually partially mapped anymore. +file_alloc + is incremented every time a file huge page is successfully + allocated. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 830648694a50..4050b8bfaf2a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,6 +160,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + MTHP_STAT_FILE_ALLOC, __MTHP_STAT_COUNT }; diff --git a/mm/filemap.c b/mm/filemap.c index 7873bab137da..1bfc68f0ae76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -997,9 +997,13 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, int n; struct folio *folio; - if (policy) - return folio_alloc_mpol_noprof(gfp, order, policy, + if (policy) { + folio = folio_alloc_mpol_noprof(gfp, order, policy, NO_INTERLEAVE_INDEX, numa_node_id()); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); + return folio; + } if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; @@ -1009,9 +1013,15 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, folio = 
__folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); return folio; } - return folio_alloc_noprof(gfp, order); + + folio = folio_alloc_noprof(gfp, order); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); + return folio; } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 323871694b91..94c69255bea3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -754,6 +754,7 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -779,6 +780,7 @@ static struct attribute_group anon_stats_attr_grp = { }; static struct attribute *file_stats_attrs[] = { + &file_alloc_attr.attr, #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, -- Gitee From 668b9ef8a65a8ba4a6ebdf095de13ad798562b58 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Mon, 30 Dec 2024 14:21:39 +0800 Subject: [PATCH 6/8] anolis: mm, thp: hugetext: make PIC binary mapping address THP align ANBZ: #9728 The patch mainly to make mmap address of PIC binary is aligned with HPAGE_PMD_SIZE. If not so, the ELF binary that is generated with -fPIC compile option can not use hugepages, because of the mapping address is randomly selected by kernel. Note: Baolin Wang changed the code to make it suitable for the file mTHP. 
Signed-off-by: Rongwei Wang Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 [rebase 6.6.102] --- fs/binfmt_elf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 16a56b6b3f6c..4717f75416b3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1045,6 +1045,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; + int exec_order = file_exec_order(); if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1183,6 +1184,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); + if (exec_order > 0 && interpreter && + total_size >= (PAGE_SIZE << exec_order)) + load_bias &= ~((PAGE_SIZE << exec_order) - 1); } error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, -- Gitee From 786cb437157a48c9ea88502f9fda6e411363dd6f Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Mon, 3 Mar 2025 17:24:06 +0800 Subject: [PATCH 7/8] anolis: mm: fix read-ahead beyond EOF in page_cache_ra_order() ANBZ: #9728 Adjusted the loop condition to correctly handle limits and order constraints, ensuring read-ahead does not exceed the EOF. This change improves file system robustness by preventing potential over-read scenarios. 
Fixes: 107bdaacdf96 ("mm: mTHP user controls to configure pagecache large folio sizes") Signed-off-by: Weilin Tong Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/4755 Signed-off-by: Baolin Wang --- mm/readahead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 90fcc6ec9557..5cff8dd48969 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -521,8 +521,8 @@ void page_cache_ra_order(struct readahead_control *ractl, if (index & ((1UL << order) - 1)) order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (order > min_order && index + (1UL << order) - 1 > limit && - (BIT(order) & orders) == 0) + while (order > min_order && (index + (1UL << order) - 1 > limit || + (BIT(order) & orders) == 0)) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) -- Gitee From 9f96c8f3f79306640f72eb845d1242913772fa65 Mon Sep 17 00:00:00 2001 From: Zelin Deng Date: Thu, 20 Mar 2025 12:43:28 +0800 Subject: [PATCH 8/8] anolis: mm: fallback to lower order in __filemap_get_folio() ANBZ: #9728 In __filemap_get_folio(), if filemap_alloc_folio() fails to allocate the highest-order large folio, it continues the while loop with the highest order. Thus, when the system is under high memory pressure and filemap_alloc_folio() cannot allocate the expected large folio in time, the while loop can cause a softlockup panic. 
[41230.985727] CPU: 39 PID: 14210 Comm: genload Kdump: loaded Tainted: G W OEL 6.6.71-3_rc2.an23.aarch64 #1 [41230.985729] Hardware name: AlibabaCloud AliServer-Xuanwu2.0AM-1UC1P-5B/AS1111MG1, BIOS 1.2.M1.AL.P.139.00 02/14/2023 [41230.985730] pstate: 634010c9 (nZCv daIF +PAN -UAO +TCO +DIT +SSBS BTYPE=--) [41230.985732] pc : machine_kexec+0x40/0x200 [41230.985734] lr : machine_kexec+0x40/0x200 [41230.985737] sp : ffff800082783b80 [41230.985737] x29: ffff800082783b80 x28: ffff00173facc900 x27: ffff00173facd4c8 [41230.985740] x26: ffff8000818ce008 x25: ffff8016be1f9000 x24: ffff800082783e10 [41230.985743] x23: ffff8000ad003630 x22: ffff80008240b3e8 x21: ffff8000810e3750 [41230.985745] x20: ffff04000cc05000 x19: ffff04000cc05000 x18: ffffffffffffffff [41230.985747] x17: 31313153412f4235 x16: 2d50314355312d4d x15: ffff00173facc900 [41230.985750] x14: ffff00173facd4c8 x13: 2e656c6261696c65 x12: 726e75206562206c [41230.985752] x11: 000000010001057c x10: ffff800081fb9cf8 x9 : ffff8000800f2f48 [41230.985755] x8 : 00000000000083a0 x7 : c00000010001057c x6 : 000000000001e2a0 [41230.985757] x5 : ffff00173fac9288 x4 : 0000000000000000 x3 : ffff8016be1f9000 [41230.985760] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff00082f0da800 [41230.985762] Call trace: [41230.985763] machine_kexec+0x40/0x200 [41230.985766] __crash_kexec+0x70/0xd8 [41230.985768] panic+0x308/0x388 [41230.985771] watchdog_timer_fn+0x2cc/0x2d8 [41230.985773] __hrtimer_run_queues+0x19c/0x370 [41230.985775] hrtimer_interrupt+0xec/0x248 [41230.985776] arch_timer_handler_phys+0x30/0x50 [41230.985779] handle_percpu_devid_irq+0x8c/0x230 [41230.985782] generic_handle_domain_irq+0x30/0x50 [41230.985783] __gic_handle_irq_from_irqson.isra.0+0x140/0x260 [41230.985786] gic_handle_irq+0x2c/0xa0 [41230.985787] call_on_irq_stack+0x24/0x30 [41230.985789] do_interrupt_handler+0x80/0x90 [41230.985791] el1_interrupt+0x44/0xa8 [41230.985793] el1h_64_irq_handler+0x14/0x20 [41230.985794] el1h_64_irq+0x78/0x80 [41230.985795] 
arch_counter_get_cntpct+0x14/0x18 [41230.985797] ktime_get+0x48/0xa8 [41230.985799] memcg_lat_stat_start+0x24/0x50 [41230.985801] __alloc_pages_direct_compact+0x58/0x388 [41230.985804] __alloc_pages_slowpath+0x6b8/0x918 [41230.985805] __alloc_pages+0x34c/0x428 [41230.985807] alloc_pages+0x98/0x138 [41230.985809] folio_alloc+0x1c/0x40 [41230.985812] filemap_alloc_folio+0x3c/0xc0 [41230.985814] __filemap_get_folio+0x1e8/0x470 [41230.985816] iomap_get_folio+0x6c/0x88 [41230.985818] iomap_write_begin+0x1c0/0x308 [41230.985820] iomap_write_iter+0xf4/0x280 [41230.985822] iomap_file_buffered_write+0x88/0xf0 [41230.985823] xfs_file_buffered_write+0x98/0x2d0 [xfs] [41230.985868] xfs_file_write_iter+0x104/0x150 [xfs] [41230.985915] vfs_write+0x1a4/0x2f8 [41230.985918] ksys_write+0x70/0x108 [41230.985920] __arm64_sys_write+0x20/0x30 [41230.985923] el0_svc_common.constprop.0+0x60/0x138 [41230.985925] do_el0_svc+0x20/0x30 [41230.985928] el0_svc+0x44/0x1a8 [41230.985929] el0t_64_sync_handler+0xf8/0x128 [41230.985931] el0t_64_sync+0x17c/0x180 [41230.985932] ---[ end trace 0000000000000000 ]--- [41230.985934] Bye! For the original semantics of THP, fallback is required to try lower order folio. Add order fallback in __filemap_get_folio() while loop to try lower order when higher order allocation fails. 
Fixes: 107bdaacdf96 ("mm: mTHP user controls to configure pagecache large folio sizes") Signed-off-by: Zelin Deng Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/4867 --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1bfc68f0ae76..e3f652240b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2026,7 +2026,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) - continue; + goto try_next; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) @@ -2040,6 +2040,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, folio_put(folio); folio = NULL; +try_next: if (order <= min_order) break; order = next_order(&orders, order); -- Gitee