From 107bdaacdf96b3b1b7c1c2969a20cd05d79bdff5 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 15:05:46 +0800 Subject: [PATCH 1/8] mm: mTHP user controls to configure pagecache large folio sizes ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#m25b51aa890b123202cda93fa0e67340b3e4b26b6 Add mTHP controls to sysfs to allow user space to configure the folio sizes that can be considered for allocation of file-backed memory: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled For now, the control can be set to either `always` or `never` to enable or disable that size. More options may be added in future. By default, at boot, all folio sizes are enabled, and the algorithm used to select a folio size remains conceptually unchanged; increase by 2 enabled orders each time a readahead marker is hit then reduce to the closest enabled order to fit within bounds of ra size, index alignment and EOF. So when all folio sizes are enabled, behavior should be unchanged. When folio sizes are disabled, the algorithm will never select them. Systems such as Android are always under extreme memory pressure and as a result fragmentation often causes attempts to allocate large folios to fail and fallback to smaller folios. By fixing the pagecache to one large folio size (e.g. 64K) plus fallback to small folios, a large source of this fragmentation can be removed and 64K mTHP allocations succeed more often, allowing the system to benefit from improved performance on arm64 and other arches that support "contpte". 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 21 +++++++++++ include/linux/huge_mm.h | 42 +++++++++++++--------- mm/filemap.c | 15 ++++++-- mm/huge_memory.c | 42 ++++++++++++++++++++++ mm/readahead.c | 18 ++++++++-- 5 files changed, 115 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5fbc3d89bb07..ec1627f1e7c4 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -337,6 +337,27 @@ that THP is shared. Exceeding the number would block the collapse:: A higher value may increase memory footprint for some workloads. +File-Backed Hugepages +--------------------- + +The kernel will automatically select an appropriate THP size for file-backed +memory from a set of allowed sizes. By default all THP sizes that the page cache +supports are allowed, but this set can be modified with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + +where is the hugepage size being addressed, the available sizes for which +vary by system. ``always`` adds the hugepage size to the set of allowed sizes, +and ``never`` removes the hugepage size from the set of allowed sizes. + +In some situations, constraining the allowed sizes can reduce memory +fragmentation, resulting in fewer allocation fallbacks and improved system +performance. + +Note that any changes to the allowed set of sizes only applies to future +file-backed THP allocations. 
+ Boot parameters =============== diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..1163152ffd6b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,24 @@ enum tva_type { #define thp_vma_allowable_order(vma, vm_flags, type, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) +static inline int lowest_order(unsigned long orders) +{ + if (orders) + return __ffs(orders); + return -1; +} + +static inline int highest_order(unsigned long orders) +{ + return fls_long(orders) - 1; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + *orders &= ~BIT(prev); + return highest_order(*orders); +} + #define split_folio(f) split_folio_to_list(f, NULL) #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES @@ -181,6 +199,12 @@ extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; +extern unsigned long huge_file_orders_always; + +static inline unsigned long file_orders_always(void) +{ + return READ_ONCE(huge_file_orders_always); +} static inline bool hugepage_global_enabled(void) { @@ -195,17 +219,6 @@ static inline bool hugepage_global_always(void) (1< mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); + + orders = file_orders_always() | BIT(0); + orders &= BIT(order + 1) - 1; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + orders &= BIT(__ffs(index) + 1) - 1; + order = highest_order(orders); - do { + while (orders) { gfp_t alloc_gfp = gfp; err = -ENOMEM; @@ -2024,7 +2029,11 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, break; folio_put(folio); folio = NULL; - } while (order-- > min_order); + + if (order <= min_order) + break; + order = next_order(&orders, order); + }; if (err == -EEXIST) goto repeat; diff --git a/mm/huge_memory.c b/mm/huge_memory.c 
index 970e077019b7..d7e71e21fde9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -80,6 +80,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +unsigned long huge_file_orders_always __read_mostly; static bool anon_orders_configured __initdata; static inline bool file_thp_enabled(struct vm_area_struct *vma) @@ -620,6 +621,36 @@ static ssize_t anon_enabled_store(struct kobject *kobj, return count; } +static ssize_t file_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_file_orders_always)) + output = "[always] never"; + else + output = "always [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t file_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) + set_bit(order, &huge_file_orders_always); + else if (sysfs_streq(buf, "never")) + clear_bit(order, &huge_file_orders_always); + else + ret = -EINVAL; + + return ret; +} + static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); @@ -632,7 +663,11 @@ static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; +static struct kobj_attribute file_enabled_attr = + __ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store); + static struct attribute *file_ctrl_attrs[] = { + &file_enabled_attr.attr, #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif @@ -851,6 +886,13 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); + /* + * For pagecache, default to enabling all orders. 
powerpc's PMD_ORDER + * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time + * constant so we have to do this here. + */ + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); diff --git a/mm/readahead.c b/mm/readahead.c index 7b05082c89ea..90fcc6ec9557 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -464,6 +464,15 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, return 0; } +static int select_new_order(int old_order, int max_order, unsigned long orders) +{ + orders &= BIT(max_order + 1) - 1; + VM_WARN_ON(!orders); + + orders &= (BIT(old_order + 1) - 1); + return highest_order(orders); +} + void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra) { @@ -477,6 +486,7 @@ void page_cache_ra_order(struct readahead_control *ractl, int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; + unsigned long orders; trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { @@ -486,8 +496,9 @@ void page_cache_ra_order(struct readahead_control *ractl, limit = min(limit, index + ra->size - 1); + orders = file_orders_always() | BIT(0); + new_order = select_new_order(new_order, ilog2(ra->size), orders); new_order = min(mapping_max_folio_order(mapping), new_order); - new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); ra->order = new_order; @@ -508,9 +519,10 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (order > min_order && index + (1UL << order) - 1 > limit) + while (order > min_order && index + (1UL << 
order) - 1 > limit && + (BIT(order) & orders) == 0) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) -- Gitee From ec66b31a6fb10653bb012335055e8c95d5fe0f91 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 15:56:21 +0800 Subject: [PATCH 2/8] mm: Introduce "always+exec" for mTHP file_enabled control ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#md06a4a7a606cb90824f322fec868ee0d7620a876 In addition to `always` and `never`, add `always+exec` as an option for: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled `always+exec` acts like `always` but additionally marks the hugepage size as the preferred hugepage size for sections of any file mapped with execute permission. A maximum of one hugepage size can be marked as `exec` at a time, so applying it to a new size implicitly removes it from any size it was previously set for. Change readahead to use this flagged exec size; when a request is made for an executable mapping, do a synchronous read of the size in a naturally aligned manner. On arm64 if memory is physically contiguous and naturally aligned to the "contpte" size, we can use contpte mappings, which improves utilization of the TLB. When paired with the "multi-size THP" changes, this works well to reduce dTLB pressure. However iTLB pressure is still high due to executable mappings having a low likelihood of being in the required folio size and mapping alignment, even when the filesystem supports readahead into large folios (e.g. XFS). The reason for the low likelihood is that the current readahead algorithm starts with an order-2 folio and increases the folio order by 2 every time the readahead mark is hit. But most executable memory is faulted in fairly randomly and so the readahead mark is rarely hit and most executable folios remain order-2. 
This is observed empirically and confirmed from discussion with a gnu linker expert; in general, the linker does nothing to group temporally accessed text together spatially. Additionally, with the current read-around approach there are no alignment guarantees between the file and folio. This is insufficient for arm64's contpte mapping requirement (order-4 for 4K base pages). So it seems reasonable to special-case the read(ahead) logic for executable mappings. The trade-off is performance improvement (due to more efficient storage of the translations in iTLB) vs potential read amplification (due to reading too much data around the fault which won't be used), and the latter is independent of base page size. Of course if no hugepage size is marked as `always+exec` the old behaviour is maintained. Performance Benchmarking ------------------------ The below shows kernel compilation and speedometer javascript benchmarks on Ampere Altra arm64 system. When the patch is applied, `always+exec` is set for 64K folios. 
First, confirmation that this patch causes more memory to be contained in 64K folios (this is for all file-backed memory so includes non-executable too): | File-backed folios | Speedometer | Kernel Compile | | by size as percentage |-----------------|-----------------| | of all mapped file mem | before | after | before | after | |=========================|========|========|========|========| |file-thp-aligned-16kB | 45% | 9% | 46% | 7% | |file-thp-aligned-32kB | 2% | 0% | 3% | 1% | |file-thp-aligned-64kB | 3% | 63% | 5% | 80% | |file-thp-aligned-128kB | 11% | 11% | 0% | 0% | |file-thp-unaligned-16kB | 1% | 0% | 3% | 1% | |file-thp-unaligned-128kB | 1% | 0% | 0% | 0% | |file-thp-partial | 0% | 0% | 0% | 0% | |-------------------------|--------|--------|--------|--------| |file-cont-aligned-64kB | 16% | 75% | 5% | 80% | The above shows that for both use cases, the amount of file memory backed by 16K folios reduces and the amount backed by 64K folios increases significantly. And the amount of memory that is contpte-mapped significantly increases (last line). And this is reflected in performance improvement: Kernel Compilation (smaller is faster): | kernel | real-time | kern-time | user-time | peak memory | |----------|-------------|-------------|-------------|---------------| | before | 0.0% | 0.0% | 0.0% | 0.0% | | after | -1.6% | -2.1% | -1.7% | 0.0% | Speedometer (bigger is faster): | kernel | runs_per_min | peak memory | |----------|----------------|---------------| | before | 0.0% | 0.0% | | after | 1.3% | 1.0% | Both benchmarks show a ~1.5% improvement once the patch is applied. 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 6 +++++ include/linux/huge_mm.h | 11 ++++++++ mm/filemap.c | 4 ++- mm/huge_memory.c | 31 +++++++++++++++++----- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index ec1627f1e7c4..fa71b7a98db6 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -345,12 +345,18 @@ memory from a set of allowed sizes. By default all THP sizes that the page cache supports are allowed, but this set can be modified with one of:: echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled + echo always+exec >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/file_enabled where is the hugepage size being addressed, the available sizes for which vary by system. ``always`` adds the hugepage size to the set of allowed sizes, and ``never`` removes the hugepage size from the set of allowed sizes. +``always+exec`` acts like ``always`` but additionally marks the hugepage size as +the preferred hugepage size for sections of any file mapped executable. A +maximum of one hugepage size can be marked as ``exec`` at a time, so applying it +to a new size implicitly removes it from any size it was previously set for. + In some situations, constraining the allowed sizes can reduce memory fragmentation, resulting in fewer allocation fallbacks and improved system performance. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1163152ffd6b..830648694a50 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -200,12 +200,18 @@ extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; extern unsigned long huge_file_orders_always; +extern int huge_file_exec_order; static inline unsigned long file_orders_always(void) { return READ_ONCE(huge_file_orders_always); } +static inline int file_exec_order(void) +{ + return READ_ONCE(huge_file_exec_order); +} + static inline bool hugepage_global_enabled(void) { return transparent_hugepage_flags & @@ -755,6 +761,11 @@ static inline unsigned long file_orders_always(void) return 0; } +static inline int file_exec_order(void) +{ + return -1; +} + static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { diff --git a/mm/filemap.c b/mm/filemap.c index 90479f8dbce4..7873bab137da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3391,9 +3391,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; unsigned long start = vma->vm_pgoff; unsigned long end = start + vma_pages(vma); + int exec_order = file_exec_order(); unsigned long ra_end; - ra->order = exec_folio_order(); + /* If explicit order is set for exec mappings, use it. */ + ra->order = exec_order >= 0 ? 
exec_order : exec_folio_order(); ra->start = round_down(vmf->pgoff, 1UL << ra->order); ra->start = max(ra->start, start); ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d7e71e21fde9..ef4b537eb251 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -81,6 +81,7 @@ unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long huge_file_orders_always __read_mostly; +int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; static inline bool file_thp_enabled(struct vm_area_struct *vma) @@ -551,6 +552,7 @@ static const struct attribute_group hugepage_attr_group = { static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); +static DEFINE_SPINLOCK(huge_file_orders_lock); static LIST_HEAD(thpsize_list); static ssize_t anon_enabled_show(struct kobject *kobj, @@ -626,11 +628,15 @@ static ssize_t file_enabled_show(struct kobject *kobj, { int order = to_thpsize(kobj)->order; const char *output; + bool exec; - if (test_bit(order, &huge_file_orders_always)) - output = "[always] never"; - else - output = "always [never]"; + if (test_bit(order, &huge_file_orders_always)) { + exec = READ_ONCE(huge_file_exec_order) == order; + output = exec ? 
"always [always+exec] never" : + "[always] always+exec never"; + } else { + output = "always always+exec [never]"; + } return sysfs_emit(buf, "%s\n", output); } @@ -641,13 +647,24 @@ static ssize_t file_enabled_store(struct kobject *kobj, int order = to_thpsize(kobj)->order; ssize_t ret = count; - if (sysfs_streq(buf, "always")) + spin_lock(&huge_file_orders_lock); + + if (sysfs_streq(buf, "always")) { set_bit(order, &huge_file_orders_always); - else if (sysfs_streq(buf, "never")) + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else if (sysfs_streq(buf, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + } else if (sysfs_streq(buf, "never")) { clear_bit(order, &huge_file_orders_always); - else + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else { ret = -EINVAL; + } + spin_unlock(&huge_file_orders_lock); return ret; } -- Gitee From 3ec230d32420937650ae877c994a0be9794abf14 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 25 Dec 2024 16:55:37 +0800 Subject: [PATCH 3/8] mm: Override mTHP "file_enabled" defaults at kernel cmdline ANBZ: #9728 cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#mb70537979115e89c8398c6f2b3d3e70ec438c8d0 Add thp_file= cmdline parameter to allow specifying the default enablement of each supported file-backed THP size. The parameter accepts the following format and can be provided multiple times to configure each size: thp_file=[KMG]: See Documentation/admin-guide/mm/transhuge.rst for more details. Configuring the defaults at boot time is often necessary because its not always possible to drop active executable pages from the page cache, especially if they are well used like libc. The command line parameter allows configuring the values before the first page is installed in the page cache. 
Signed-off-by: Ryan Roberts Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- .../admin-guide/kernel-parameters.txt | 8 ++++ Documentation/admin-guide/mm/transhuge.rst | 13 ++++++ mm/huge_memory.c | 45 ++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cf3807641d89..fe53741cde9f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7595,6 +7595,14 @@ Kernel parameters See Documentation/admin-guide/mm/transhuge.rst for more details. + thp_file= [KNL] + Format: [KMG]:always|always+exec|never + Can be used to control the default behavior of the + system with respect to file-backed transparent hugepages. + Can be used multiple times for multiple file-backed THP + sizes. See Documentation/admin-guide/mm/transhuge.rst + for more details. + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index fa71b7a98db6..64ff27e37082 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -430,6 +430,19 @@ user, the PMD_ORDER hugepage policy will be overridden. If the policy for PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will default to ``never``. +Each supported file-backed THP size can be controlled by passing +``thp_file=[KMG]:``, where ```` is the THP size and +```` is one of ``always``, ``always+exec`` or ``never``. + +For example, the following will set 64K THP to ``always+exec``:: + + thp_file=64K:always+exec + +``thp_file=`` may be specified multiple times to configure all THP sizes as +required. 
If ``thp_file=`` is specified at least once, any file-backed THP +sizes not explicitly configured on the command line are implicitly set to +``never``. + Hugepages in tmpfs/shmem ======================== diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ef4b537eb251..135beeca6f2d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -83,6 +83,7 @@ unsigned long huge_anon_orders_inherit __read_mostly; unsigned long huge_file_orders_always __read_mostly; int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; +static bool file_orders_configured; static inline bool file_thp_enabled(struct vm_area_struct *vma) { @@ -908,7 +909,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time * constant so we have to do this here. */ - huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + if (!file_orders_configured) { + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + file_orders_configured = true; + } *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -1193,6 +1197,45 @@ static int __init setup_thp_anon(char *str) } __setup("thp_anon=", setup_thp_anon); +static int __init setup_thp_file(char *str) +{ + unsigned long size; + char *state; + int order; + int ret = 0; + + if (!str) + goto out; + + size = (unsigned long)memparse(str, &state); + order = ilog2(size >> PAGE_SHIFT); + if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || + !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) + goto out; + + state++; + + if (!strcmp(state, "always")) { + set_bit(order, &huge_file_orders_always); + ret = 1; + } else if (!strcmp(state, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + ret = 1; + } else if (!strcmp(state, "never")) { + clear_bit(order, &huge_file_orders_always); + ret = 1; + } + + if (ret) + file_orders_configured = true; +out: 
+ if (!ret) + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return ret; +} +__setup("thp_file=", setup_thp_file); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) -- Gitee From 8fffbf92e2a8b98d6bc1a0019a6f08bc8ee9b497 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Dec 2024 15:49:48 +0800 Subject: [PATCH 4/8] anolis: mm: optimize the 'thp_file' cmdline format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #9728 Similar to the ‘thp_anon’ parameter, change the 'thp_file' to support the setting of policies with multiple sizes. Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- mm/huge_memory.c | 98 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 135beeca6f2d..323871694b91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1199,40 +1199,82 @@ __setup("thp_anon=", setup_thp_anon); static int __init setup_thp_file(char *str) { - unsigned long size; - char *state; - int order; - int ret = 0; + char *token, *range, *policy, *subtoken; + unsigned long always; + char *start_size, *end_size; + int start, end, nr, exec; + char *p; - if (!str) - goto out; + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); - size = (unsigned long)memparse(str, &state); - order = ilog2(size >> PAGE_SHIFT); - if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || - !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) - goto out; + always = huge_file_orders_always; + exec = huge_file_exec_order; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; - state++; + if (!policy) + goto err; - if (!strcmp(state, "always")) { - set_bit(order, &huge_file_orders_always); - ret = 1; - } else if (!strcmp(state, "always+exec")) { - 
set_bit(order, &huge_file_orders_always); - huge_file_exec_order = order; - ret = 1; - } else if (!strcmp(state, "never")) { - clear_bit(order, &huge_file_orders_always); - ret = 1; + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + } else if (!strcmp(policy, "always+exec")) { + if (nr != 1) + goto err; + bitmap_set(&always, start, nr); + exec = start; + } else if (!strcmp(policy, "never")) { + bitmap_clear(&always, start, nr); + if (exec != -1 && !test_bit(exec, &always)) + exec = -1; + } else { + pr_err("invalid policy %s in thp_file boot parameter\n", policy); + goto err; + } + } } - if (ret) - file_orders_configured = true; -out: - if (!ret) - pr_warn("thp_file=%s: cannot parse, ignored\n", str); - return ret; + huge_file_orders_always = always; + huge_file_exec_order = exec; + file_orders_configured = true; + return 1; +err: + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return 0; } __setup("thp_file=", setup_thp_file); -- Gitee From cbe31e378a1f218a5c011c9f48f8f8647835d802 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 25 Dec 2024 17:13:51 +0800 Subject: [PATCH 5/8] anolis: mm: add mTHP counters for file folios ANBZ: #9728 Add mTHP counters for file folios. 
Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 --- Documentation/admin-guide/mm/transhuge.rst | 4 ++++ include/linux/huge_mm.h | 1 + mm/filemap.c | 16 +++++++++++++--- mm/huge_memory.c | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 64ff27e37082..2c6e6305161d 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -746,6 +746,10 @@ nr_anon_partially_mapped an anonymous THP as "partially mapped" and count it here, even though it is not actually partially mapped anymore. +file_alloc + is incremented every time a file huge page is successfully + allocated. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 830648694a50..4050b8bfaf2a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,6 +160,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + MTHP_STAT_FILE_ALLOC, __MTHP_STAT_COUNT }; diff --git a/mm/filemap.c b/mm/filemap.c index 7873bab137da..1bfc68f0ae76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -997,9 +997,13 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, int n; struct folio *folio; - if (policy) - return folio_alloc_mpol_noprof(gfp, order, policy, + if (policy) { + folio = folio_alloc_mpol_noprof(gfp, order, policy, NO_INTERLEAVE_INDEX, numa_node_id()); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); + return folio; + } if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; @@ -1009,9 +1013,15 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, folio = 
__folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); return folio; } - return folio_alloc_noprof(gfp, order); + + folio = folio_alloc_noprof(gfp, order); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); + return folio; } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 323871694b91..94c69255bea3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -754,6 +754,7 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -779,6 +780,7 @@ static struct attribute_group anon_stats_attr_grp = { }; static struct attribute *file_stats_attrs[] = { + &file_alloc_attr.attr, #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, -- Gitee From 668b9ef8a65a8ba4a6ebdf095de13ad798562b58 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Mon, 30 Dec 2024 14:21:39 +0800 Subject: [PATCH 6/8] anolis: mm, thp: hugetext: make PIC binary mapping address THP align ANBZ: #9728 The patch mainly to make mmap address of PIC binary is aligned with HPAGE_PMD_SIZE. If not so, the ELF binary that is generated with -fPIC compile option can not use hugepages, because of the mapping address is randomly selected by kernel. Note: Baolin Wang changed the code to make it suitable for the file mTHP. 
Signed-off-by: Rongwei Wang Signed-off-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4378 [rebase 6.6.102] --- fs/binfmt_elf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 16a56b6b3f6c..4717f75416b3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1045,6 +1045,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; + int exec_order = file_exec_order(); if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1183,6 +1184,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); + if (exec_order > 0 && interpreter && + total_size >= (PAGE_SIZE << exec_order)) + load_bias &= ~((PAGE_SIZE << exec_order) - 1); } error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, -- Gitee From 786cb437157a48c9ea88502f9fda6e411363dd6f Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Mon, 3 Mar 2025 17:24:06 +0800 Subject: [PATCH 7/8] anolis: mm: fix read-ahead beyond EOF in page_cache_ra_order() ANBZ: #9728 Adjusted the loop condition to correctly handle limits and order constraints, ensuring read-ahead does not exceed the EOF. This change improves file system robustness by preventing potential over-read scenarios. 
Fixes: 107bdaacdf96 ("mm: mTHP user controls to configure pagecache large folio sizes") Signed-off-by: Weilin Tong Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/4755 Signed-off-by: Baolin Wang --- mm/readahead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 90fcc6ec9557..5cff8dd48969 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -521,8 +521,8 @@ void page_cache_ra_order(struct readahead_control *ractl, if (index & ((1UL << order) - 1)) order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (order > min_order && index + (1UL << order) - 1 > limit && - (BIT(order) & orders) == 0) + while (order > min_order && (index + (1UL << order) - 1 > limit || + (BIT(order) & orders) == 0)) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) -- Gitee From 9f96c8f3f79306640f72eb845d1242913772fa65 Mon Sep 17 00:00:00 2001 From: Zelin Deng Date: Thu, 20 Mar 2025 12:43:28 +0800 Subject: [PATCH 8/8] anolis: mm: fallback to lower order in __filemap_get_folio() ANBZ: #9728 In __filemap_get_folio(), if filemap_alloc_folio() fails to allocate the highest-order large folio, it continues the while loop with the highest order. Thus, when the system is under high memory pressure and filemap_alloc_folio() cannot allocate the expected large folio in time, the while loop can cause a softlockup panic. 
[41230.985727] CPU: 39 PID: 14210 Comm: genload Kdump: loaded Tainted: G W OEL 6.6.71-3_rc2.an23.aarch64 #1 [41230.985729] Hardware name: AlibabaCloud AliServer-Xuanwu2.0AM-1UC1P-5B/AS1111MG1, BIOS 1.2.M1.AL.P.139.00 02/14/2023 [41230.985730] pstate: 634010c9 (nZCv daIF +PAN -UAO +TCO +DIT +SSBS BTYPE=--) [41230.985732] pc : machine_kexec+0x40/0x200 [41230.985734] lr : machine_kexec+0x40/0x200 [41230.985737] sp : ffff800082783b80 [41230.985737] x29: ffff800082783b80 x28: ffff00173facc900 x27: ffff00173facd4c8 [41230.985740] x26: ffff8000818ce008 x25: ffff8016be1f9000 x24: ffff800082783e10 [41230.985743] x23: ffff8000ad003630 x22: ffff80008240b3e8 x21: ffff8000810e3750 [41230.985745] x20: ffff04000cc05000 x19: ffff04000cc05000 x18: ffffffffffffffff [41230.985747] x17: 31313153412f4235 x16: 2d50314355312d4d x15: ffff00173facc900 [41230.985750] x14: ffff00173facd4c8 x13: 2e656c6261696c65 x12: 726e75206562206c [41230.985752] x11: 000000010001057c x10: ffff800081fb9cf8 x9 : ffff8000800f2f48 [41230.985755] x8 : 00000000000083a0 x7 : c00000010001057c x6 : 000000000001e2a0 [41230.985757] x5 : ffff00173fac9288 x4 : 0000000000000000 x3 : ffff8016be1f9000 [41230.985760] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff00082f0da800 [41230.985762] Call trace: [41230.985763] machine_kexec+0x40/0x200 [41230.985766] __crash_kexec+0x70/0xd8 [41230.985768] panic+0x308/0x388 [41230.985771] watchdog_timer_fn+0x2cc/0x2d8 [41230.985773] __hrtimer_run_queues+0x19c/0x370 [41230.985775] hrtimer_interrupt+0xec/0x248 [41230.985776] arch_timer_handler_phys+0x30/0x50 [41230.985779] handle_percpu_devid_irq+0x8c/0x230 [41230.985782] generic_handle_domain_irq+0x30/0x50 [41230.985783] __gic_handle_irq_from_irqson.isra.0+0x140/0x260 [41230.985786] gic_handle_irq+0x2c/0xa0 [41230.985787] call_on_irq_stack+0x24/0x30 [41230.985789] do_interrupt_handler+0x80/0x90 [41230.985791] el1_interrupt+0x44/0xa8 [41230.985793] el1h_64_irq_handler+0x14/0x20 [41230.985794] el1h_64_irq+0x78/0x80 [41230.985795] 
arch_counter_get_cntpct+0x14/0x18 [41230.985797] ktime_get+0x48/0xa8 [41230.985799] memcg_lat_stat_start+0x24/0x50 [41230.985801] __alloc_pages_direct_compact+0x58/0x388 [41230.985804] __alloc_pages_slowpath+0x6b8/0x918 [41230.985805] __alloc_pages+0x34c/0x428 [41230.985807] alloc_pages+0x98/0x138 [41230.985809] folio_alloc+0x1c/0x40 [41230.985812] filemap_alloc_folio+0x3c/0xc0 [41230.985814] __filemap_get_folio+0x1e8/0x470 [41230.985816] iomap_get_folio+0x6c/0x88 [41230.985818] iomap_write_begin+0x1c0/0x308 [41230.985820] iomap_write_iter+0xf4/0x280 [41230.985822] iomap_file_buffered_write+0x88/0xf0 [41230.985823] xfs_file_buffered_write+0x98/0x2d0 [xfs] [41230.985868] xfs_file_write_iter+0x104/0x150 [xfs] [41230.985915] vfs_write+0x1a4/0x2f8 [41230.985918] ksys_write+0x70/0x108 [41230.985920] __arm64_sys_write+0x20/0x30 [41230.985923] el0_svc_common.constprop.0+0x60/0x138 [41230.985925] do_el0_svc+0x20/0x30 [41230.985928] el0_svc+0x44/0x1a8 [41230.985929] el0t_64_sync_handler+0xf8/0x128 [41230.985931] el0t_64_sync+0x17c/0x180 [41230.985932] ---[ end trace 0000000000000000 ]--- [41230.985934] Bye! For the original semantics of THP, fallback is required to try lower order folio. Add order fallback in __filemap_get_folio() while loop to try lower order when higher order allocation fails. 
Fixes: 107bdaacdf96 ("mm: mTHP user controls to configure pagecache large folio sizes") Signed-off-by: Zelin Deng Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/4867 --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1bfc68f0ae76..e3f652240b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2026,7 +2026,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) - continue; + goto try_next; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) @@ -2040,6 +2040,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, folio_put(folio); folio = NULL; +try_next: if (order <= min_order) break; order = next_order(&orders, order); -- Gitee