mm: swap: reclaim the cached parts that got scanned
This commit implements reclaim during scan for the cluster allocator. Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which could result in a low allocation success rate or early OOM. To ensure the maximum allocation success rate, integrate reclaim with scanning: if a suitable range of swap slots is found but is fragmented due to HAS_CACHE, try to reclaim those slots.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit 661383c611
parent 477cb7ba28
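The patch below splits the work into a scan pass that tolerates SWAP_HAS_CACHE slots (via cluster_scan_range()) and a reclaim-and-recheck pass (via cluster_reclaim_range()) that drops the locks, tries to reclaim the cached slots, and re-verifies the range. The following standalone C sketch models only that control flow against a plain byte array; the constant values, the toy_reclaim_slot() helper, and the single-threaded recheck are illustrative assumptions, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

#define SLOT_FREE      0x00
#define SLOT_HAS_CACHE 0x40	/* stands in for SWAP_HAS_CACHE */

/* Toy stand-in for swap cache reclaim: pretend dropping the cache succeeds. */
static bool toy_reclaim_slot(unsigned char *map, unsigned long off)
{
	map[off] = SLOT_FREE;
	return true;
}

/*
 * Model of the scan-then-reclaim flow: accept a range if every slot is free,
 * or becomes free after reclaiming cache-only slots (only when swap is full).
 */
static bool scan_then_reclaim(unsigned char *map, unsigned long start,
			      unsigned long nr, bool swap_full)
{
	bool need_reclaim = false;
	unsigned long off;

	for (off = start; off < start + nr; off++) {
		if (map[off] == SLOT_FREE)
			continue;
		if (map[off] == SLOT_HAS_CACHE && swap_full) {
			need_reclaim = true;	/* cache-only, maybe reclaimable */
			continue;
		}
		return false;			/* slot holds real data, give up */
	}

	if (!need_reclaim)
		return true;

	for (off = start; off < start + nr; off++)
		if (map[off] == SLOT_HAS_CACHE && !toy_reclaim_slot(map, off))
			return false;

	/* Recheck: in the kernel the locks were dropped, so slots may change. */
	for (off = start; off < start + nr; off++)
		if (map[off] != SLOT_FREE)
			return false;
	return true;
}

int main(void)
{
	unsigned char map[8] = { 0, SLOT_HAS_CACHE, 0, 0, SLOT_HAS_CACHE, 0, 0, 0 };

	printf("order-3 range usable: %d\n", scan_then_reclaim(map, 0, 8, true));
	return 0;
}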
include/linux/swap.h
@@ -301,6 +301,7 @@ struct swap_info_struct {
 					/* list of cluster that contains at least one free slot */
 	struct list_head frag_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that are fragmented or contented */
+	unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
mm/swapfile.c
@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
 	VM_BUG_ON(ci->count != 0);
 	lockdep_assert_held(&si->lock);
 	lockdep_assert_held(&ci->lock);
+
+	if (ci->flags & CLUSTER_FLAG_FRAG)
+		si->frag_cluster_nr[ci->order]--;
+
 	/*
 	 * If the swap is discardable, prepare discard the cluster
 	 * instead of free it immediately. The cluster will be freed
@@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 
 	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
 		VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
-		if (ci->flags & CLUSTER_FLAG_FRAG)
+		if (ci->flags & CLUSTER_FLAG_FRAG) {
+			p->frag_cluster_nr[ci->order]--;
 			list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
-		else
+		} else {
 			list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+		}
 		ci->flags = CLUSTER_FLAG_NONFULL;
 	}
 }
 
-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
-				      unsigned int nr_pages)
+static bool cluster_reclaim_range(struct swap_info_struct *si,
+				  struct swap_cluster_info *ci,
+				  unsigned long start, unsigned long end)
 {
-	unsigned char *p = si->swap_map + start;
-	unsigned char *end = p + nr_pages;
+	unsigned char *map = si->swap_map;
+	unsigned long offset;
 
-	while (p < end)
-		if (*p++)
+	spin_unlock(&ci->lock);
+	spin_unlock(&si->lock);
+
+	for (offset = start; offset < end; offset++) {
+		switch (READ_ONCE(map[offset])) {
+		case 0:
+			continue;
+		case SWAP_HAS_CACHE:
+			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
+				continue;
+			goto out;
+		default:
+			goto out;
+		}
+	}
+out:
+	spin_lock(&si->lock);
+	spin_lock(&ci->lock);
+
+	/*
+	 * Recheck the range no matter reclaim succeeded or not, the slot
+	 * could have been be freed while we are not holding the lock.
+	 */
+	for (offset = start; offset < end; offset++)
+		if (READ_ONCE(map[offset]))
 			return false;
 
 	return true;
 }
 
-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
-				       unsigned int start, unsigned char usage,
-				       unsigned int order)
+static bool cluster_scan_range(struct swap_info_struct *si,
+			       struct swap_cluster_info *ci,
+			       unsigned long start, unsigned int nr_pages)
+{
+	unsigned long offset, end = start + nr_pages;
+	unsigned char *map = si->swap_map;
+	bool need_reclaim = false;
+
+	for (offset = start; offset < end; offset++) {
+		switch (READ_ONCE(map[offset])) {
+		case 0:
+			continue;
+		case SWAP_HAS_CACHE:
+			if (!vm_swap_full())
+				return false;
+			need_reclaim = true;
+			continue;
+		default:
+			return false;
+		}
+	}
+
+	if (need_reclaim)
+		return cluster_reclaim_range(si, ci, start, end);
+
+	return true;
+}
+
+static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+				unsigned int start, unsigned char usage,
+				unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
 	if (ci->count == SWAPFILE_CLUSTER) {
 		VM_BUG_ON(!(ci->flags &
 			  (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
+		if (ci->flags & CLUSTER_FLAG_FRAG)
+			si->frag_cluster_nr[ci->order]--;
 		list_del(&ci->list);
 		ci->flags = 0;
 	}
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned
 	}
 
 	while (offset <= end) {
-		if (cluster_scan_range(si, offset, nr_pages)) {
+		if (cluster_scan_range(si, ci, offset, nr_pages)) {
 			cluster_alloc_range(si, ci, offset, usage, order);
 			*foundp = offset;
 			if (ci->count == SWAPFILE_CLUSTER) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
 					      unsigned char usage)
 {
 	struct percpu_cluster *cluster;
-	struct swap_cluster_info *ci, *n;
+	struct swap_cluster_info *ci;
 	unsigned int offset, found = 0;
-	LIST_HEAD(fraged);
 
 new_cluster:
 	lockdep_assert_held(&si->lock);
@@ -690,25 +748,42 @@ new_cluster:
 	}
 
 	if (order < PMD_ORDER) {
-		list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
-			list_move_tail(&ci->list, &fraged);
+		unsigned int frags = 0;
+
+		while (!list_empty(&si->nonfull_clusters[order])) {
+			ci = list_first_entry(&si->nonfull_clusters[order],
+					      struct swap_cluster_info, list);
+			list_move_tail(&ci->list, &si->frag_clusters[order]);
 			ci->flags = CLUSTER_FLAG_FRAG;
+			si->frag_cluster_nr[order]++;
 			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 							 &found, order, usage);
+			frags++;
 			if (found)
 				break;
 		}
 
 		if (!found) {
-			list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
+			/*
+			 * Nonfull clusters are moved to frag tail if we reached
+			 * here, count them too, don't over scan the frag list.
+			 */
+			while (frags < si->frag_cluster_nr[order]) {
+				ci = list_first_entry(&si->frag_clusters[order],
+						      struct swap_cluster_info, list);
+				/*
+				 * Rotate the frag list to iterate, they were all failing
+				 * high order allocation or moved here due to per-CPU usage,
+				 * this help keeping usable cluster ahead.
+				 */
+				list_move_tail(&ci->list, &si->frag_clusters[order]);
 				offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 								 &found, order, usage);
+				frags++;
 				if (found)
 					break;
 			}
 		}
-
-		list_splice_tail(&fraged, &si->frag_clusters[order]);
 	}
 
 	if (found)
@@ -729,25 +804,28 @@ new_cluster:
 
 	/* Order 0 stealing from higher order */
 	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
-		if (!list_empty(&si->frag_clusters[o])) {
+		/*
+		 * Clusters here have at least one usable slots and can't fail order 0
+		 * allocation, but reclaim may drop si->lock and race with another user.
+		 */
+		while (!list_empty(&si->frag_clusters[o])) {
 			ci = list_first_entry(&si->frag_clusters[o],
 					      struct swap_cluster_info, list);
-			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
-							 0, usage);
-			VM_BUG_ON(!found);
-			goto done;
-		}
-
-		if (!list_empty(&si->nonfull_clusters[o])) {
-			ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
-					      list);
 			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 							 &found, 0, usage);
-			VM_BUG_ON(!found);
+			if (found)
+				goto done;
+		}
+
+		while (!list_empty(&si->nonfull_clusters[o])) {
+			ci = list_first_entry(&si->nonfull_clusters[o],
+					      struct swap_cluster_info, list);
+			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+							 &found, 0, usage);
+			if (found)
 				goto done;
 		}
 	}
 
 done:
 	cluster->next[order] = offset;
 	return found;
@@ -3042,6 +3120,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	for (i = 0; i < SWAP_NR_ORDERS; i++) {
 		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
 		INIT_LIST_HEAD(&p->frag_clusters[i]);
+		p->frag_cluster_nr[i] = 0;
 	}
 
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3085,7 +3164,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	if (!cluster_info)
 		return nr_extents;
 
-
 	/*
 	 * Reduce false cache line sharing between cluster_info and
 	 * sharing same address space.