From 36fb7f8459cc42eca202f0ad7b2d051359406d57 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Thu, 21 Nov 2002 19:32:34 -0800
Subject: [PATCH] [PATCH] handle zones which are full of unreclaimable pages

This patch is a general solution to the situation where a zone is full
of pinned pages.

This can come about if:

a) Someone has allocated all of ZONE_DMA for IO buffers

b) Some application is mlocking some memory and a zone ends up full
   of mlocked pages (can happen on a 1G ia32 system)

c) All of ZONE_HIGHMEM is pinned in hugetlb pages (can happen on 1G
   machines)

We'll currently burn 10% of CPU in kswapd when this happens, although
it is quite hard to trigger.

The algorithm is:

- If page reclaim has scanned more than twice the total number of pages
  in the zone and no pages have been freed in that zone in the meantime,
  then mark the zone as "all unreclaimable".

- When a zone is "all unreclaimable", page reclaim almost ignores it.
  We will perform a "light" scan at DEF_PRIORITY (typically 1/4096th of
  the zone, or 64 pages) and then forget about the zone.

- When a batch of pages is freed into the zone, clear its "all
  unreclaimable" state and start full scanning again.  The assumption
  is that some state change has come about which will make reclaim
  successful again.

  So if a "light scan" actually frees some pages, the zone will revert to
  normal state immediately.

So we're effectively putting the zone into "low power" mode, and lightly
polling it to see if something has changed.

The code works OK, but is quite hard to test - I mainly tested it by
pinning all highmem in hugetlb pages.
---
 include/linux/mmzone.h |  3 +++
 mm/page_alloc.c        | 16 ++++++++++------
 mm/vmscan.c            | 35 ++++++++++++++++++++++++++++++++++-
 3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e004bc2ff63..f286bf9aeefd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -84,6 +84,8 @@ struct zone {
 	atomic_t		refill_counter;
 	unsigned long		nr_active;
 	unsigned long		nr_inactive;
+	int			all_unreclaimable; /* All pages pinned */
+	unsigned long		pages_scanned;	   /* since last reclaim */
 
 	ZONE_PADDING(_pad2_)
 
@@ -203,6 +205,7 @@ memclass(struct zone *pgzone, struct zone *classzone)
 
 void get_zone_counts(unsigned long *active, unsigned long *inactive);
 void build_all_zonelists(void);
+void wakeup_kswapd(struct zone *zone);
 
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 16b70897ca42..d8921e02318a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -167,6 +167,12 @@ static inline void free_pages_check(const char *function, struct page *page)
  * Frees a list of pages. 
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
  */
 static int
 free_pages_bulk(struct zone *zone, int count,
@@ -181,6 +187,8 @@ free_pages_bulk(struct zone *zone, int count,
 	base = zone->zone_mem_map;
 	area = zone->free_area + order;
 	spin_lock_irqsave(&zone->lock, flags);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
 	while (!list_empty(list) && count--) {
 		page = list_entry(list->prev, struct page, list);
 		/* have to delete it as __free_pages_bulk list manipulates */
@@ -464,12 +472,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-		if (z->free_pages <= z->pages_low &&
-		    waitqueue_active(&z->zone_pgdat->kswapd_wait))
-			wake_up_interruptible(&z->zone_pgdat->kswapd_wait);
-	}
+	for (i = 0; zones[i] != NULL; i++)
+		wakeup_kswapd(zones[i]);
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
 	min = 1UL << order;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 42754033157b..42dfeadc6d9d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -468,6 +468,7 @@ shrink_cache(const int nr_pages, struct zone *zone,
 			nr_taken++;
 		}
 		zone->nr_inactive -= nr_taken;
+		zone->pages_scanned += nr_taken;
 		spin_unlock_irq(&zone->lru_lock);
 
 		if (nr_taken == 0)
@@ -720,6 +721,9 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  *    satisfy the `incremental min' zone defense algorithm.
  *
  * Returns the number of reclaimed pages.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
  */
 static int
 shrink_caches(struct zone *classzone, int priority, int *total_scanned,
@@ -735,6 +739,9 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
 		int nr_mapped = 0;
 		int max_scan;
 
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			continue;	/* Let kswapd poll it */
+
 		/*
 		 * If we cannot reclaim `nr_pages' pages by scanning twice
 		 * that many pages then fall back to the next zone.
@@ -817,6 +824,14 @@ try_to_free_pages(struct zone *classzone,
  * special.
  *
  * Returns the number of pages which were actually freed.
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim.  Mark the zone as
+ * dead and from now on, only perform a short scan.  Basically we're polling
+ * the zone for when the problem goes away.
  */
 static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 {
@@ -833,6 +848,9 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 			int max_scan;
 			int to_reclaim;
 
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
+
 			if (nr_pages && to_free > 0) {	/* Software suspend */
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
 			} else {			/* Zone balancing */
@@ -849,6 +867,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 			to_free -= shrink_zone(zone, max_scan, GFP_KSWAPD,
 					to_reclaim, &nr_mapped, ps, priority);
 			shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
+			if (zone->all_unreclaimable)
+				continue;
+			if (zone->pages_scanned > zone->present_pages * 2)
+				zone->all_unreclaimable = 1;
 		}
 		if (all_zones_ok)
 			break;
@@ -909,6 +931,18 @@ int kswapd(void *p)
 	}
 }
 
+/*
+ * Wake the kswapd task waiting on this zone's pgdat, but only if the zone's
+ * free page count has fallen to pages_low or below and there is actually a
+ * sleeper on the pgdat's kswapd_wait queue.
+ */
+void wakeup_kswapd(struct zone *zone)
+{
+	if (zone->free_pages <= zone->pages_low &&
+	    waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+		wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
+}
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 /*
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
@@ -938,7 +972,6 @@ int shrink_all_memory(int nr_pages)
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
-	printk("Starting kswapd\n");
 	swap_setup();
 	for_each_pgdat(pgdat)
 		kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-- 
2.39.5