[PATCH] slab reclaim balancing

author Andrew Morton <akpm@digeo.com>

Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)

committer Linus Torvalds <torvalds@home.transmeta.com>

Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)
author Andrew Morton <akpm@digeo.com>
Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)
committer Linus Torvalds <torvalds@home.transmeta.com>
Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)
diff --git a/fs/dcache.c b/fs/dcache.c

index ac127d32eed91703bbd6005487913905299b31f3..1715f006ccd43e1b01178b2ce7f66d29c41d4f75 100644 (file)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -329,12 +329,11 @@ static inline void prune_one_dentry(struct dentry * dentry)
  void prune_dcache(int count)
  {
         spin_lock(&dcache_lock);
-       for (;;) {
+       for (; count ; count--) {
                 struct dentry *dentry;
                 struct list_head *tmp;
  
                 tmp = dentry_unused.prev;
-
                 if (tmp == &dentry_unused)
                         break;
                 list_del_init(tmp);
@@ -349,12 +348,8 @@ void prune_dcache(int count)
                 dentry_stat.nr_unused--;
  
                 /* Unused dentry with a count? */
-               if (atomic_read(&dentry->d_count))
-                       BUG();
-
+               BUG_ON(atomic_read(&dentry->d_count));
                 prune_one_dentry(dentry);
-               if (!--count)
-                       break;
         }
         spin_unlock(&dcache_lock);
  }
@@ -573,19 +568,11 @@ void shrink_dcache_anon(struct list_head *head)
  
  /*
   * This is called from kswapd when we think we need some
- * more memory, but aren't really sure how much. So we
- * carefully try to free a _bit_ of our dcache, but not
- * too much.
- *
- * Priority:
- *   1 - very urgent: shrink everything
- *  ...
- *   6 - base-level: try to shrink a bit.
+ * more memory. 
   */
-int shrink_dcache_memory(int priority, unsigned int gfp_mask)
+int shrink_dcache_memory(int ratio, unsigned int gfp_mask)
  {
-       int count = 0;
-
+       int entries = dentry_stat.nr_dentry / ratio + 1;
         /*
          * Nasty deadlock avoidance.
          *
@@ -600,11 +587,8 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask)
         if (!(gfp_mask & __GFP_FS))
                 return 0;
  
-       count = dentry_stat.nr_unused / priority;
-
-       prune_dcache(count);
-       kmem_cache_shrink(dentry_cache);
-       return 0;
+       prune_dcache(entries);
+       return entries;
  }
  
  #define NAME_ALLOC_LEN(len)    ((len+16) & ~15)
diff --git a/fs/dquot.c b/fs/dquot.c

index 58095d92cbee612c443a7672b3b0ed65cbe361d7..3b1efaef018a66cc16688a57da92657c5d8206fe 100644 (file)
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -480,26 +480,17 @@ static void prune_dqcache(int count)
  
  /*
   * This is called from kswapd when we think we need some
- * more memory, but aren't really sure how much. So we
- * carefully try to free a _bit_ of our dqcache, but not
- * too much.
- *
- * Priority:
- *   1 - very urgent: shrink everything
- *   ...
- *   6 - base-level: try to shrink a bit.
+ * more memory.
   */
  
-int shrink_dqcache_memory(int priority, unsigned int gfp_mask)
+int shrink_dqcache_memory(int ratio, unsigned int gfp_mask)
  {
-       int count = 0;
+       int entries = dqstats.allocated_dquots / ratio + 1;
  
         lock_kernel();
-       count = dqstats.free_dquots / priority;
-       prune_dqcache(count);
+       prune_dqcache(entries);
         unlock_kernel();
-       kmem_cache_shrink(dquot_cachep);
-       return 0;
+       return entries;
  }
  
  /*
diff --git a/fs/inode.c b/fs/inode.c

index 89c96e221043d052e41edc976f2a771480146370..c07e1e7e1a35892e09de8def2ebc1c0f134981b5 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -386,10 +386,11 @@ void prune_icache(int goal)
  
         count = 0;
         entry = inode_unused.prev;
-       while (entry != &inode_unused)
-       {
+       for(; goal; goal--) {
                 struct list_head *tmp = entry;
  
+               if (entry == &inode_unused)
+                       break;
                 entry = entry->prev;
                 inode = INODE(tmp);
                 if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK))
@@ -403,8 +404,6 @@ void prune_icache(int goal)
                 list_add(tmp, freeable);
                 inode->i_state |= I_FREEING;
                 count++;
-               if (!--goal)
-                       break;
         }
         inodes_stat.nr_unused -= count;
         spin_unlock(&inode_lock);
@@ -414,19 +413,11 @@ void prune_icache(int goal)
  
  /*
   * This is called from kswapd when we think we need some
- * more memory, but aren't really sure how much. So we
- * carefully try to free a _bit_ of our icache, but not
- * too much.
- *
- * Priority:
- *   1 - very urgent: shrink everything
- *  ...
- *   6 - base-level: try to shrink a bit.
+ * more memory. 
   */
-int shrink_icache_memory(int priority, int gfp_mask)
+int shrink_icache_memory(int ratio, unsigned int gfp_mask)
  {
-       int count = 0;
-
+       int entries = inodes_stat.nr_inodes / ratio + 1;
         /*
          * Nasty deadlock avoidance..
          *
@@ -437,12 +428,10 @@ int shrink_icache_memory(int priority, int gfp_mask)
         if (!(gfp_mask & __GFP_FS))
                 return 0;
  
-       count = inodes_stat.nr_unused / priority;
-
-       prune_icache(count);
-       kmem_cache_shrink(inode_cachep);
-       return 0;
+       prune_icache(entries);
+       return entries;
  }
+EXPORT_SYMBOL(shrink_icache_memory);
  
  /*
   * Called with the inode lock held.
diff --git a/include/linux/dcache.h b/include/linux/dcache.h

index f99a03f17e606672737dcd818aa075a10b9aabe0..a64a657545fe77bd90e233859af2d6f42d107340 100644 (file)
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -186,7 +186,7 @@ extern int shrink_dcache_memory(int, unsigned int);
  extern void prune_dcache(int);
  
  /* icache memory management (defined in linux/fs/inode.c) */
-extern int shrink_icache_memory(int, int);
+extern int shrink_icache_memory(int, unsigned int);
  extern void prune_icache(int);
  
  /* quota cache memory management (defined in linux/fs/dquot.c) */
diff --git a/include/linux/mm.h b/include/linux/mm.h

index c63e4947387f6aac208bbfa31f43e5f684ddc8bb..482db998aca70ed4eaef95a7cb2a9ada07d08c2a 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -524,6 +524,7 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon
  
  extern struct page * vmalloc_to_page(void *addr);
  extern unsigned long get_page_cache_size(void);
+extern unsigned int nr_used_zone_pages(void);
  
  #endif /* __KERNEL__ */
  
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 435a12dd157495bd2442a161695f0af98d322d54..a1cce719581d70f6a7255f9f8f3f1f65655689c7 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -479,6 +479,17 @@ unsigned int nr_free_pages(void)
         return sum;
  }
  
+unsigned int nr_used_zone_pages(void)	/* returns the sum of nr_active + nr_inactive over every zone */
+{
+       unsigned int pages = 0;
+       struct zone *zone;
+
+       for_each_zone(zone)
+               pages += zone->nr_active + zone->nr_inactive;
+
+       return pages;
+}
+
  static unsigned int nr_free_zone_pages(int offset)
  {
         pg_data_t *pgdat;
diff --git a/mm/slab.c b/mm/slab.c

index 549cd2f465ea97482bc0c0ee37e5f7ea1a68d68b..962598c0b1b76d39ecbd5fa8e1fcb25aec6cc224 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1496,7 +1496,11 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
                 if (unlikely(!--slabp->inuse)) {
                         /* Was partial or full, now empty. */
                         list_del(&slabp->list);
-                       list_add(&slabp->list, &cachep->slabs_free);
+/*                     list_add(&slabp->list, &cachep->slabs_free);            */
+                       if (unlikely(list_empty(&cachep->slabs_partial)))
+                               list_add(&slabp->list, &cachep->slabs_partial);
+                       else
+                               kmem_slab_destroy(cachep, slabp);
                 } else if (unlikely(inuse == cachep->num)) {
                         /* Was full. */
                         list_del(&slabp->list);
@@ -1970,7 +1974,7 @@ static int s_show(struct seq_file *m, void *p)
         }
         list_for_each(q,&cachep->slabs_partial) {
                 slabp = list_entry(q, slab_t, list);
-               if (slabp->inuse == cachep->num || !slabp->inuse)
+               if (slabp->inuse == cachep->num)
                         BUG();
                 active_objs += slabp->inuse;
                 active_slabs++;
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 5eade9423f0d5302b0035a9614bde5bb6b192787..4302f698a7a4d59635c2ebb0ec3239984bfe02cd 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,6 +70,10 @@
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
+#ifndef CONFIG_QUOTA	/* when CONFIG_QUOTA is not defined, calls expand to an empty statement */
+#define shrink_dqcache_memory(ratio, gfp_mask) do { } while (0)
+#endif /* !CONFIG_QUOTA */
+
  /* Must be called with page's pte_chain_lock held. */
  static inline int page_mapping_inuse(struct page * page)
  {
@@ -97,7 +101,7 @@ static inline int is_page_cache_freeable(struct page *page)
  
  static /* inline */ int
  shrink_list(struct list_head *page_list, int nr_pages,
-               unsigned int gfp_mask, int *max_scan)
+               unsigned int gfp_mask, int *max_scan, int *nr_mapped)
  {
         struct address_space *mapping;
         LIST_HEAD(ret_pages);
@@ -116,6 +120,10 @@ shrink_list(struct list_head *page_list, int nr_pages,
                 if (TestSetPageLocked(page))
                         goto keep;
  
+               /* Double the slab pressure for mapped and swapcache pages */
+               if (page_mapped(page) || PageSwapCache(page))
+                       (*nr_mapped)++;
+
                 BUG_ON(PageActive(page));
                 may_enter_fs = (gfp_mask & __GFP_FS) ||
                                 (PageSwapCache(page) && (gfp_mask & __GFP_IO));
@@ -320,7 +328,7 @@ keep:
   */
  static /* inline */ int
  shrink_cache(int nr_pages, struct zone *zone,
-               unsigned int gfp_mask, int max_scan)
+               unsigned int gfp_mask, int max_scan, int *nr_mapped)
  {
         LIST_HEAD(page_list);
         struct pagevec pvec;
@@ -371,7 +379,8 @@ shrink_cache(int nr_pages, struct zone *zone,
  
                 max_scan -= nr_scan;
                 KERNEL_STAT_ADD(pgscan, nr_scan);
-               nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan);
+               nr_pages = shrink_list(&page_list, nr_pages,
+                               gfp_mask, &max_scan, nr_mapped);
  
                 if (nr_pages <= 0 && list_empty(&page_list))
                         goto done;
@@ -522,14 +531,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
  
  static /* inline */ int
  shrink_zone(struct zone *zone, int max_scan,
-               unsigned int gfp_mask, int nr_pages)
+               unsigned int gfp_mask, int nr_pages, int *nr_mapped)
  {
         unsigned long ratio;
  
-       /* This is bogus for ZONE_HIGHMEM? */
-       if (kmem_cache_reap(gfp_mask) >= nr_pages)
-               return 0;
-
         /*
          * Try to keep the active list 2/3 of the size of the cache.  And
          * make sure that refill_inactive is given a decent number of pages.
@@ -547,7 +552,8 @@ shrink_zone(struct zone *zone, int max_scan,
                 atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
                 refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
         }
-       nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan);
+       nr_pages = shrink_cache(nr_pages, zone, gfp_mask,
+                               max_scan, nr_mapped);
         return nr_pages;
  }
  
@@ -557,6 +563,9 @@ shrink_caches(struct zone *classzone, int priority,
  {
         struct zone *first_classzone;
         struct zone *zone;
+       int ratio;
+       int nr_mapped = 0;
+       int pages = nr_used_zone_pages();
  
         first_classzone = classzone->zone_pgdat->node_zones;
         for (zone = classzone; zone >= first_classzone; zone--) {
@@ -581,16 +590,28 @@ shrink_caches(struct zone *classzone, int priority,
                 max_scan = zone->nr_inactive >> priority;
                 if (max_scan < to_reclaim * 2)
                         max_scan = to_reclaim * 2;
-               unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim);
+               unreclaimed = shrink_zone(zone, max_scan,
+                               gfp_mask, to_reclaim, &nr_mapped);
                 nr_pages -= to_reclaim - unreclaimed;
                 *total_scanned += max_scan;
         }
  
-       shrink_dcache_memory(priority, gfp_mask);
-       shrink_icache_memory(1, gfp_mask);
-#ifdef CONFIG_QUOTA
-       shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
-#endif
+       /*
+        * Here we assume it costs one seek to replace a lru page and that
+        * it also takes a seek to recreate a cache object.  With this in
+        * mind we age equal percentages of the lru and ageable caches.
+        * This should balance the seeks generated by these structures.
+        *
+        * NOTE: for now I do this for all zones.  If we find this is too
+        * aggressive on large boxes we may want to exclude ZONE_HIGHMEM.
+        *
+        * If we're encountering mapped pages on the LRU then increase the
+        * pressure on slab to avoid swapping.
+        */
+       ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1;
+       shrink_dcache_memory(ratio, gfp_mask);
+       shrink_icache_memory(ratio, gfp_mask);
+       shrink_dqcache_memory(ratio, gfp_mask);
         return nr_pages;
  }
author	Andrew Morton <akpm@digeo.com>
	Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)
committer	Linus Torvalds <torvalds@home.transmeta.com>
	Wed, 25 Sep 2002 14:20:18 +0000 (07:20 -0700)
fs/dcache.c		patch \| blob \| history
fs/dquot.c		patch \| blob \| history
fs/inode.c		patch \| blob \| history
include/linux/dcache.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/slab.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history