[PATCH] batched slab shrink and registration API

author Andrew Morton <akpm@digeo.com>

Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)

committer Linus Torvalds <torvalds@home.transmeta.com>

Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)
author Andrew Morton <akpm@digeo.com>
Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)
committer Linus Torvalds <torvalds@home.transmeta.com>
Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)
diff --git a/fs/dcache.c b/fs/dcache.c

index 4528be4d90d1098f14721d1b858caad6eed48995..ef0871dbcdb204f7b2dad1e826f568ccc70331ff 100644 (file)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -328,7 +328,7 @@ static inline void prune_one_dentry(struct dentry * dentry)
   * all the dentries are in use.
   */
   
-void prune_dcache(int count)
+static void prune_dcache(int count)
  {
         spin_lock(&dcache_lock);
         for (; count ; count--) {
@@ -572,25 +572,24 @@ void shrink_dcache_anon(struct list_head *head)
   * This is called from kswapd when we think we need some
   * more memory. 
   */
-int shrink_dcache_memory(int ratio, unsigned int gfp_mask)
+static int shrink_dcache_memory(int nr, unsigned int gfp_mask)
  {
-       int entries = dentry_stat.nr_dentry / ratio + 1;
-       /*
-        * Nasty deadlock avoidance.
-        *
-        * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
-        * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->
-        * put_inode->ext2_discard_prealloc->ext2_free_blocks->lock_super->
-        * DEADLOCK.
-        *
-        * We should make sure we don't hold the superblock lock over
-        * block allocations, but for now:
-        */
-       if (!(gfp_mask & __GFP_FS))
-               return 0;
-
-       prune_dcache(entries);
-       return entries;
+       if (nr) {
+               /*
+                * Nasty deadlock avoidance.
+                *
+                * ext2_new_block->getblk->GFP->shrink_dcache_memory->
+                * prune_dcache->prune_one_dentry->dput->dentry_iput->iput->
+                * inode->i_sb->s_op->put_inode->ext2_discard_prealloc->
+                * ext2_free_blocks->lock_super->DEADLOCK.
+                *
+                * We should make sure we don't hold the superblock lock over
+                * block allocations, but for now:
+                */
+               if (gfp_mask & __GFP_FS)
+                       prune_dcache(nr);
+       }
+       return dentry_stat.nr_dentry;
  }
  
  #define NAME_ALLOC_LEN(len)    ((len+16) & ~15)
@@ -1330,6 +1329,8 @@ static void __init dcache_init(unsigned long mempages)
                                          NULL, NULL);
         if (!dentry_cache)
                 panic("Cannot create dentry cache");
+       
+       set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
  
  #if PAGE_SHIFT < 13
         mempages >>= (13 - PAGE_SHIFT);
@@ -1375,9 +1376,6 @@ kmem_cache_t *names_cachep;
  /* SLAB cache for file structures */
  kmem_cache_t *filp_cachep;
  
-/* SLAB cache for dquot structures */
-kmem_cache_t *dquot_cachep;
-
  EXPORT_SYMBOL(d_genocide);
  
  extern void bdev_cache_init(void);
@@ -1397,14 +1395,6 @@ void __init vfs_caches_init(unsigned long mempages)
         if(!filp_cachep)
                 panic("Cannot create filp SLAB cache");
  
-#if defined (CONFIG_QUOTA)
-       dquot_cachep = kmem_cache_create("dquot", 
-                       sizeof(struct dquot), sizeof(unsigned long) * 4,
-                       SLAB_HWCACHE_ALIGN, NULL, NULL);
-       if (!dquot_cachep)
-               panic("Cannot create dquot SLAB cache");
-#endif
-
         dcache_init(mempages);
         inode_init(mempages);
         files_init(mempages); 
diff --git a/fs/dquot.c b/fs/dquot.c

index f97b3609c894540e78173da17c4dc0d16096f2a5..24d50ae3482458804c807195b91ad70c89731328 100644 (file)
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -55,6 +55,7 @@
  #include <linux/errno.h>
  #include <linux/kernel.h>
  #include <linux/fs.h>
+#include <linux/mm.h>
  #include <linux/time.h>
  #include <linux/types.h>
  #include <linux/string.h>
@@ -481,14 +482,14 @@ static void prune_dqcache(int count)
   * more memory
   */
  
-int shrink_dqcache_memory(int ratio, unsigned int gfp_mask)
+static int shrink_dqcache_memory(int nr, unsigned int gfp_mask)
  {
-       int entries = dqstats.allocated_dquots / ratio + 1;
-
-       lock_kernel();
-       prune_dqcache(entries);
-       unlock_kernel();
-       return entries;
+       if (nr) {
+               lock_kernel();
+               prune_dqcache(nr);
+               unlock_kernel();
+       }
+       return dqstats.allocated_dquots;
  }
  
  /*
@@ -1490,6 +1491,9 @@ static ctl_table sys_table[] = {
         {},
  };
  
+/* SLAB cache for dquot structures */
+kmem_cache_t *dquot_cachep;
+
  static int __init dquot_init(void)
  {
         int i;
@@ -1499,9 +1503,17 @@ static int __init dquot_init(void)
                 INIT_LIST_HEAD(dquot_hash + i);
         printk(KERN_NOTICE "VFS: Disk quotas v%s\n", __DQUOT_VERSION__);
  
+       dquot_cachep = kmem_cache_create("dquot", 
+                       sizeof(struct dquot), sizeof(unsigned long) * 4,
+                       SLAB_HWCACHE_ALIGN, NULL, NULL);
+       if (!dquot_cachep)
+               panic("Cannot create dquot SLAB cache");
+
+       set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory);
+
         return 0;
  }
-__initcall(dquot_init);
+module_init(dquot_init);
  
  EXPORT_SYMBOL(register_quota_format);
  EXPORT_SYMBOL(unregister_quota_format);
diff --git a/fs/inode.c b/fs/inode.c

index d567858897305c4c07a2de71faded27b27f96afd..4f56d96031ea2742de3bcaf9b2b4ba404bdf80eb 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -243,22 +243,25 @@ void clear_inode(struct inode *inode)
   * Dispose-list gets a local list with local inodes in it, so it doesn't
   * need to worry about list corruption and SMP locks.
   */
-static void dispose_list(struct list_head * head)
+static void dispose_list(struct list_head *head)
  {
-       struct list_head * inode_entry;
-       struct inode * inode;
+       int nr_disposed = 0;
+
+       while (!list_empty(head)) {
+               struct inode *inode;
  
-       while ((inode_entry = head->next) != head)
-       {
-               list_del(inode_entry);
+               inode = list_entry(head->next, struct inode, i_list);
+               list_del(&inode->i_list);
  
-               inode = list_entry(inode_entry, struct inode, i_list);
                 if (inode->i_data.nrpages)
                         truncate_inode_pages(&inode->i_data, 0);
                 clear_inode(inode);
                 destroy_inode(inode);
-               inodes_stat.nr_inodes--;
+               nr_disposed++;
         }
+       spin_lock(&inode_lock);
+       inodes_stat.nr_inodes -= nr_disposed;
+       spin_unlock(&inode_lock);
  }
  
  /*
@@ -377,7 +380,7 @@ int invalidate_device(kdev_t dev, int do_sync)
          !inode_has_buffers(inode))
  #define INODE(entry)   (list_entry(entry, struct inode, i_list))
  
-void prune_icache(int goal)
+static inline void prune_icache(int goal)
  {
         LIST_HEAD(list);
         struct list_head *entry, *freeable = &list;
@@ -417,23 +420,19 @@ void prune_icache(int goal)
   * This is called from kswapd when we think we need some
   * more memory. 
   */
-int shrink_icache_memory(int ratio, unsigned int gfp_mask)
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
  {
-       int entries = inodes_stat.nr_inodes / ratio + 1;
-       /*
-        * Nasty deadlock avoidance..
-        *
-        * We may hold various FS locks, and we don't
-        * want to recurse into the FS that called us
-        * in clear_inode() and friends..
-        */
-       if (!(gfp_mask & __GFP_FS))
-               return 0;
-
-       prune_icache(entries);
-       return entries;
+       if (nr) {
+               /*
+                * Nasty deadlock avoidance.  We may hold various FS locks,
+                * and we don't want to recurse into the FS that called us
+                * in clear_inode() and friends..
+                */
+               if (gfp_mask & __GFP_FS)
+                       prune_icache(nr);
+       }
+       return inodes_stat.nr_inodes;
  }
-EXPORT_SYMBOL(shrink_icache_memory);
  
  /*
   * Called with the inode lock held.
@@ -1226,4 +1225,6 @@ void __init inode_init(unsigned long mempages)
                                          NULL);
         if (!inode_cachep)
                 panic("cannot create inode slab cache");
+
+       set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
  }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h

index 0abaaaa2c96d7a38a059d7297f8d8acd9e5c2959..71708edafce9f09aaa749cc9582a3c9790080c09 100644 (file)
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -180,17 +180,6 @@ extern void shrink_dcache_parent(struct dentry *);
  extern void shrink_dcache_anon(struct list_head *);
  extern int d_invalidate(struct dentry *);
  
-/* dcache memory management */
-extern int shrink_dcache_memory(int, unsigned int);
-extern void prune_dcache(int);
-
-/* icache memory management (defined in linux/fs/inode.c) */
-extern int shrink_icache_memory(int, unsigned int);
-extern void prune_icache(int);
-
-/* quota cache memory management (defined in linux/fs/dquot.c) */
-extern int shrink_dqcache_memory(int, unsigned int);
-
  /* only used at mount-time */
  extern struct dentry * d_alloc_root(struct inode *);
  
diff --git a/include/linux/mm.h b/include/linux/mm.h

index a5107b5043f79c5f96fe93879572bb0c1dc26fab..a6c66cc418eebac1dde4e04f95c0f22eefff60e2 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -391,6 +391,29 @@ extern     int free_hugepages(struct vm_area_struct *);
  #endif
  
  
+/*
+ * Prototype to add a shrinker callback for ageable caches.
+ * 
+ * These functions are passed a count `nr_to_scan' and a gfpmask.  They should
+ * scan `nr_to_scan' objects, attempting to free them.
+ *
+ * The callback must return the number of objects which remain in the cache.
+ *
+ * The callback will be passed nr_to_scan == 0 when the VM is querying the
+ * cache size, so a fastpath for that case is appropriate.
+ */
+typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
+
+/*
+ * Add an aging callback.  The int is the number of 'seeks' it takes
+ * to recreate one of the objects that these functions age.
+ */
+
+#define DEFAULT_SEEKS 2
+struct shrinker;
+extern struct shrinker *set_shrinker(int, shrinker_t);
+extern void remove_shrinker(struct shrinker *shrinker);
+
  /*
   * If the mapping doesn't provide a set_page_dirty a_op, then
   * just fall through and assume that it wants buffer_heads.
diff --git a/kernel/ksyms.c b/kernel/ksyms.c

index 9beb67e2a99952e459f1094429e4dd92cb40d5ec..bd0a43fcf7f4684ff162c01afe210ee77d84172a 100644 (file)
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -103,6 +103,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
  EXPORT_SYMBOL(kmem_cache_alloc);
  EXPORT_SYMBOL(kmem_cache_free);
  EXPORT_SYMBOL(kmem_cache_size);
+EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(remove_shrinker);
  EXPORT_SYMBOL(kmalloc);
  EXPORT_SYMBOL(kfree);
  EXPORT_SYMBOL(vfree);
@@ -246,7 +248,6 @@ EXPORT_SYMBOL(dput);
  EXPORT_SYMBOL(have_submounts);
  EXPORT_SYMBOL(d_find_alias);
  EXPORT_SYMBOL(d_prune_aliases);
-EXPORT_SYMBOL(prune_dcache);
  EXPORT_SYMBOL(shrink_dcache_sb);
  EXPORT_SYMBOL(shrink_dcache_parent);
  EXPORT_SYMBOL(shrink_dcache_anon);
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 0086407047f62131b8443543d74e6e477b358c65..31856732ed7bf01e12b7cab4b775ae512d0d362e 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -77,9 +77,94 @@ static long total_memory;
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
-#ifndef CONFIG_QUOTA
-#define shrink_dqcache_memory(ratio, gfp_mask) do { } while (0)
-#endif
+/*
+ * The list of shrinker callbacks used by the VM to apply pressure to
+ * ageable caches.
+ */
+struct shrinker {
+       shrinker_t              shrinker;
+       struct list_head        list;
+       int                     seeks;  /* seeks to recreate an obj */
+       int                     nr;     /* objs pending delete */
+};
+
+static LIST_HEAD(shrinker_list);
+static DECLARE_MUTEX(shrinker_sem);
+
+/*
+ * Add a shrinker callback to be called from the vm
+ */
+struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+{
+        struct shrinker *shrinker;
+
+        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
+        if (shrinker) {
+               shrinker->shrinker = theshrinker;
+               shrinker->seeks = seeks;
+               shrinker->nr = 0;
+               down(&shrinker_sem);
+               list_add(&shrinker->list, &shrinker_list);
+               up(&shrinker_sem);
+       }
+       return shrinker;
+}
+
+/*
+ * Remove a shrinker callback from the list and free it
+ */
+void remove_shrinker(struct shrinker *shrinker)
+{
+       down(&shrinker_sem);
+       list_del(&shrinker->list);
+       up(&shrinker_sem);
+       kfree(shrinker);
+}
+ 
+#define SHRINK_BATCH 32
+/*
+ * Call the shrink functions to age shrinkable caches
+ *
+ * Here we assume it costs one seek to replace a lru page and that it also
+ * takes a seek to recreate a cache object.  With this in mind we age equal
+ * percentages of the lru and ageable caches.  This should balance the seeks
+ * generated by these structures.
+ *
+ * If the VM encountered mapped pages on the LRU it increases the pressure on
+ * slab to avoid swapping.
+ *
+ * FIXME: do not do for zone highmem
+ */
+static int shrink_slab(int scanned,  unsigned int gfp_mask)
+{
+       struct list_head *lh;
+       int pages;
+
+       if (down_trylock(&shrinker_sem))
+               return 0;
+
+       pages = nr_used_zone_pages();
+       list_for_each(lh, &shrinker_list) {
+               struct shrinker *shrinker;
+               int entries;
+               unsigned long delta;
+
+               shrinker = list_entry(lh, struct shrinker, list);
+               entries = (*shrinker->shrinker)(0, gfp_mask);
+               if (!entries)
+                       continue;
+               delta = scanned * shrinker->seeks * entries;
+               shrinker->nr += delta / (pages + 1);
+               if (shrinker->nr > SHRINK_BATCH) {
+                       int nr = shrinker->nr;
+
+                       shrinker->nr = 0;
+                       (*shrinker->shrinker)(nr, gfp_mask);
+               }
+       }
+       up(&shrinker_sem);
+       return 0;
+}
  
  /* Must be called with page's pte_chain_lock held. */
  static inline int page_mapping_inuse(struct page * page)
@@ -626,32 +711,6 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
                                 max_scan, nr_mapped);
  }
  
-/*
- * FIXME: don't do this for ZONE_HIGHMEM
- */
-/*
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object.  With this in mind we age equal
- * percentages of the lru and ageable caches.  This should balance the seeks
- * generated by these structures.
- *
- * NOTE: for now I do this for all zones.  If we find this is too aggressive
- * on large boxes we may want to exclude ZONE_HIGHMEM.
- *
- * If we're encountering mapped pages on the LRU then increase the pressure on
- * slab to avoid swapping.
- */
-static void shrink_slab(int total_scanned, int gfp_mask)
-{
-       int shrink_ratio;
-       int pages = nr_used_zone_pages();
-
-       shrink_ratio = (pages / (total_scanned + 1)) + 1;
-       shrink_dcache_memory(shrink_ratio, gfp_mask);
-       shrink_icache_memory(shrink_ratio, gfp_mask);
-       shrink_dqcache_memory(shrink_ratio, gfp_mask);
-}
-
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -695,7 +754,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
         }
         return ret;
  }
-
+ 
  /*
   * This is the main entry point to direct page reclaim.
   *
author	Andrew Morton <akpm@digeo.com>
	Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)
committer	Linus Torvalds <torvalds@home.transmeta.com>
	Sun, 13 Oct 2002 09:58:45 +0000 (02:58 -0700)
fs/dcache.c		patch \| blob \| history
fs/dquot.c		patch \| blob \| history
fs/inode.c		patch \| blob \| history
include/linux/dcache.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
kernel/ksyms.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history