VERSION = 2
PATCHLEVEL = 2
SUBLEVEL = 19
-EXTRAVERSION = pre1
+EXTRAVERSION = pre2
ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
if (CURRENT->cmd == READ)
memset(CURRENT->buffer, 0, len);
else
- set_bit(BH_Protected, &CURRENT->bh->b_state);
+ mark_buffer_protected(CURRENT->bh);
end_request(1);
goto repeat;
static int dump_write(struct file *file, const void *addr, int nr)
{
int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
return r;
}
static int dump_write(struct file *file, const void *addr, int nr)
{
int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
return r;
}
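Throughout the patch, down()/up() pairs on filesystem semaphores such as i_sem become fs_down()/fs_up(). These are wrappers added in include/linux/fs.h further down: besides taking the semaphore they keep a per-task count of held filesystem locks, which the reclaim paths consult to avoid re-entering the filesystem; see the sketch after the lock_super()/unlock_super() hunk below.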
static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST];
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;
goto out_putf;
/* We need to protect against concurrent writers.. */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out_putf:
fput(file);
goto out_putf;
/* this needs further work, at the moment it is identical to fsync() */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out_putf:
fput(file);
return;
}
nr_buffers_type[bh->b_list]--;
+ size_buffers_type[bh->b_list] -= bh->b_size;
remove_from_hash_queue(bh);
remove_from_lru_list(bh);
}
(*bhp)->b_prev_free = bh;
nr_buffers_type[bh->b_list]++;
+ size_buffers_type[bh->b_list] += bh->b_size;
/* Put the buffer in new hash-queue if it has a device. */
bh->b_next = NULL;
{
struct buffer_head * bh;
bh = find_buffer(dev,block,size);
- if (bh)
+ if (bh) {
bh->b_count++;
+ touch_buffer(bh);
+ }
return bh;
}
insert_into_queues(bh);
}
+/* -1 -> no need to flush
+ 0 -> async flush
+ 1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+ unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+ dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+ tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+ tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+ dirty *= 200;
+ soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+ hard_dirty_limit = soft_dirty_limit * 2;
+
+ if (dirty > soft_dirty_limit)
+ {
+ if (dirty > hard_dirty_limit)
+ return 1;
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * If a new dirty buffer is created, we need to balance bdflush.
+ *
+ * In the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+ int state = balance_dirty_state(dev);
+
+ if (state < 0)
+ return;
+ wakeup_bdflush(state);
+}
+
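A note on the arithmetic above: balance_dirty_state() never divides, it scales both sides of the comparison, so the soft limit fires once dirty/tot exceeds nfract/200 and the hard limit at exactly twice that ratio. A standalone model of the same computation (illustration only; nfract = 40 is an assumed sample value, the real one comes from bdf_prm.b_un.nfract):

#include <stdio.h>

static int model_balance_dirty_state(unsigned long dirty, unsigned long tot,
				     unsigned long nfract)
{
	unsigned long soft_dirty_limit, hard_dirty_limit;

	dirty *= 200;
	soft_dirty_limit = tot * nfract;
	hard_dirty_limit = soft_dirty_limit * 2;

	if (dirty > soft_dirty_limit) {
		if (dirty > hard_dirty_limit)
			return 1;	/* sync flush */
		return 0;		/* async flush */
	}
	return -1;			/* no need to flush */
}

int main(void)
{
	unsigned long tot = 16384;	/* e.g. 64MB of 4k pages */

	/* with nfract = 40, the soft limit is 20% dirty and the hard limit 40% */
	printf("10%% dirty -> %d\n", model_balance_dirty_state(tot / 10, tot, 40));
	printf("25%% dirty -> %d\n", model_balance_dirty_state(tot / 4, tot, 40));
	printf("50%% dirty -> %d\n", model_balance_dirty_state(tot / 2, tot, 40));
	return 0;
}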
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
printk("Attempt to refile free buffer\n");
return;
}
- if (buffer_dirty(buf))
+ if (buffer_protected(buf))
+ dispose = BUF_PROTECTED;
+ else if (buffer_dirty(buf))
dispose = BUF_DIRTY;
else if (buffer_locked(buf))
dispose = BUF_LOCKED;
if(dispose != buf->b_list) {
file_buffer(buf, dispose);
if(dispose == BUF_DIRTY) {
- int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
- /* This buffer is dirty, maybe we need to start flushing.
- * If too high a percentage of the buffers are dirty...
- */
- if (nr_buffers_type[BUF_DIRTY] > too_many)
- wakeup_bdflush(1);
+ balance_dirty(buf->b_dev);
/* If this is a loop device, and
* more than half of the buffers are dirty...
/* If dirty, mark the time this buffer should be written back. */
set_writetime(buf, 0);
refile_buffer(buf);
- touch_buffer(buf);
if (buf->b_count) {
buf->b_count--;
}
tmp->b_this_page = bh;
free_list[isize] = bh;
+ mem_map[MAP_NR(page)].flags = 0;
mem_map[MAP_NR(page)].buffers = bh;
buffermem += PAGE_SIZE;
return 1;
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
-static int sync_page_buffers(struct page * page, int wait)
+static void sync_page_buffers(struct page * page)
{
- struct buffer_head * bh = page->buffers;
- struct buffer_head * tmp = bh;
+ struct buffer_head * tmp, * bh = page->buffers;
+ /*
+ * Here we'll probably sleep and so we must make sure that
+ * the page doesn't go away from under us. We also prefer that
+ * any concurrent try_to_free_buffers() leaves our current
+ * page alone while we're working on it.
+ * As always in 2.2.x we're serialized by the big kernel lock
+ * during these hacky page-visibility manipulations.
+ *
+ * SUBTLE NOTE: for things like LVM snapshotting, WRITEA will block too!
+ */
page->buffers = NULL;
+ tmp = bh;
do {
struct buffer_head *p = tmp;
tmp = tmp->b_this_page;
- if (buffer_locked(p)) {
- if (wait)
- __wait_on_buffer(p);
- } else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
- } while (tmp != bh);
- page->buffers = bh;
-
- do {
- struct buffer_head *p = tmp;
- tmp = tmp->b_this_page;
- if (buffer_busy(p))
- return 1;
+ if (buffer_dirty(p))
+ if (test_and_set_bit(BH_Wait_IO, &p->b_state))
+ ll_rw_block(WRITE, 1, &p);
} while (tmp != bh);
- return 0;
+ /* Restore the visibility of the page before returning. */
+ page->buffers = bh;
}
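The BH_Wait_IO test reads backwards at first sight: test_and_set_bit() returns the previous value of the bit, so the first pass over a dirty buffer only marks it, and the write is issued when the buffer is met dirty a second time, throttling write-out. A userspace model of the two-pass behaviour (illustration only; the kernel's test_and_set_bit() is atomic, this stand-in is not):

#include <stdio.h>

#define BH_Wait_IO 7

/* non-atomic stand-in for the kernel helper of the same name */
static int test_and_set_bit(int nr, unsigned long *addr)
{
	int old = (*addr >> nr) & 1;

	*addr |= 1UL << nr;
	return old;
}

int main(void)
{
	unsigned long b_state = 0;
	int pass;

	for (pass = 1; pass <= 2; pass++) {
		if (test_and_set_bit(BH_Wait_IO, &b_state))
			printf("pass %d: ll_rw_block() would be called\n", pass);
		else
			printf("pass %d: buffer only marked\n", pass);
	}
	return 0;
}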
/*
* Wake up bdflush() if this fails - if we're running low on memory due
* to dirty buffers, we need to flush them out as quickly as possible.
*/
-int try_to_free_buffers(struct page * page_map, int wait)
+int try_to_free_buffers(struct page * page_map, int gfp_mask)
{
struct buffer_head * tmp, * bh = page_map->buffers;
- int too_many;
tmp = bh;
do {
tmp = tmp->b_this_page;
} while (tmp != bh);
- succeed:
- tmp = bh;
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
return 1;
busy:
- too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
+ if (gfp_mask & __GFP_IO)
+ sync_page_buffers(page_map);
- if (!sync_page_buffers(page_map, wait)) {
-
- /* If a high percentage of the buffers are dirty,
- * wake kflushd
- */
- if (nr_buffers_type[BUF_DIRTY] > too_many)
- wakeup_bdflush(0);
-
- /*
- * We can jump after the busy check because
- * we rely on the kernel lock.
- */
- goto succeed;
- }
-
- if(nr_buffers_type[BUF_DIRTY] > too_many)
+ if (balance_dirty_state(NODEV) >= 0)
wakeup_bdflush(0);
+
return 0;
}
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
int protected = 0;
int nlist;
- static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
+ static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
printk("Buffer memory: %8ldkB\n",buffermem>>10);
printk("Buffer heads: %6d\n",nr_buffer_heads);
used++, lastused = found;
bh = bh->b_next_free;
} while (bh != lru_list[nlist]);
- printk("%8s: %d buffers, %d used (last=%d), "
+ printk("%9s: %d buffers, %d used (last=%d), "
"%d locked, %d protected, %d dirty\n",
buf_types[nlist], found, used, lastused,
locked, protected, dirty);
/* If there are still a lot of dirty buffers around, skip the sleep
and flush some more */
- if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+ if (!ndirty || balance_dirty_state(NODEV) < 0)
+ {
spin_lock_irq(&current->sigmask_lock);
flush_signals(current);
spin_unlock_irq(&current->sigmask_lock);
return -1;
}
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
result = cont_file.f_op->write(&cont_file , buff, count,
&(cont_file.f_pos));
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
if (result)
coda_prepare_openfile(coda_inode, coda_file, cont_inode,
&cont_file, &cont_dentry);
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
result = file_fsync(&cont_file ,&cont_dentry);
if ( result == 0 ) {
result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid));
}
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
return result;
*/
void shrink_dcache_memory(int priority, unsigned int gfp_mask)
{
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
int count = 0;
- if (priority)
+ if (priority > 1)
count = dentry_stat.nr_unused / priority;
prune_dcache(count, -1);
}
if ((off_t) length < 0)
return -EINVAL;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
newattrs.ia_size = length;
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
error = notify_change(dentry, &newattrs);
if (inode->i_op && inode->i_op->truncate)
inode->i_op->truncate(inode);
}
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
return error;
}
if (!file->f_op || !(write = file->f_op->write))
goto out;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
ret = write(file, buf, count, &file->f_pos);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out:
fput(file);
bad_file:
if (!file)
goto bad_file;
if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) {
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
ret = do_readv_writev(VERIFY_READ, file, vector, count);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
}
fput(file);
if (pos < 0)
goto out;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
ret = write(file, buf, count, &pos);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
out:
fput(file);
#define BH_Lock 2 /* 1 if the buffer is locked */
#define BH_Req 3 /* 0 if the buffer has been invalidated */
#define BH_Protected 6 /* 1 if the buffer is protected */
+#define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
/*
* Try to keep the most commonly used fields in single cache lines (16
extern void refile_buffer(struct buffer_head * buf);
extern void set_writetime(struct buffer_head * buf, int flag);
-extern int try_to_free_buffers(struct page *, int wait);
+extern int try_to_free_buffers(struct page *, int);
extern int nr_buffers;
extern long buffermem;
#define BUF_CLEAN 0
#define BUF_LOCKED 1 /* Buffers scheduled for write */
#define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */
-#define NR_LIST 3
+#define BUF_PROTECTED 3 /* Ramdisk persistent storage */
+#define NR_LIST 4
void mark_buffer_uptodate(struct buffer_head * bh, int on);
+extern inline void mark_buffer_protected(struct buffer_head * bh)
+{
+ if (!test_and_set_bit(BH_Protected, &bh->b_state)) {
+ if (bh->b_list != BUF_PROTECTED)
+ refile_buffer(bh);
+ }
+}
+
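This inline is what the drivers/block/rd.c hunk at the top of the patch now calls in place of the open-coded set_bit(): besides setting BH_Protected it refiles the buffer onto the new BUF_PROTECTED list, so the size_buffers_type[] accounting in fs/buffer.c can subtract ramdisk-backed memory from the dirty-balancing totals.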
extern inline void mark_buffer_clean(struct buffer_head * bh)
{
if (test_and_clear_bit(BH_Dirty, &bh->b_state)) {
if (bh->b_list == BUF_DIRTY)
refile_buffer(bh);
+ clear_bit(BH_Wait_IO, &bh->b_state);
}
}
extern __u32 inode_generation_count;
+#define fs_down(sem) do { current->fs_locks++; down(sem); } while (0)
+#define fs_up(sem) do { up(sem); current->fs_locks--; } while (0)
+
#endif /* __KERNEL__ */
#endif
if (sb->s_lock)
__wait_on_super(sb);
sb->s_lock = 1;
+ current->fs_locks++;
}
extern inline void unlock_super(struct super_block * sb)
{
+ current->fs_locks--;
sb->s_lock = 0;
wake_up(&sb->s_wait);
}
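Taken together, fs_down()/fs_up() and lock_super()/unlock_super() maintain current->fs_locks as a simple recursion guard: the reclaim paths later in the patch (note the !current->fs_locks checks) refuse to start filesystem I/O while the task already holds a filesystem lock. A minimal userspace model of the idea (illustration only, names hypothetical):

#include <stdio.h>

struct task { int fs_locks; };
static struct task current_task;	/* stands in for *current */

#define fs_down(t)	do { (t)->fs_locks++; /* down(&inode->i_sem) */ } while (0)
#define fs_up(t)	do { /* up(&inode->i_sem) */ (t)->fs_locks--; } while (0)

/* mirrors the "!current->fs_locks" checks added in mm/vmscan.c */
static int may_enter_fs(const struct task *t)
{
	return !t->fs_locks;
}

int main(void)
{
	fs_down(&current_task);
	printf("reclaim may enter the fs: %d\n", may_enter_fs(&current_task)); /* 0 */
	fs_up(&current_task);
	printf("reclaim may enter the fs: %d\n", may_enter_fs(&current_task)); /* 1 */
	return 0;
}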
struct wait_queue *wait;
struct page **pprev_hash;
struct buffer_head * buffers;
- int age;
} mem_map_t;
-#define PAGE_AGE_INITIAL 1 /* age for pages just mapped */
-#define PAGE_AGE_YOUNG 2 /* age for pages recently referenced */
-
/* Page flag bit values */
#define PG_locked 0
#define PG_error 1
struct files_struct *files;
/* memory management info */
struct mm_struct *mm;
+ struct list_head local_pages;
+ int allocation_order, nr_local_pages;
+ int fs_locks;
/* signal handlers */
spinlock_t sigmask_lock; /* Protects signal and blocked */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_VFORK 0x00001000 /* Wake up parent in mm_release */
+#define PF_FREE_PAGES 0x00002000 /* Freed pages go to current->local_pages */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */
#define PF_DTRACE 0x00200000 /* delayed trace (used on m68k, i386) */
/* tss */ INIT_TSS, \
/* fs */ &init_fs, \
/* files */ &init_files, \
-/* mm */ &init_mm, \
+/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \
/* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
/* exec cts */ 0,0, \
/* oom */ 0, \
extern int bdflush(void *);
extern int kupdate(void *);
extern int kswapd(void *);
-extern int kpiod(void *);
extern void kswapd_setup(void);
extern unsigned long init_IRQ( unsigned long);
extern void init_modules(void);
kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
/* Start the background pageout daemon. */
kswapd_setup();
- kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
#if CONFIG_AP1000
}
/*
- * Goes through counter = (shm_rss >> prio) present shm pages.
+ * Goes through counter = (shm_rss / prio) present shm pages.
*/
static unsigned long swap_id = 0; /* currently being swapped */
static unsigned long swap_idx = 0; /* next to swap */
int loop = 0;
int counter;
- counter = shm_rss >> prio;
+ counter = shm_rss / prio;
if (!counter || !(swap_nr = get_swap_page()))
return 0;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
+ INIT_LIST_HEAD(&p->local_pages);
+
retval = -ENOMEM;
/* copy all the process information */
if (copy_files(clone_flags, p))
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
-#include <linux/slab.h>
#include <linux/init.h>
#include <asm/pgtable.h>
unsigned int page_hash_bits, page_hash_mask;
struct page **page_hash_table;
-/*
- * Define a request structure for outstanding page write requests
- * to the background page io daemon
- */
-
-struct pio_request
-{
- struct pio_request * next;
- struct file * file;
- unsigned long offset;
- unsigned long page;
-};
-static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
-static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
-
-static inline void
-make_pio_request(struct file *, unsigned long, unsigned long);
-
static inline int sync_page(struct page *page)
{
struct inode *inode = page->inode;
unsigned long limit = num_physpages;
struct page * page;
int count;
- int nr_dirty = 0;
-
+
- /* Make sure we scan all pages twice at priority 0. */
- count = (limit << 1) >> priority;
+ /* Scan at most limit/priority pages; priority is at least 1 here. */
+ count = limit / priority;
refresh_clock:
page = mem_map + clock;
do {
+ int referenced;
+
+ if (current->need_resched) {
+ current->state = TASK_RUNNING;
+ schedule();
+ goto refresh_clock;
+ }
+
/* This works even in the presence of PageSkip because
* the first two entries at the beginning of a hole will
* be marked, not just the first.
clock = page - mem_map;
}
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- page->age = PAGE_AGE_YOUNG;
- continue;
- }
-
- if (page->age > 0) {
- page->age--;
- continue;
- }
+ count--;
/* We can't free pages unless there's just one user */
if (atomic_read(&page->count) != 1)
continue;
+ referenced = test_and_clear_bit(PG_referenced, &page->flags);
+
if (PageLocked(page))
continue;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) {
+ count++;
continue;
+ }
- /* Is it a page swap page? Drop it, its old. */
+ /*
+ * Is it a swap-cache page? If so, we want to
+ * drop it if it is no longer used, even if it
+ * was recently referenced.
+ */
if (PageSwapCache(page)) {
+ if (referenced && swap_count(page->offset) != 1)
+ continue;
delete_from_swap_cache(page);
return 1;
}
+ if (referenced)
+ continue;
+
/* Is it a buffer page? */
if (page->buffers) {
- /*
- * Wait for async IO to complete
- * at each 64 buffers
- */
-
- int wait = ((gfp_mask & __GFP_IO)
- && (!(nr_dirty++ % 64)));
-
if (buffer_under_min())
continue;
/*
* throttling.
*/
- if (!try_to_free_buffers(page, wait)) {
- if(--count < 0) break;
+ if (!try_to_free_buffers(page, gfp_mask))
goto refresh_clock;
- }
return 1;
}
remove_inode_page(page);
return 1;
}
-
- } while (--count > 0);
+ } while (count > 0);
return 0;
}
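The rewritten sweep replaces the old page->age counter with a single referenced bit: a page found referenced has the bit cleared and survives the pass (a second chance), and count is now consumed per page examined rather than per page aged, so a priority-1 scan covers roughly the whole map once. A toy model of the policy (illustration only):

#include <stdio.h>

int main(void)
{
	int referenced[5] = { 1, 0, 1, 0, 0 };	/* toy PG_referenced bits */
	int count = 5, clock = 0, freed = 0;

	while (count-- > 0) {
		int i = clock++ % 5;

		if (referenced[i]) {
			referenced[i] = 0;	/* clear it, give a second chance */
			continue;
		}
		freed++;			/* would try to free this page */
	}
	printf("free candidates: %d\n", freed);	/* prints 3 */
	return 0;
}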
struct page **hash)
{
atomic_inc(&page->count);
- page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+ page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
if (size > count)
size = count;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
old_fs = get_fs();
set_fs(KERNEL_DS);
written = file->f_op->write(file, area, size, &file->f_pos);
set_fs(old_fs);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
if (written < 0) {
desc->error = written;
written = 0;
static int filemap_write_page(struct vm_area_struct * vma,
unsigned long offset,
- unsigned long page,
- int wait)
+ unsigned long page)
{
int result;
struct file * file;
* and file could be released ... increment the count to be safe.
*/
file->f_count++;
-
- /*
- * If this is a swapping operation rather than msync(), then
- * leave the actual IO, and the restoration of the file count,
- * to the kpiod thread. Just queue the request for now.
- */
- if (!wait) {
- make_pio_request(file, offset, page);
- return 0;
- }
-
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
fput(file);
return result;
}
*/
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
- return filemap_write_page(vma, page->offset, page_address(page), 0);
+ return filemap_write_page(vma, page->offset, page_address(page));
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
return 0;
}
}
- error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
page_cache_free(page);
return error;
}
if (file) {
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
error = file_fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
}
}
return error;
page_cache_release(page);
}
-
-/* Add request for page IO to the queue */
-
-static inline void put_pio_request(struct pio_request *p)
-{
- *pio_last = p;
- p->next = NULL;
- pio_last = &p->next;
-}
-
-/* Take the first page IO request off the queue */
-
-static inline struct pio_request * get_pio_request(void)
-{
- struct pio_request * p = pio_first;
- pio_first = p->next;
- if (!pio_first)
- pio_last = &pio_first;
- return p;
-}
-
-/* Make a new page IO request and queue it to the kpiod thread */
-
-static inline void make_pio_request(struct file *file,
- unsigned long offset,
- unsigned long page)
-{
- struct pio_request *p;
-
- atomic_inc(&page_cache_entry(page)->count);
-
- /*
- * We need to allocate without causing any recursive IO in the
- * current thread's context. We might currently be swapping out
- * as a result of an allocation made while holding a critical
- * filesystem lock. To avoid deadlock, we *MUST* not reenter
- * the filesystem in this thread.
- *
- * We can wait for kswapd to free memory, or we can try to free
- * pages without actually performing further IO, without fear of
- * deadlock. --sct
- */
-
- while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
- if (try_to_free_pages(__GFP_WAIT))
- continue;
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ/10);
- }
-
- p->file = file;
- p->offset = offset;
- p->page = page;
-
- put_pio_request(p);
- wake_up(&pio_wait);
-}
-
-
-/*
- * This is the only thread which is allowed to write out filemap pages
- * while swapping.
- *
- * To avoid deadlock, it is important that we never reenter this thread.
- * Although recursive memory allocations within this thread may result
- * in more page swapping, that swapping will always be done by queuing
- * another IO request to the same thread: we will never actually start
- * that IO request until we have finished with the current one, and so
- * we will not deadlock.
- */
-
-int kpiod(void * unused)
-{
- struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, };
- struct inode * inode;
- struct dentry * dentry;
- struct pio_request * p;
-
- tsk->session = 1;
- tsk->pgrp = 1;
- strcpy(tsk->comm, "kpiod");
- sigfillset(&tsk->blocked);
- init_waitqueue(&pio_wait);
- /*
- * Mark this task as a memory allocator - we don't want to get caught
- * up in the regular mm freeing frenzy if we have to allocate memory
- * in order to write stuff out.
- */
- tsk->flags |= PF_MEMALLOC;
-
- lock_kernel();
-
- pio_request_cache = kmem_cache_create("pio_request",
- sizeof(struct pio_request),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!pio_request_cache)
- panic ("Could not create pio_request slab cache");
-
- while (1) {
- tsk->state = TASK_INTERRUPTIBLE;
- add_wait_queue(&pio_wait, &wait);
- if (!pio_first)
- schedule();
- remove_wait_queue(&pio_wait, &wait);
- tsk->state = TASK_RUNNING;
-
- while (pio_first) {
- p = get_pio_request();
- dentry = p->file->f_dentry;
- inode = dentry->d_inode;
-
- down(&inode->i_sem);
- do_write_page(inode, p->file,
- (const char *) p->page, p->offset);
- up(&inode->i_sem);
- fput(p->file);
- page_cache_free(p->page);
- kmem_cache_free(pio_request_cache, p);
- }
- }
-}
-
void __init page_cache_init(unsigned long memory_size)
{
unsigned long htable_size;
*/
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
+#define list(x) (mem_map+(x))
+#define __free_pages_ok(map_nr, mask, area, index) \
+ nr_free_pages -= (mask); \
+ while ((mask) + (1 << (NR_MEM_LISTS-1))) { \
+ if (!test_and_change_bit((index), (area)->map)) \
+ break; \
+ (area)->count--; \
+ remove_mem_queue(list((map_nr) ^ -(mask))); \
+ (mask) <<= 1; \
+ (area)++; \
+ (index) >>= 1; \
+ (map_nr) &= (mask); \
+ } \
+ add_mem_queue(area, list(map_nr));
+
+static void free_local_pages(struct page * page)
+{
+ unsigned long order = page->offset;
+ unsigned int type = PageDMA(page) ? 1 : 0;
+ struct free_area_struct *area;
+ unsigned long map_nr = page - mem_map;
+ unsigned long mask = (~0UL) << order;
+ unsigned long index = map_nr >> (1 + order);
+
+ area = free_area[type] + order;
+ __free_pages_ok(map_nr, mask, area, index);
+}
+
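A note on the bit-twiddling in the macro: with mask = ~0UL << order, the two's complement -mask equals 1UL << order, so map_nr ^ -mask is the classic buddy computation map_nr XOR (1 << order), and nr_free_pages -= mask subtracts a negative, i.e. credits 1 << order freed pages. A standalone check of the identity (illustration only):

#include <stdio.h>

int main(void)
{
	unsigned long order, map_nr = 20;	/* page 20 is aligned up to order 2 */

	for (order = 0; order < 3; order++) {
		unsigned long mask = (~0UL) << order;

		printf("order %lu: -mask = %lu, buddy of %lu is %lu\n",
		       order, -mask, map_nr, map_nr ^ -mask);
	}
	return 0;
}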
static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
{
- struct free_area_struct *area = free_area[type] + order;
- unsigned long index = map_nr >> (1 + order);
- unsigned long mask = (~0UL) << order;
+ struct free_area_struct *area;
+ unsigned long index;
+ unsigned long mask;
unsigned long flags;
+ struct page * page;
- spin_lock_irqsave(&page_alloc_lock, flags);
-
-#define list(x) (mem_map+(x))
+ if (current->flags & PF_FREE_PAGES)
+ goto local_freelist;
+ back_local_freelist:
+ index = map_nr >> (1 + order);
+ mask = (~0UL) << order;
map_nr &= mask;
- nr_free_pages -= mask;
- while (mask + (1 << (NR_MEM_LISTS-1))) {
- if (!test_and_change_bit(index, area->map))
- break;
- area->count--;
- remove_mem_queue(list(map_nr ^ -mask));
- mask <<= 1;
- area++;
- index >>= 1;
- map_nr &= mask;
- }
- add_mem_queue(area, list(map_nr));
-
-#undef list
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ area = free_area[type] + order;
+ __free_pages_ok(map_nr, mask, area, index);
spin_unlock_irqrestore(&page_alloc_lock, flags);
+ return;
+
+ local_freelist:
+ /*
+ * This is a little subtle: if the allocation order
+ * wanted is greater than zero we'd better take all the
+ * pages local, since we must deal with fragmentation too
+ * and we can't rely on the nr_local_pages information.
+ */
+ if (current->nr_local_pages && !current->allocation_order)
+ goto back_local_freelist;
+
+ page = mem_map + map_nr;
+ list_add((struct list_head *) page, &current->local_pages);
+ page->offset = order;
+ current->nr_local_pages++;
}
void __free_pages(struct page *page, unsigned long order)
if (PageSwapCache(page))
panic ("Freeing swap cache page");
page->flags &= ~(1 << PG_referenced);
- page->age = PAGE_AGE_INITIAL;
free_pages_ok(page - mem_map, order, PageDMA(page) ? 1 : 0);
return;
}
atomic_set(&map->count, 1); \
} while (0)
+static void refile_local_pages(void)
+{
+ if (current->nr_local_pages) {
+ struct page * page;
+ struct list_head * entry;
+ int nr_pages = current->nr_local_pages;
+
+ while ((entry = current->local_pages.next) != &current->local_pages) {
+ list_del(entry);
+ page = (struct page *) entry;
+ free_local_pages(page);
+ if (!nr_pages--)
+ panic("__get_free_pages local_pages list corrupted I");
+ }
+ if (nr_pages)
+ panic("__get_free_pages local_pages list corrupted II");
+ current->nr_local_pages = 0;
+ }
+}
+
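The PF_FREE_PAGES round trip is easiest to see end to end: while the flag is set, pages freed by this task are parked on its private local_pages list instead of the shared free pool, so memory the task reclaims on its own behalf cannot be snatched by a concurrent allocator before refile_local_pages() hands it back under page_alloc_lock. A userspace model (illustration only):

#include <stdio.h>

#define MAXP 8
static int shared_pool;				/* models nr_free_pages */
static int local_pages[MAXP], nr_local;		/* models current->local_pages */
static int pf_free_pages;			/* models PF_FREE_PAGES */

static void model_free_page(int page)
{
	if (pf_free_pages)
		local_pages[nr_local++] = page;	/* park it privately */
	else
		shared_pool++;
}

static void model_refile_local_pages(void)
{
	while (nr_local > 0)
		shared_pool++, nr_local--;	/* hand back to the buddy lists */
}

int main(void)
{
	pf_free_pages = 1;
	model_free_page(42);			/* freed during our own reclaim */
	printf("shared=%d local=%d\n", shared_pool, nr_local);	/* 0 1 */
	pf_free_pages = 0;
	model_refile_local_pages();
	printf("shared=%d local=%d\n", shared_pool, nr_local);	/* 1 0 */
	return 0;
}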
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
- static atomic_t free_before_allocate = ATOMIC_INIT(0);
if (order >= NR_MEM_LISTS)
- goto nopage;
+ goto out;
#ifdef ATOMIC_MEMORY_DEBUGGING
if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
printk("gfp called nonatomically from interrupt %p\n",
__builtin_return_address(0));
}
- goto nopage;
+ goto out;
}
#endif
+ /*
+ * Acquire lock before reading nr_free_pages to make sure it
+ * won't change from under us.
+ */
+ spin_lock_irqsave(&page_alloc_lock, flags);
+
/*
* If this is a recursive call, we'd better
* do our best to just allocate things without
* further thought.
*/
if (!(current->flags & PF_MEMALLOC)) {
- int freed;
extern struct wait_queue * kswapd_wait;
- /* Somebody needs to free pages so we free some of our own. */
- if (atomic_read(&free_before_allocate)) {
- current->flags |= PF_MEMALLOC;
- try_to_free_pages(gfp_mask);
- current->flags &= ~PF_MEMALLOC;
- }
-
if (nr_free_pages > freepages.low)
goto ok_to_allocate;
/* Do we have to block or can we proceed? */
if (nr_free_pages > freepages.min)
goto ok_to_allocate;
-
- current->flags |= PF_MEMALLOC;
- atomic_inc(&free_before_allocate);
- freed = try_to_free_pages(gfp_mask);
- atomic_dec(&free_before_allocate);
- current->flags &= ~PF_MEMALLOC;
-
- /*
- * Re-check we're still low on memory after we blocked
- * for some time. Somebody may have released lots of
- * memory from under us while we was trying to free
- * the pages. We check against pages_high to be sure
- * to succeed only if lots of memory is been released.
- */
- if (nr_free_pages > freepages.high)
- goto ok_to_allocate;
-
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
+ if (gfp_mask & __GFP_WAIT) {
+ int freed;
+ /*
+ * If the task is ok to sleep it's fine also
+ * if we release irq here.
+ */
+ spin_unlock_irq(&page_alloc_lock);
+
+ current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
+ current->allocation_order = order;
+ freed = try_to_free_pages(gfp_mask);
+ current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
+
+ spin_lock_irq(&page_alloc_lock);
+ refile_local_pages();
+
+ /*
+ * Re-check we're still low on memory after we blocked
+ * for some time. Somebody may have released lots of
+ * memory from under us while we were trying to free
+ * the pages. We check against freepages.high to be sure
+ * to succeed only if lots of memory has been released.
+ */
+ if (nr_free_pages > freepages.high)
+ goto ok_to_allocate;
+
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ goto nopage;
+ }
}
ok_to_allocate:
- spin_lock_irqsave(&page_alloc_lock, flags);
/* if it's not a dma request, try non-dma first */
if (!(gfp_mask & __GFP_DMA))
RMQUEUE_TYPE(order, 0);
RMQUEUE_TYPE(order, 1);
+ nopage:
spin_unlock_irqrestore(&page_alloc_lock, flags);
-
-nopage:
+ out:
return 0;
}
return 0;
}
atomic_inc(&page->count);
+ page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
page->inode = &swapper_inode;
page->offset = entry;
add_page_to_hash_queue(page, &swapper_inode, entry);
* some real work in the future in "shrink_mmap()".
*/
if (!pte_dirty(pte)) {
+ if (page_map->inode && pgcache_under_min())
+ /* unmapping this page would be useless */
+ return 0;
flush_cache_page(vma, address);
pte_clear(page_table);
goto drop_pte;
* we cannot do I/O! Avoid recursing on FS
* locks etc.
*/
- if (!(gfp_mask & __GFP_IO))
+ if (!(gfp_mask & __GFP_IO) || current->fs_locks)
return 0;
/*
result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
if (result)
return result;
+ if (current->need_resched)
+ return 2;
address += PAGE_SIZE;
pte++;
} while (address < end);
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = nr_tasks / (priority+1);
+ counter = nr_tasks / priority;
if (counter < 1)
counter = 1;
goto out;
}
- if (swap_out_process(pbest, gfp_mask))
+ switch (swap_out_process(pbest, gfp_mask)) {
+ case 1:
return 1;
+ case 2:
+ current->state = TASK_RUNNING;
+ schedule();
+ }
}
out:
return 0;
* cluster them so that we get good swap-out behaviour. See
* the "free_memory()" macro for details.
*/
-static int do_try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask)
{
int priority;
- int ret = 0;
- int swapcount;
int count = SWAP_CLUSTER_MAX;
lock_kernel();
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- priority = 6;
+ priority = 5;
do {
while (shrink_mmap(priority, gfp_mask)) {
- ret = 1;
if (!--count)
goto done;
}
/* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
while (shm_swap(priority, gfp_mask)) {
- ret = 1;
if (!--count)
goto done;
}
}
/* Then, try to page stuff out.. */
- swapcount = count;
while (swap_out(priority, gfp_mask)) {
- ret = 1;
- if (!--swapcount)
- break;
+ if (!--count)
+ goto done;
}
shrink_dcache_memory(priority, gfp_mask);
- } while (--priority >= 0);
+ } while (--priority > 0);
done:
unlock_kernel();
- if (!ret)
- printk("VM: do_try_to_free_pages failed for %s...\n",
- current->comm);
- /* Return success if we freed a page. */
- return ret;
+ /* Return success only if we freed a full cluster of pages. */
+ return priority > 0;
}
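With the do_try_to_free_pages()/try_to_free_pages() split collapsed (the old wrapper is deleted at the end of the patch), the function itself now walks priorities 5 down to 1 and bails out through done: as soon as SWAP_CLUSTER_MAX pages have been freed, so the final priority > 0 is precisely "did we free a full cluster". A toy model of the loop (illustration only):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

static int model_try_to_free_pages(int freeable)
{
	int priority = 5;
	int count = SWAP_CLUSTER_MAX;

	do {
		while (freeable > 0) {		/* stands in for the shrinkers */
			freeable--;
			if (!--count)
				goto done;
		}
	} while (--priority > 0);
done:
	return priority > 0;			/* freed a full cluster? */
}

int main(void)
{
	printf("plenty to free -> %d\n", model_try_to_free_pages(100));	/* 1 */
	printf("little to free -> %d\n", model_try_to_free_pages(3));	/* 0 */
	return 0;
}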
/*
while (nr_free_pages < freepages.high)
{
- if (do_try_to_free_pages(GFP_KSWAPD))
+ if (try_to_free_pages(GFP_KSWAPD))
{
if (tsk->need_resched)
schedule();
}
}
}
-
-/*
- * Called by non-kswapd processes when kswapd really cannot
- * keep up with the demand for free memory.
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
- int retval = 1;
-
- if (gfp_mask & __GFP_WAIT)
- retval = do_try_to_free_pages(gfp_mask);
- return retval;
-}
-