From 03701f15cd3659a163ed690d8eed7ed0efef23fe Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 23 Nov 2007 15:23:00 -0500
Subject: [PATCH] Linux 2.2.19pre2

o	Drop the page aging for a moment to merge the
	Andrea VM
o	Merge Andrea's VM-global patch			(Andrea Arcangeli)
---
 Makefile              |   2 +-
 drivers/block/rd.c    |   2 +-
 fs/binfmt_aout.c      |   4 +-
 fs/binfmt_elf.c       |   4 +-
 fs/buffer.c           | 137 +++++++++++++++----------
 fs/coda/file.c        |   8 +-
 fs/dcache.c           |   4 +-
 fs/open.c             |   4 +-
 fs/read_write.c       |  12 +--
 include/linux/fs.h    |  18 +++-
 include/linux/locks.h |   2 +
 include/linux/mm.h    |   4 -
 include/linux/sched.h |   5 +-
 init/main.c           |   2 -
 ipc/shm.c             |   4 +-
 kernel/fork.c         |   2 +
 mm/filemap.c          | 231 +++++++-----------------------------------
 mm/page_alloc.c       | 168 ++++++++++++++++++++----------
 mm/swap_state.c       |   1 +
 mm/vmscan.c           |  55 ++++------
 20 files changed, 302 insertions(+), 367 deletions(-)

diff --git a/Makefile b/Makefile
index f057cc53e5b5..a751970a0f59 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 2
 SUBLEVEL = 19
-EXTRAVERSION = pre1
+EXTRAVERSION = pre2
 
 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
 
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index f1d54b24810a..e1d9f1edba38 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -173,7 +173,7 @@ repeat:
 	if (CURRENT->cmd == READ) 
 		memset(CURRENT->buffer, 0, len); 
 	else	
-		set_bit(BH_Protected, &CURRENT->bh->b_state);
+		mark_buffer_protected(CURRENT->bh);
 
 	end_request(1);
 	goto repeat;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 8da1765dd480..d56d630462d0 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -62,9 +62,9 @@ static void set_brk(unsigned long start, unsigned long end)
 static int dump_write(struct file *file, const void *addr, int nr)
 {
 	int r;
-	down(&file->f_dentry->d_inode->i_sem);
+	fs_down(&file->f_dentry->d_inode->i_sem);
 	r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-	up(&file->f_dentry->d_inode->i_sem);
+	fs_up(&file->f_dentry->d_inode->i_sem);
 	return r;
 }
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 5d5b91b8076c..84e9ac54c9f9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -948,9 +948,9 @@ static int load_elf_library(int fd)
 static int dump_write(struct file *file, const void *addr, int nr)
 {
 	int r;
-	down(&file->f_dentry->d_inode->i_sem);
+	fs_down(&file->f_dentry->d_inode->i_sem);
 	r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-	up(&file->f_dentry->d_inode->i_sem);
+	fs_up(&file->f_dentry->d_inode->i_sem);
 	return r;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index b59b5b4bb740..3e27c36072f0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,7 @@ static struct wait_queue * buffer_wait = NULL;
 
 static int nr_buffers = 0;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST];
 static int nr_buffer_heads = 0;
 static int nr_unused_buffer_heads = 0;
 static int nr_hashed_buffers = 0;
@@ -359,9 +360,9 @@ asmlinkage int sys_fsync(unsigned int fd)
 		goto out_putf;
 
 	/* We need to protect against concurrent writers.. */
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	err = file->f_op->fsync(file, dentry);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 
 out_putf:
 	fput(file);
@@ -396,9 +397,9 @@ asmlinkage int sys_fdatasync(unsigned int fd)
 		goto out_putf;
 
 	/* this needs further work, at the moment it is identical to fsync() */
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	err = file->f_op->fsync(file, dentry);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 
 out_putf:
 	fput(file);
@@ -474,6 +475,7 @@ static void remove_from_queues(struct buffer_head * bh)
 		return;
 	}
 	nr_buffers_type[bh->b_list]--;
+	size_buffers_type[bh->b_list] -= bh->b_size;
 	remove_from_hash_queue(bh);
 	remove_from_lru_list(bh);
 }
@@ -523,6 +525,7 @@ static void insert_into_queues(struct buffer_head * bh)
 		(*bhp)->b_prev_free = bh;
 
 		nr_buffers_type[bh->b_list]++;
+		size_buffers_type[bh->b_list] += bh->b_size;
 
 		/* Put the buffer in new hash-queue if it has a device. */
 		bh->b_next = NULL;
@@ -571,8 +574,10 @@ struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 {
 	struct buffer_head * bh;
 	bh = find_buffer(dev,block,size);
-	if (bh)
+	if (bh) {
 		bh->b_count++;
+		touch_buffer(bh);
+	}
 	return bh;
 }
 
@@ -816,6 +821,46 @@ static inline void file_buffer(struct buffer_head *bh, int list)
 	insert_into_queues(bh);
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+	tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+	dirty *= 200;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = soft_dirty_limit * 2;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+	int state = balance_dirty_state(dev);
+
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
+}
+
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
@@ -828,7 +873,9 @@ void refile_buffer(struct buffer_head * buf)
 		printk("Attempt to refile free buffer\n");
 		return;
 	}
-	if (buffer_dirty(buf))
+	if (buffer_protected(buf))
+		dispose = BUF_PROTECTED;
+	else if (buffer_dirty(buf))
 		dispose = BUF_DIRTY;
 	else if (buffer_locked(buf))
 		dispose = BUF_LOCKED;
@@ -837,13 +884,7 @@ void refile_buffer(struct buffer_head * buf)
 	if(dispose != buf->b_list) {
 		file_buffer(buf, dispose);
 		if(dispose == BUF_DIRTY) {
-			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-			/* This buffer is dirty, maybe we need to start flushing.
-			 * If too high a percentage of the buffers are dirty...
-			 */
-			if (nr_buffers_type[BUF_DIRTY] > too_many)
-				wakeup_bdflush(1);
+			balance_dirty(buf->b_dev);
 
 			/* If this is a loop device, and
 			 * more than half of the buffers are dirty...
@@ -864,7 +905,6 @@ void __brelse(struct buffer_head * buf)
 	/* If dirty, mark the time this buffer should be written back. */
 	set_writetime(buf, 0);
 	refile_buffer(buf);
-	touch_buffer(buf);
 
 	if (buf->b_count) {
 		buf->b_count--;
@@ -1457,6 +1497,7 @@ static int grow_buffers(int size)
 	}
 	tmp->b_this_page = bh;
 	free_list[isize] = bh;
+	mem_map[MAP_NR(page)].flags = 0;
 	mem_map[MAP_NR(page)].buffers = bh;
 	buffermem += PAGE_SIZE;
 	return 1;
@@ -1468,33 +1509,34 @@ static int grow_buffers(int size)
 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
 #define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
 
-static int sync_page_buffers(struct page * page, int wait)
+static void sync_page_buffers(struct page * page)
 {
-	struct buffer_head * bh = page->buffers;
-	struct buffer_head * tmp = bh;
+	struct buffer_head * tmp, * bh = page->buffers;
 
+	/*
+	 * Here we'll probably sleep and so we must make sure that
+	 * the page doesn't go away from under us. We also prefer any
+	 * concurrent try_to_free_buffers() not to work in any way on
+	 * our current page from under us since we're just working on it.
+	 * As always in 2.2.x we're serialized by the big kernel lock
+	 * during those hacky page-visibility manipulations.
+	 *
+	 * SUBTLE NOTE: for things like LVM snapshotting WRITEA will block too!
+	 */
 	page->buffers = NULL;
 
+	tmp = bh;
 	do {
 		struct buffer_head *p = tmp;
 		tmp = tmp->b_this_page;
-		if (buffer_locked(p)) {
-			if (wait)
-				__wait_on_buffer(p);
-		} else if (buffer_dirty(p))
-			ll_rw_block(WRITE, 1, &p);
-	} while (tmp != bh);
 
-	page->buffers = bh;
-
-	do {
-		struct buffer_head *p = tmp;
-		tmp = tmp->b_this_page;
-		if (buffer_busy(p))
-			return 1;
+		if (buffer_dirty(p))
+			if (test_and_set_bit(BH_Wait_IO, &p->b_state))
+				ll_rw_block(WRITE, 1, &p);
 	} while (tmp != bh);
 
-	return 0;
+	/* Restore the visibility of the page before returning. */
+	page->buffers = bh;
 }
 
 /*
@@ -1504,10 +1546,9 @@ static int sync_page_buffers(struct page * page, int wait)
  * Wake up bdflush() if this fails - if we're running low on memory due
  * to dirty buffers, we need to flush them out as quickly as possible.
  */
-int try_to_free_buffers(struct page * page_map, int wait)
+int try_to_free_buffers(struct page * page_map, int gfp_mask)
 {
 	struct buffer_head * tmp, * bh = page_map->buffers;
-	int too_many;
 
 	tmp = bh;
 	do {
@@ -1516,8 +1557,6 @@ int try_to_free_buffers(struct page * page_map, int wait)
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
 
- succeed:
-	tmp = bh;
 	do {
 		struct buffer_head * p = tmp;
 		tmp = tmp->b_this_page;
@@ -1536,25 +1575,12 @@ int try_to_free_buffers(struct page * page_map, int wait)
 	return 1;
 
  busy:
-	too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
+	if (gfp_mask & __GFP_IO)
+		sync_page_buffers(page_map);
 
-	if (!sync_page_buffers(page_map, wait)) {
-
-		/* If a high percentage of the buffers are dirty, 
-		 * wake kflushd 
-		 */
-		if (nr_buffers_type[BUF_DIRTY] > too_many)
-			wakeup_bdflush(0);
-			
-		/*
-		 * We can jump after the busy check because
-		 * we rely on the kernel lock.
-		 */
-		goto succeed;
-	}
-
-	if(nr_buffers_type[BUF_DIRTY] > too_many)
+	if (balance_dirty_state(NODEV) >= 0)
 		wakeup_bdflush(0);
+
 	return 0;
 }
 
@@ -1566,7 +1592,7 @@ void show_buffers(void)
 	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
 	int protected = 0;
 	int nlist;
-	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
+	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
 
 	printk("Buffer memory:   %8ldkB\n",buffermem>>10);
 	printk("Buffer heads:    %6d\n",nr_buffer_heads);
@@ -1590,7 +1616,7 @@ void show_buffers(void)
 			used++, lastused = found;
 		bh = bh->b_next_free;
 	  } while (bh != lru_list[nlist]);
-	  printk("%8s: %d buffers, %d used (last=%d), "
+	  printk("%9s: %d buffers, %d used (last=%d), "
 		 "%d locked, %d protected, %d dirty\n",
 		 buf_types[nlist], found, used, lastused,
 		 locked, protected, dirty);
@@ -1935,7 +1961,8 @@ int bdflush(void * unused)
 		
 		/* If there are still a lot of dirty buffers around, skip the sleep
 		   and flush some more */
-		if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+		if (!ndirty || balance_dirty_state(NODEV) < 0)
+		{
 			spin_lock_irq(&current->sigmask_lock);
 			flush_signals(current);
 			spin_unlock_irq(&current->sigmask_lock);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 46303344615d..35967edfc9c9 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -190,10 +190,10 @@ static ssize_t coda_file_write(struct file *coda_file, const char *buff,
                 return -1;
         }
 
-	down(&cont_inode->i_sem);
+	fs_down(&cont_inode->i_sem);
         result = cont_file.f_op->write(&cont_file , buff, count, 
 				       &(cont_file.f_pos));
-	up(&cont_inode->i_sem);
+	fs_up(&cont_inode->i_sem);
         coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
 	
 	if (result)
@@ -228,14 +228,14 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry)
         coda_prepare_openfile(coda_inode, coda_file, cont_inode, 
 			      &cont_file, &cont_dentry);
 
-	down(&cont_inode->i_sem);
+	fs_down(&cont_inode->i_sem);
 
         result = file_fsync(&cont_file ,&cont_dentry);
 	if ( result == 0 ) {
 		result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid));
 	}
 
-	up(&cont_inode->i_sem);
+	fs_up(&cont_inode->i_sem);
 
         coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
         return result;
diff --git a/fs/dcache.c b/fs/dcache.c
index 0430bb0fdb84..e4265a5ce053 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -475,9 +475,9 @@ void shrink_dcache_parent(struct dentry * parent)
  */
 void shrink_dcache_memory(int priority, unsigned int gfp_mask)
 {
-	if (gfp_mask & __GFP_IO) {
+	if (gfp_mask & __GFP_IO && !current->fs_locks) {
 		int count = 0;
-		if (priority)
+		if (priority > 1)
 			count = dentry_stat.nr_unused / priority;
 		prune_dcache(count, -1);
 	}
diff --git a/fs/open.c b/fs/open.c
index 7a9fa444e48f..9f9354e97196 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -73,7 +73,7 @@ int do_truncate(struct dentry *dentry, unsigned long length)
 	if ((off_t) length < 0)
 		return -EINVAL;
 
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 	error = notify_change(dentry, &newattrs);
@@ -83,7 +83,7 @@ int do_truncate(struct dentry *dentry, unsigned long length)
 		if (inode->i_op && inode->i_op->truncate)
 			inode->i_op->truncate(inode);
 	}
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 	return error;
 }
 
diff --git a/fs/read_write.c b/fs/read_write.c
index e2b5b789977b..56c4fa41125f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -166,9 +166,9 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
 	if (!file->f_op || !(write = file->f_op->write))
 		goto out;
 
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	ret = write(file, buf, count, &file->f_pos);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 out:
 	fput(file);
 bad_file:
@@ -314,9 +314,9 @@ asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector,
 	if (!file)
 		goto bad_file;
 	if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) {
-		down(&file->f_dentry->d_inode->i_sem);
+		fs_down(&file->f_dentry->d_inode->i_sem);
 		ret = do_readv_writev(VERIFY_READ, file, vector, count);
-		up(&file->f_dentry->d_inode->i_sem);
+		fs_up(&file->f_dentry->d_inode->i_sem);
 	}
 	fput(file);
 
@@ -386,9 +386,9 @@ asmlinkage ssize_t sys_pwrite(unsigned int fd, const char * buf,
 	if (pos < 0)
 		goto out;
 
-	down(&file->f_dentry->d_inode->i_sem);
+	fs_down(&file->f_dentry->d_inode->i_sem);
 	ret = write(file, buf, count, &pos);
-	up(&file->f_dentry->d_inode->i_sem);
+	fs_up(&file->f_dentry->d_inode->i_sem);
 
 out:
 	fput(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2d39ed8182d..b7a722c93cdc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -190,6 +190,7 @@ typedef char buffer_block[BLOCK_SIZE];
 #define BH_Lock		2	/* 1 if the buffer is locked */
 #define BH_Req		3	/* 0 if the buffer has been invalidated */
 #define BH_Protected	6	/* 1 if the buffer is protected */
+#define BH_Wait_IO	7	/* 1 if we should throttle on this buffer */
 
 /*
  * Try to keep the most commonly used fields in single cache lines (16
@@ -782,7 +783,7 @@ extern struct file *inuse_filps;
 
 extern void refile_buffer(struct buffer_head * buf);
 extern void set_writetime(struct buffer_head * buf, int flag);
-extern int try_to_free_buffers(struct page *, int wait);
+extern int try_to_free_buffers(struct page *, int);
 
 extern int nr_buffers;
 extern long buffermem;
@@ -791,15 +792,25 @@ extern int nr_buffer_heads;
 #define BUF_CLEAN	0
 #define BUF_LOCKED	1	/* Buffers scheduled for write */
 #define BUF_DIRTY	2	/* Dirty buffers, not yet scheduled for write */
-#define NR_LIST		3
+#define BUF_PROTECTED	3	/* Ramdisk persistent storage */
+#define NR_LIST		4
 
 void mark_buffer_uptodate(struct buffer_head * bh, int on);
 
+extern inline void mark_buffer_protected(struct buffer_head * bh)
+{
+	if (!test_and_set_bit(BH_Protected, &bh->b_state)) {
+		if (bh->b_list != BUF_PROTECTED)
+			refile_buffer(bh);
+	}
+}
+
 extern inline void mark_buffer_clean(struct buffer_head * bh)
 {
 	if (test_and_clear_bit(BH_Dirty, &bh->b_state)) {
 		if (bh->b_list == BUF_DIRTY)
 			refile_buffer(bh);
+		clear_bit(BH_Wait_IO, &bh->b_state);
 	}
 }
 
@@ -941,6 +952,9 @@ extern void inode_setattr(struct inode *, struct iattr *);
 
 extern __u32 inode_generation_count;
 
+#define fs_down(sem)	do { current->fs_locks++; down(sem); } while (0)
+#define fs_up(sem)	do { up(sem); current->fs_locks--; } while (0)
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/include/linux/locks.h b/include/linux/locks.h
index 2094a4d19f79..f92fa3788c31 100644
--- a/include/linux/locks.h
+++ b/include/linux/locks.h
@@ -50,10 +50,12 @@ extern inline void lock_super(struct super_block * sb)
 	if (sb->s_lock)
 		__wait_on_super(sb);
 	sb->s_lock = 1;
+	current->fs_locks++;
 }
 
 extern inline void unlock_super(struct super_block * sb)
 {
+	current->fs_locks--;
 	sb->s_lock = 0;
 	wake_up(&sb->s_wait);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 84c587aca99c..ad89e46aa2f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -129,12 +129,8 @@ typedef struct page {
 	struct wait_queue *wait;
 	struct page **pprev_hash;
 	struct buffer_head * buffers;
-	int age;
 } mem_map_t;
 
-#define PAGE_AGE_INITIAL 1	/* age for pages just mapped */
-#define PAGE_AGE_YOUNG 2	/* age for pages recently referenced */
-
 /* Page flag bit values */
 #define PG_locked		 0
 #define PG_error		 1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 82b10f6ec1d6..fdecb4207d5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -319,6 +319,8 @@ struct task_struct {
 	struct files_struct *files;
 /* memory management info */
 	struct mm_struct *mm;
+	struct list_head local_pages; int allocation_order, nr_local_pages;
+	int fs_locks;
 
 /* signal handlers */
 	spinlock_t sigmask_lock;	/* Protects signal and blocked */
@@ -351,6 +353,7 @@ struct task_struct {
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_VFORK	0x00001000	/* Wake up parent in mm_release */
+#define PF_FREE_PAGES	0x00002000	/* free_pages_ok() may queue freed pages on current->local_pages */
 
 #define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
 #define PF_DTRACE	0x00200000	/* delayed trace (used on m68k, i386) */
@@ -400,7 +403,7 @@ struct task_struct {
 /* tss */	INIT_TSS, \
 /* fs */	&init_fs, \
 /* files */	&init_files, \
-/* mm */	&init_mm, \
+/* mm */	&init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \
 /* signals */	SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
 /* exec cts */	0,0, \
 /* oom */	0, \
diff --git a/init/main.c b/init/main.c
index 5a7fe3c8b7bd..2f4f06e2b733 100644
--- a/init/main.c
+++ b/init/main.c
@@ -80,7 +80,6 @@ static int init(void *);
 extern int bdflush(void *);
 extern int kupdate(void *);
 extern int kswapd(void *);
-extern int kpiod(void *);
 extern void kswapd_setup(void);
 extern unsigned long init_IRQ( unsigned long);
 extern void init_modules(void);
@@ -1584,7 +1583,6 @@ static void __init do_basic_setup(void)
 	kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	/* Start the background pageout daemon. */
 	kswapd_setup();
-	kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff --git a/ipc/shm.c b/ipc/shm.c
index 62dd3c645994..4a33fa8fe850 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -679,7 +679,7 @@ done:	/* pte_val(pte) == shp->shm_pages[idx] */
 }
 
 /*
- * Goes through counter = (shm_rss >> prio) present shm pages.
+ * Goes through counter = (shm_rss / prio) present shm pages.
  */
 static unsigned long swap_id = 0; /* currently being swapped */
 static unsigned long swap_idx = 0; /* next to swap */
@@ -693,7 +693,7 @@ int shm_swap (int prio, int gfp_mask)
 	int loop = 0;
 	int counter;
 	
-	counter = shm_rss >> prio;
+	counter = shm_rss / prio;
 	if (!counter || !(swap_nr = get_swap_page()))
 		return 0;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 70309a73c7cf..70a98bb198c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -665,6 +665,8 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
+	INIT_LIST_HEAD(&p->local_pages);
+
 	retval = -ENOMEM;
 	/* copy all the process information */
 	if (copy_files(clone_flags, p))
diff --git a/mm/filemap.c b/mm/filemap.c
index 384bab05a42e..785135056c64 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -19,7 +19,6 @@
 #include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/swapctl.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 
 #include <asm/pgtable.h>
@@ -36,25 +35,6 @@ unsigned long page_cache_size = 0;
 unsigned int page_hash_bits, page_hash_mask;
 struct page **page_hash_table;
 
-/* 
- * Define a request structure for outstanding page write requests
- * to the background page io daemon
- */
-
-struct pio_request 
-{
-	struct pio_request *	next;
-	struct file *		file;
-	unsigned long		offset;
-	unsigned long		page;
-};
-static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
-static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
-
-static inline void 
-make_pio_request(struct file *, unsigned long, unsigned long);
-
 static inline int sync_page(struct page *page)
 {
 	struct inode *inode = page->inode;
@@ -150,14 +130,21 @@ int shrink_mmap(int priority, int gfp_mask)
 	unsigned long limit = num_physpages;
 	struct page * page;
 	int count;
-	int nr_dirty = 0;
-	
+
 	/* Make sure we scan all pages twice at priority 0. */
-	count = (limit << 1) >> priority;
+	count = limit / priority;
 
  refresh_clock:
 	page = mem_map + clock;
 	do {
+		int referenced;
+
+		if (current->need_resched) {
+			current->state = TASK_RUNNING;
+			schedule();
+			goto refresh_clock;
+		}
+		
 		/* This works even in the presence of PageSkip because
 		 * the first two entries at the beginning of a hole will
 		 * be marked, not just the first.
@@ -174,42 +161,39 @@ int shrink_mmap(int priority, int gfp_mask)
 			clock = page - mem_map;
 		}
 		
-		if (test_and_clear_bit(PG_referenced, &page->flags)) {
-			page->age = PAGE_AGE_YOUNG;
-			continue;
-		}
-
-		if (page->age > 0) {
-			page->age--;
-			continue;
-		}
+		count--;
 
 		/* We can't free pages unless there's just one user */
 		if (atomic_read(&page->count) != 1)
 			continue;
 
+		referenced = test_and_clear_bit(PG_referenced, &page->flags);
+
 		if (PageLocked(page))
 			continue;
 
-		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+		if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) {
+			count++;
 			continue;
+		}
 
-		/* Is it a page swap page? Drop it, its old. */
+		/*
+		 * Is it a swap cache page? If so, we want to
+		 * drop it if it is no longer used, even if it
+		 * were to be marked referenced.
+		 */
 		if (PageSwapCache(page)) {
+			if (referenced && swap_count(page->offset) != 1)
+				continue;
 			delete_from_swap_cache(page);
 			return 1;
 		}	
 
+		if (referenced)
+			continue;
+
 		/* Is it a buffer page? */
 		if (page->buffers) {
-			/*
-			 * Wait for async IO to complete
-			 * at each 64 buffers
-			 */ 
-
-			int wait = ((gfp_mask & __GFP_IO) 
-				&& (!(nr_dirty++ % 64)));
-
 			if (buffer_under_min())
 				continue;
 			/*
@@ -217,10 +201,8 @@ int shrink_mmap(int priority, int gfp_mask)
 			 * throttling.
 			 */
 
-			if (!try_to_free_buffers(page, wait)) { 
-				if(--count < 0) break;
+			if (!try_to_free_buffers(page, gfp_mask))
 				goto refresh_clock;
-			}
 			return 1;
 		}
 
@@ -231,8 +213,7 @@ int shrink_mmap(int priority, int gfp_mask)
 			remove_inode_page(page);
 			return 1;
 		}
-
-	} while (--count > 0);
+	} while (count > 0);
 	return 0;
 }
 
@@ -299,7 +280,7 @@ static inline void add_to_page_cache(struct page * page,
 	struct page **hash)
 {
 	atomic_inc(&page->count);
-	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+	page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
 	page->offset = offset;
 	add_page_to_inode_queue(inode, page);
 	__add_page_to_hash_queue(page, hash);
@@ -878,12 +859,12 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned
 
 	if (size > count)
 		size = count;
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	old_fs = get_fs();
 	set_fs(KERNEL_DS);
 	written = file->f_op->write(file, area, size, &file->f_pos);
 	set_fs(old_fs);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 	if (written < 0) {
 		desc->error = written;
 		written = 0;
@@ -1160,8 +1141,7 @@ static inline int do_write_page(struct inode * inode, struct file * file,
 
 static int filemap_write_page(struct vm_area_struct * vma,
 			      unsigned long offset,
-			      unsigned long page,
-			      int wait)
+			      unsigned long page)
 {
 	int result;
 	struct file * file;
@@ -1179,20 +1159,9 @@ static int filemap_write_page(struct vm_area_struct * vma,
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
-
-	/* 
-	 * If this is a swapping operation rather than msync(), then
-	 * leave the actual IO, and the restoration of the file count,
-	 * to the kpiod thread.  Just queue the request for now.
-	 */
-	if (!wait) {
-		make_pio_request(file, offset, page);
-		return 0;
-	}
-	
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	result = do_write_page(inode, file, (const char *) page, offset);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 	fput(file);
 	return result;
 }
@@ -1205,7 +1174,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-	return filemap_write_page(vma, page->offset, page_address(page), 0);
+	return filemap_write_page(vma, page->offset, page_address(page));
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1242,7 +1211,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 			return 0;
 		}
 	}
-	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
+	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 	page_cache_free(page);
 	return error;
 }
@@ -1414,9 +1383,9 @@ static int msync_interval(struct vm_area_struct * vma,
 			if (file) {
 				struct dentry * dentry = file->f_dentry;
 				struct inode * inode = dentry->d_inode;
-				down(&inode->i_sem);
+				fs_down(&inode->i_sem);
 				error = file_fsync(file, dentry);
-				up(&inode->i_sem);
+				fs_up(&inode->i_sem);
 			}
 		}
 		return error;
@@ -1745,130 +1714,6 @@ void put_cached_page(unsigned long addr)
 	page_cache_release(page);
 }
 
-
-/* Add request for page IO to the queue */
-
-static inline void put_pio_request(struct pio_request *p)
-{
-	*pio_last = p;
-	p->next = NULL;
-	pio_last = &p->next;
-}
-
-/* Take the first page IO request off the queue */
-
-static inline struct pio_request * get_pio_request(void)
-{
-	struct pio_request * p = pio_first;
-	pio_first = p->next;
-	if (!pio_first)
-		pio_last = &pio_first;
-	return p;
-}
-
-/* Make a new page IO request and queue it to the kpiod thread */
-
-static inline void make_pio_request(struct file *file,
-				    unsigned long offset,
-				    unsigned long page)
-{
-	struct pio_request *p;
-
-	atomic_inc(&page_cache_entry(page)->count);
-
-	/* 
-	 * We need to allocate without causing any recursive IO in the
-	 * current thread's context.  We might currently be swapping out
-	 * as a result of an allocation made while holding a critical
-	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
-	 * the filesystem in this thread.
-	 *
-	 * We can wait for kswapd to free memory, or we can try to free
-	 * pages without actually performing further IO, without fear of
-	 * deadlock.  --sct
-	 */
-
-	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
-		if (try_to_free_pages(__GFP_WAIT))
-			continue;
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(HZ/10);
-	}
-	
-	p->file   = file;
-	p->offset = offset;
-	p->page   = page;
-
-	put_pio_request(p);
-	wake_up(&pio_wait);
-}
-
-
-/*
- * This is the only thread which is allowed to write out filemap pages
- * while swapping.
- * 
- * To avoid deadlock, it is important that we never reenter this thread.
- * Although recursive memory allocations within this thread may result
- * in more page swapping, that swapping will always be done by queuing
- * another IO request to the same thread: we will never actually start
- * that IO request until we have finished with the current one, and so
- * we will not deadlock.  
- */
-
-int kpiod(void * unused)
-{
-	struct task_struct *tsk = current;
-	struct wait_queue wait = { tsk, };
-	struct inode * inode;
-	struct dentry * dentry;
-	struct pio_request * p;
-	
-	tsk->session = 1;
-	tsk->pgrp = 1;
-	strcpy(tsk->comm, "kpiod");
-	sigfillset(&tsk->blocked);
-	init_waitqueue(&pio_wait);
-	/*
-	 * Mark this task as a memory allocator - we don't want to get caught
-	 * up in the regular mm freeing frenzy if we have to allocate memory
-	 * in order to write stuff out.
-	 */
-	tsk->flags |= PF_MEMALLOC;
-
-	lock_kernel();
-	
-	pio_request_cache = kmem_cache_create("pio_request", 
-					      sizeof(struct pio_request),
-					      0, SLAB_HWCACHE_ALIGN, 
-					      NULL, NULL);
-	if (!pio_request_cache)
-		panic ("Could not create pio_request slab cache");
-
-	while (1) {
-		tsk->state = TASK_INTERRUPTIBLE;
-		add_wait_queue(&pio_wait, &wait);
-		if (!pio_first)
-			schedule();
-		remove_wait_queue(&pio_wait, &wait);
-		tsk->state = TASK_RUNNING;
-
-		while (pio_first) {
-			p = get_pio_request();
-			dentry = p->file->f_dentry;
-			inode = dentry->d_inode;
-			
-			down(&inode->i_sem);
-			do_write_page(inode, p->file,
-				      (const char *) p->page, p->offset);
-			up(&inode->i_sem);
-			fput(p->file);
-			page_cache_free(p->page);
-			kmem_cache_free(pio_request_cache, p);
-		}
-	}
-}
-
 void __init page_cache_init(unsigned long memory_size)
 {
 	unsigned long htable_size;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8212c29bb780..533cca3ad461 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -93,34 +93,69 @@ static inline void remove_mem_queue(struct page * entry)
  */
 spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
 
+#define list(x) (mem_map+(x))
+#define __free_pages_ok(map_nr, mask, area, index)		\
+	nr_free_pages -= (mask);				\
+	while ((mask) + (1 << (NR_MEM_LISTS-1))) {		\
+		if (!test_and_change_bit((index), (area)->map))	\
+			break;					\
+		(area)->count--;				\
+		remove_mem_queue(list((map_nr) ^ -(mask)));	\
+		(mask) <<= 1;					\
+		(area)++;					\
+		(index) >>= 1;					\
+		(map_nr) &= (mask);				\
+	}							\
+	add_mem_queue(area, list(map_nr));
+
+static void free_local_pages(struct page * page) {
+	unsigned long order = page->offset;
+	unsigned int type = PageDMA(page) ? 1 : 0;
+	struct free_area_struct *area;
+	unsigned long map_nr = page - mem_map;
+	unsigned long mask = (~0UL) << order;
+	unsigned long index = map_nr >> (1 + order);
+
+	area = free_area[type] + order;
+	__free_pages_ok(map_nr, mask, area, index);
+}
+
 static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
 {
-	struct free_area_struct *area = free_area[type] + order;
-	unsigned long index = map_nr >> (1 + order);
-	unsigned long mask = (~0UL) << order;
+	struct free_area_struct *area;
+	unsigned long index;
+	unsigned long mask;
 	unsigned long flags;
+	struct page * page;
 
-	spin_lock_irqsave(&page_alloc_lock, flags);
-
-#define list(x) (mem_map+(x))
+	if (current->flags & PF_FREE_PAGES)
+		goto local_freelist;
+ back_local_freelist:
 
+	index = map_nr >> (1 + order);
+	mask = (~0UL) << order;
 	map_nr &= mask;
-	nr_free_pages -= mask;
-	while (mask + (1 << (NR_MEM_LISTS-1))) {
-		if (!test_and_change_bit(index, area->map))
-			break;
-		area->count--;
-		remove_mem_queue(list(map_nr ^ -mask));
-		mask <<= 1;
-		area++;
-		index >>= 1;
-		map_nr &= mask;
-	}
-	add_mem_queue(area, list(map_nr));
-
-#undef list
 
+	spin_lock_irqsave(&page_alloc_lock, flags);
+	area = free_area[type] + order;
+	__free_pages_ok(map_nr, mask, area, index);
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
+	return;
+
+ local_freelist:
+	/*
+	 * This is a little subtle: if the allocation order
+	 * wanted is greater than zero we'd better take all the pages
+	 * local since we must deal with fragmentation too and we
+	 * can't rely on the nr_local_pages information.
+	 */
+	if (current->nr_local_pages && !current->allocation_order)
+		goto back_local_freelist;
+
+	page = mem_map + map_nr;
+	list_add((struct list_head *) page, &current->local_pages);
+	page->offset = order;
+	current->nr_local_pages++;
 }
 
 void __free_pages(struct page *page, unsigned long order)
@@ -129,7 +164,6 @@ void __free_pages(struct page *page, unsigned long order)
 		if (PageSwapCache(page))
 			panic ("Freeing swap cache page");
 		page->flags &= ~(1 << PG_referenced);
-		page->age = PAGE_AGE_INITIAL;
 		free_pages_ok(page - mem_map, order, PageDMA(page) ? 1 : 0);
 		return;
 	}
@@ -180,13 +214,32 @@ do { unsigned long size = 1 << high; \
 	atomic_set(&map->count, 1); \
 } while (0)
 
+static void refile_local_pages(void)	/* drain current->local_pages through free_local_pages() */
+{
+	if (current->nr_local_pages) {
+		struct page * page;
+		struct list_head * entry;
+		int nr_pages = current->nr_local_pages;	/* expected list length, used by the sanity checks below */
+
+		while ((entry = current->local_pages.next) != &current->local_pages) {
+			list_del(entry);
+			page = (struct page *) entry;	/* entries were queued as a struct page cast to list_head */
+			free_local_pages(page);
+			if (!nr_pages--)	/* list held more entries than nr_local_pages counted */
+				panic("__get_free_pages local_pages list corrupted I");
+		}
+		if (nr_pages)	/* list held fewer entries than nr_local_pages counted */
+			panic("__get_free_pages local_pages list corrupted II");
+		current->nr_local_pages = 0;
+	}
+}
+
 unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 {
 	unsigned long flags;
-	static atomic_t free_before_allocate = ATOMIC_INIT(0);
 
 	if (order >= NR_MEM_LISTS)
-		goto nopage;
+		goto out;
 
 #ifdef ATOMIC_MEMORY_DEBUGGING
 	if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
@@ -195,26 +248,24 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 			printk("gfp called nonatomically from interrupt %p\n",
 				__builtin_return_address(0));
 		}
-		goto nopage;
+		goto out;
 	}
 #endif
 
+	/*
+	 * Acquire lock before reading nr_free_pages to make sure it
+	 * won't change from under us.
+	 */
+	spin_lock_irqsave(&page_alloc_lock, flags);
+
 	/*
 	 * If this is a recursive call, we'd better
 	 * do our best to just allocate things without
 	 * further thought.
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int freed;
 		extern struct wait_queue * kswapd_wait;
 
-		/* Somebody needs to free pages so we free some of our own. */
-		if (atomic_read(&free_before_allocate)) {
-			current->flags |= PF_MEMALLOC;
-			try_to_free_pages(gfp_mask);
-			current->flags &= ~PF_MEMALLOC;
-		}
-
 		if (nr_free_pages > freepages.low)
 			goto ok_to_allocate;
 
@@ -224,35 +275,44 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 		/* Do we have to block or can we proceed? */
 		if (nr_free_pages > freepages.min)
 			goto ok_to_allocate;
-
-		current->flags |= PF_MEMALLOC;
-		atomic_inc(&free_before_allocate);
-		freed = try_to_free_pages(gfp_mask);
-		atomic_dec(&free_before_allocate);
-		current->flags &= ~PF_MEMALLOC;
-
-		/*
-		 * Re-check we're still low on memory after we blocked
-		 * for some time. Somebody may have released lots of
-		 * memory from under us while we was trying to free
-		 * the pages. We check against pages_high to be sure
-		 * to succeed only if lots of memory is been released.
-		 */
-		if (nr_free_pages > freepages.high)
-			goto ok_to_allocate;
-
-		if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
-			goto nopage;
+		if (gfp_mask & __GFP_WAIT) {
+			int freed;
+			/*
+			 * If the task is allowed to sleep, it is also
+			 * fine for us to re-enable irqs here.
+			 */
+			spin_unlock_irq(&page_alloc_lock);
+
+			current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
+			current->allocation_order = order;
+			freed = try_to_free_pages(gfp_mask);
+			current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
+
+			spin_lock_irq(&page_alloc_lock);
+			refile_local_pages();
+
+			/*
+			 * Re-check we're still low on memory after we blocked
+			 * for some time. Somebody may have released lots of
+			 * memory from under us while we were trying to free
+			 * the pages. We check against freepages.high to be sure
+			 * to succeed only if lots of memory has been released.
+			 */
+			if (nr_free_pages > freepages.high)
+				goto ok_to_allocate;
+
+			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+				goto nopage;
+		}
 	}
 ok_to_allocate:
-	spin_lock_irqsave(&page_alloc_lock, flags);
 	/* if it's not a dma request, try non-dma first */
 	if (!(gfp_mask & __GFP_DMA))
 		RMQUEUE_TYPE(order, 0);
 	RMQUEUE_TYPE(order, 1);
+ nopage:
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
-
-nopage:
+ out:
 	return 0;
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8c5e7176c5aa..7c27e5b338db 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,7 @@ int add_to_swap_cache(struct page *page, unsigned long entry)
 		return 0;
 	}
 	atomic_inc(&page->count);
+	page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
 	page->inode = &swapper_inode;
 	page->offset = entry;
 	add_page_to_hash_queue(page, &swapper_inode, entry);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 86e6b1fe9a87..81ba9a55ba62 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -96,6 +96,9 @@ drop_pte:
 	 * some real work in the future in "shrink_mmap()".
 	 */
 	if (!pte_dirty(pte)) {
+		if (page_map->inode && pgcache_under_min())
+			/* unmapping this page would be useless */
+			return 0;
 		flush_cache_page(vma, address);
 		pte_clear(page_table);
 		goto drop_pte;
@@ -106,7 +109,7 @@ drop_pte:
 	 * we cannot do I/O! Avoid recursing on FS
 	 * locks etc.
 	 */
-	if (!(gfp_mask & __GFP_IO))
+	if (!(gfp_mask & __GFP_IO) || current->fs_locks)
 		return 0;
 
 	/*
@@ -208,6 +211,8 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
+		if (current->need_resched)
+			return 2;
 		address += PAGE_SIZE;
 		pte++;
 	} while (address < end);
@@ -327,7 +332,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
+	counter = nr_tasks / priority;
 	if (counter < 1)
 		counter = 1;
 
@@ -361,8 +366,13 @@ static int swap_out(unsigned int priority, int gfp_mask)
 			goto out;
 		}
 
-		if (swap_out_process(pbest, gfp_mask))
+		switch (swap_out_process(pbest, gfp_mask)) {
+		case 1:
 			return 1;
+		case 2:
+			current->state = TASK_RUNNING;
+			schedule();
+		}
 	}
 out:
 	return 0;
@@ -377,11 +387,9 @@ out:
  * cluster them so that we get good swap-out behaviour. See
  * the "free_memory()" macro for details.
  */
-static int do_try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int ret = 0;
-	int swapcount;
 	int count = SWAP_CLUSTER_MAX;
 
 	lock_kernel();
@@ -389,41 +397,34 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 6;
+	priority = 5;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
-			ret = 1;
 			if (!--count)
 				goto done;
 		}
 
 		/* Try to get rid of some shared memory pages.. */
-		if (gfp_mask & __GFP_IO) {
+		if (gfp_mask & __GFP_IO && !current->fs_locks) {
 			while (shm_swap(priority, gfp_mask)) {
-				ret = 1;
 				if (!--count)
 					goto done;
 			}
 		}
 
 		/* Then, try to page stuff out.. */
-		swapcount = count;
 		while (swap_out(priority, gfp_mask)) {
-			ret = 1;
-			if (!--swapcount)
-				break;
+			if (!--count)
+				goto done;
 		}
 
 		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+	} while (--priority > 0);
 done:
 	unlock_kernel();
 
-	if (!ret)
-		printk("VM: do_try_to_free_pages failed for %s...\n",
-				current->comm);
 	/* Return success if we freed a page. */
-	return ret;
+	return priority > 0;
 }
 
 /*
@@ -499,7 +500,7 @@ int kswapd(void *unused)
 
 		while (nr_free_pages < freepages.high)
 		{
-			if (do_try_to_free_pages(GFP_KSWAPD))
+			if (try_to_free_pages(GFP_KSWAPD))
 			{
 				if (tsk->need_resched)
 					schedule();
@@ -510,17 +511,3 @@ int kswapd(void *unused)
 		}
 	}
 }
-
-/*
- * Called by non-kswapd processes when kswapd really cannot
- * keep up with the demand for free memory.
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-	int retval = 1;
-
-	if (gfp_mask & __GFP_WAIT)
-		retval = do_try_to_free_pages(gfp_mask);
-	return retval;
-}
-	
-- 
2.39.5