From 616d8602876cdb9fe13e39179a696b5a69f393a6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linuxfoundation.org>
Date: Fri, 23 Nov 2007 15:18:11 -0500
Subject: [PATCH] Linux 2.2.2-pre2

this one contains various small documentation updates and updates to xconfig,
but the important parts (and the smallest part of the actual patch) are:

 - shared file lockup fix by Stephen Tweedie
 - my fix for the TCP bug that Ingo found
 - Ingo's io-apic setup fixes, which should finally get rid of the
   spurious apic interrupts with some motherboards and the ExtINT setup.
 - inode leak thing
 - SMP scheduler potential race condition fix
 - sound driver updates
 - partition and disk fixes (2kB blocksize media and some IDE disk
   geometry and irq detection issues).

None of the fixes are critical to most people, but all of them _can_ be
critical to people who have seen problems in the area. As such, if
you're happy with 2.2.1 there is no pressing reason to test this patch
out, but I hope to have the pre-patches tested so that the final 2.2.2 can be
left around for a while (CD-ROM manufacturers etc would certainly prefer
to not see lots of releases).

                Linus
---
 arch/i386/kernel/io_apic.c |  40 +++++++---
 drivers/char/vt.c          |  47 ++++++-----
 fs/inode.c                 |  69 ++++++++--------
 include/linux/sched.h      |   8 +-
 init/main.c                |   2 +
 kernel/ksyms.c             |   1 +
 kernel/sched.c             |  51 ++++++------
 mm/filemap.c               | 158 ++++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                |  14 ++--
 net/ipv4/proc.c            |   2 +
 net/ipv4/tcp_ipv4.c        |  21 +++--
 11 files changed, 289 insertions(+), 124 deletions(-)

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 0a2416141087..b57259afc1d8 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -202,7 +202,7 @@ DO_ACTION( enable,  1, |= 0xff000000, )				/* destination = 0xff */
 DO_ACTION( mask,    0, |= 0x00010000, io_apic_sync())		/* mask = 1 */
 DO_ACTION( unmask,  0, &= 0xfffeffff, )				/* mask = 0 */
 
-static void __init clear_IO_APIC_pin(unsigned int pin)
+static void clear_IO_APIC_pin(unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -215,6 +215,13 @@ static void __init clear_IO_APIC_pin(unsigned int pin)
 	io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1));
 }
 
+static void clear_IO_APIC (void)
+{
+	int pin;
+	/* Run clear_IO_APIC_pin() on every pin index below nr_ioapic_registers. */
+	for (pin = 0; pin < nr_ioapic_registers; pin++)
+		clear_IO_APIC_pin(pin);
+}
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -625,7 +632,7 @@ void __init setup_IO_APIC_irqs(void)
 /*
  * Set up a certain pin as ExtINT delivered interrupt
  */
-void __init setup_ExtINT_pin(unsigned int pin)
+void __init setup_ExtINT_pin(unsigned int pin, int irq)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -635,17 +642,16 @@ void __init setup_ExtINT_pin(unsigned int pin)
 	memset(&entry,0,sizeof(entry));
 
 	entry.delivery_mode = dest_ExtINT;
-	entry.dest_mode = 1;				/* logical delivery */
+	entry.dest_mode = 0;				/* physical delivery */
 	entry.mask = 0;					/* unmask IRQ now */
 	/*
-	 * Careful with this one. We do not use 'true' logical
-	 * delivery, as we set local APICs to LDR == 0. But
-	 * 0xff logical destination is special (broadcast).
-	 * Any other combination will cause problems.
+	 * We use physical delivery to get the timer IRQ
+	 * to the boot CPU. 'boot_cpu_id' is the physical
+	 * APIC ID of the boot CPU.
 	 */
-	entry.dest.logical.logical_dest = 0xff;
+	entry.dest.physical.physical_dest = boot_cpu_id;
 
-	entry.vector = 0;				/* it's ignored */
+	entry.vector = assign_irq_vector(irq);
 
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -760,7 +766,7 @@ void __init print_IO_APIC(void)
 
 static void __init init_sym_mode(void)
 {
-	int i, pin;
+	int i;
 
 	for (i = 0; i < PIN_MAP_SIZE; i++) {
 		irq_2_pin[i].pin = -1;
@@ -790,8 +796,7 @@ static void __init init_sym_mode(void)
 	/*
 	 * Do not trust the IO-APIC being empty at bootup
 	 */
-	for (pin = 0; pin < nr_ioapic_registers; pin++)
-		clear_IO_APIC_pin(pin);
+	clear_IO_APIC();
 }
 
 /*
@@ -799,6 +804,15 @@ static void __init init_sym_mode(void)
  */
 void init_pic_mode(void)
 {
+	/*
+	 * Clear the IO-APIC before rebooting:
+	 */
+	clear_IO_APIC();
+
+	/*
+	 * Put it back into PIC mode (has an effect only on
+	 * certain boards)
+	 */
 	printk("disabling symmetric IO mode... ");
 		outb_p(0x70, 0x22);
 		outb_p(0x00, 0x23);
@@ -1184,7 +1198,7 @@ static inline void check_timer(void)
 
 		if (pin2 != -1) {
 			printk(".. (found pin %d) ...", pin2);
-			setup_ExtINT_pin(pin2);
+			setup_ExtINT_pin(pin2, 0);
 			make_8259A_irq(0);
 		}
 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 6830089f96c2..97be390b03a1 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -165,7 +165,7 @@ do_kdsk_ioctl(int cmd, struct kbentry *user_kbe, int perm, struct kbd_struct *kb
 			val = K_HOLE;
 		} else
 		    val = (i ? K_HOLE : K_NOSUCHMAP);
-		return __put_user(val, &user_kbe->kb_value);
+		return put_user(val, &user_kbe->kb_value);
 	case KDSKBENT:
 		if (!perm)
 			return -EPERM;
@@ -244,7 +244,7 @@ do_kbkeycode_ioctl(int cmd, struct kbkeycode *user_kbkc, int perm)
 	case KDGETKEYCODE:
 		kc = getkeycode(tmp.scancode);
 		if (kc >= 0)
-			kc = __put_user(kc, &user_kbkc->keycode);
+			kc = put_user(kc, &user_kbkc->keycode);
 		break;
 	case KDSETKEYCODE:
 		if (!perm)
@@ -282,8 +282,8 @@ do_kdgkb_ioctl(int cmd, struct kbsentry *user_kdgkb, int perm)
 		p = func_table[i];
 		if(p)
 			for ( ; *p && sz; p++, sz--)
-				__put_user(*p, q++);
-		__put_user('\0', q);
+				put_user(*p, q++);
+		put_user('\0', q);
 		return ((p && *p) ? -EOVERFLOW : 0);
 	case KDSKBSENT:
 		if (!perm)
@@ -603,12 +603,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 	{
 		struct kbdiacrs *a = (struct kbdiacrs *)arg;
 
-		i = verify_area(VERIFY_WRITE, (void *) a, sizeof(struct kbdiacrs));
-		if (i)
-			return i;
-		__put_user(accent_table_size, &a->kb_cnt);
-		__copy_to_user(a->kbdiacr, accent_table,
-			    accent_table_size*sizeof(struct kbdiacr));
+		if (put_user(accent_table_size, &a->kb_cnt))
+			return -EFAULT;
+		if (copy_to_user(a->kbdiacr, accent_table, accent_table_size*sizeof(struct kbdiacr)))
+			return -EFAULT;
 		return 0;
 	}
 
@@ -619,14 +617,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 
 		if (!perm)
 			return -EPERM;
-		i = verify_area(VERIFY_READ, (void *) a, sizeof(struct kbdiacrs));
-		if (i)
-			return i;
-		__get_user(ct,&a->kb_cnt);
+		if (get_user(ct,&a->kb_cnt))
+			return -EFAULT;
 		if (ct >= MAX_DIACR)
 			return -EINVAL;
 		accent_table_size = ct;
-		__copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr));
+		if (copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr)))
+			return -EFAULT;
 		return 0;
 	}
 
@@ -717,12 +714,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_WRITE,(void *)vtstat, sizeof(struct vt_stat));
 		if (i)
 			return i;
-		__put_user(fg_console + 1, &vtstat->v_active);
+		put_user(fg_console + 1, &vtstat->v_active);
 		state = 1;	/* /dev/tty0 is always open */
 		for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1)
 			if (VT_IS_IN_USE(i))
 				state |= mask;
-		return __put_user(state, &vtstat->v_state);
+		return put_user(state, &vtstat->v_state);
 	}
 
 	/*
@@ -856,8 +853,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_READ, (void *)vtsizes, sizeof(struct vt_sizes));
 		if (i)
 			return i;
-		__get_user(ll, &vtsizes->v_rows);
-		__get_user(cc, &vtsizes->v_cols);
+		get_user(ll, &vtsizes->v_rows);
+		get_user(cc, &vtsizes->v_cols);
 		return vc_resize_all(ll, cc);
 	}
 
@@ -870,12 +867,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_READ, (void *)vtconsize, sizeof(struct vt_consize));
 		if (i)
 			return i;
-		__get_user(ll, &vtconsize->v_rows);
-		__get_user(cc, &vtconsize->v_cols);
-		__get_user(vlin, &vtconsize->v_vlin);
-		__get_user(clin, &vtconsize->v_clin);
-		__get_user(vcol, &vtconsize->v_vcol);
-		__get_user(ccol, &vtconsize->v_ccol);
+		get_user(ll, &vtconsize->v_rows);
+		get_user(cc, &vtconsize->v_cols);
+		get_user(vlin, &vtconsize->v_vlin);
+		get_user(clin, &vtconsize->v_clin);
+		get_user(vcol, &vtconsize->v_vcol);
+		get_user(ccol, &vtconsize->v_ccol);
 		vlin = vlin ? vlin : video_scan_lines;
 		if ( clin )
 		  {
diff --git a/fs/inode.c b/fs/inode.c
index 72a23f8584fa..347e88d37d01 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -232,13 +232,15 @@ void clear_inode(struct inode *inode)
 
 /*
  * Dispose-list gets a local list, so it doesn't need to
- * worry about list corruption.
+ * worry about list corruption. It releases the inode lock
+ * while clearing the inodes.
  */
 static void dispose_list(struct list_head * head)
 {
 	struct list_head *next;
 	int count = 0;
 
+	spin_unlock(&inode_lock);
 	next = head->next;
 	for (;;) {
 		struct list_head * tmp = next;
@@ -256,7 +258,6 @@ static void dispose_list(struct list_head * head)
 	spin_lock(&inode_lock);
 	list_splice(head, &inode_unused);
 	inodes_stat.nr_free_inodes += count;
-	spin_unlock(&inode_lock);
 }
 
 /*
@@ -305,52 +306,52 @@ int invalidate_inodes(struct super_block * sb)
 	spin_lock(&inode_lock);
 	busy = invalidate_list(&inode_in_use, sb, &throw_away);
 	busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-	spin_unlock(&inode_lock);
-
 	dispose_list(&throw_away);
+	spin_unlock(&inode_lock);
 
 	return busy;
 }
 
 /*
  * This is called with the inode lock held. It searches
- * the in-use for the specified number of freeable inodes.
- * Freeable inodes are moved to a temporary list and then
- * placed on the unused list by dispose_list.
+ * the in-use for freeable inodes, which are moved to a
+ * temporary list and then placed on the unused list by
+ * dispose_list. 
+ *
+ * We don't expect to have to call this very often.
  *
- * Note that we do not expect to have to search very hard:
- * the freeable inodes will be at the old end of the list.
- * 
- * N.B. The spinlock is released to call dispose_list.
+ * N.B. The spinlock is released during the call to
+ *      dispose_list.
  */
 #define CAN_UNUSE(inode) \
-	(((inode)->i_count == 0) && \
-	 (!(inode)->i_state))
+	(((inode)->i_count | (inode)->i_state) == 0)
+#define INODE(entry)	(list_entry(entry, struct inode, i_list))
 
-static int free_inodes(int goal)
+static int free_inodes(void)
 {
-	struct list_head *tmp, *head = &inode_in_use;
-	LIST_HEAD(freeable);
-	int found = 0, depth = goal << 1;
+	struct list_head list, *entry, *freeable = &list;
+	int found = 0;
 
-	while ((tmp = head->prev) != head && depth--) {
-		struct inode * inode = list_entry(tmp, struct inode, i_list);
+	INIT_LIST_HEAD(freeable);
+	entry = inode_in_use.next;
+	while (entry != &inode_in_use) {
+		struct list_head *tmp = entry;
+
+		entry = entry->next;
+		if (!CAN_UNUSE(INODE(tmp)))
+			continue;
 		list_del(tmp);
-		if (CAN_UNUSE(inode)) {
-			list_del(&inode->i_hash);
-			INIT_LIST_HEAD(&inode->i_hash);
-			list_add(tmp, &freeable);
-			if (++found < goal)
-				continue;
-			break;
-		}
-		list_add(tmp, head);
+		list_del(&INODE(tmp)->i_hash);
+		INIT_LIST_HEAD(&INODE(tmp)->i_hash);
+		list_add(tmp, freeable);
+		found = 1;
 	}
+
 	if (found) {
-		spin_unlock(&inode_lock);
-		dispose_list(&freeable);
-		spin_lock(&inode_lock);
+		dispose_list(freeable);
+		found = 1;	/* silly compiler */
 	}
+
 	return found;
 }
 
@@ -374,7 +375,7 @@ static void shrink_dentry_inodes(int goal)
 static void try_to_free_inodes(int goal)
 {
 	shrink_dentry_inodes(goal);
-	if (!free_inodes(goal))
+	if (!free_inodes())
 		shrink_dentry_inodes(goal);
 }
 
@@ -385,7 +386,7 @@ static void try_to_free_inodes(int goal)
 void free_inode_memory(int goal)
 {
 	spin_lock(&inode_lock);
-	free_inodes(goal);
+	free_inodes();
 	spin_unlock(&inode_lock);
 }
 
@@ -450,7 +451,7 @@ static struct inode * grow_inodes(void)
 	inodes_stat.preshrink = 1;
 
 	spin_lock(&inode_lock);
-	free_inodes(inodes_stat.nr_inodes >> 2);
+	free_inodes();
 	{
 		struct list_head *tmp = inode_unused.next;
 		if (tmp != &inode_unused) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ebb9bc276563..9b97235c83d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -174,6 +174,8 @@ struct mm_struct {
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long swap_cnt;	/* number of pages to swap on next pass */
+	unsigned long swap_address;
 	/*
 	 * This is an architecture-specific pointer: the portable
 	 * part of Linux does not know about any segments.
@@ -191,7 +193,7 @@ struct mm_struct {
 		0, 0, 0, 				\
 		0, 0, 0, 0,				\
 		0, 0, 0,				\
-		0, 0, NULL }
+		0, 0, 0, 0, NULL }
 
 struct signal_struct {
 	atomic_t		count;
@@ -276,8 +278,6 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
-	unsigned long swap_address;
-	unsigned long swap_cnt;		/* number of pages to swap on next pass */
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -361,7 +361,7 @@ struct task_struct {
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\
diff --git a/init/main.c b/init/main.c
index aea2ca978463..9b37f328ecc1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@ extern int console_loglevel;
 static int init(void *);
 extern int bdflush(void *);
 extern int kswapd(void *);
+extern int kpiod(void *);
 extern void kswapd_setup(void);
 
 extern void init_IRQ(void);
@@ -1271,6 +1272,7 @@ static void __init do_basic_setup(void)
 	kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	/* Start the background pageout daemon. */
 	kswapd_setup();
+	kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 84c345d82855..492433cde76c 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(update_vm_cache);
 EXPORT_SYMBOL(vmtruncate);
 EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(in_group_p);
diff --git a/kernel/sched.c b/kernel/sched.c
index add76fbe0bf6..513ef16f92e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -680,8 +680,18 @@ asmlinkage void schedule(void)
 
 	sched_data->prevstate = prev->state;
 
+/* this is the scheduler proper: */
 	{
 		struct task_struct * p = init_task.next_run;
+		int c = -1000;
+
+		/* Default process to select.. */
+		next = idle_task;
+		if (prev->state == TASK_RUNNING) {
+			c = goodness(prev, prev, this_cpu);
+			next = prev;
+		}
+
 		/*
 		 * This is subtle.
 		 * Note how we can enable interrupts here, even
@@ -693,36 +703,27 @@ asmlinkage void schedule(void)
 		 * the scheduler lock
 		 */
 		spin_unlock_irq(&runqueue_lock);
-#ifdef __SMP__
-		prev->has_cpu = 0;
-#endif
-	
 /*
  * Note! there may appear new tasks on the run-queue during this, as
  * interrupts are enabled. However, they will be put on front of the
  * list, so our list starting at "p" is essentially fixed.
  */
-/* this is the scheduler proper: */
-		{
-			int c = -1000;
-			next = idle_task;
-			while (p != &init_task) {
-				if (can_schedule(p)) {
-					int weight = goodness(p, prev, this_cpu);
-					if (weight > c)
-						c = weight, next = p;
-				}
-				p = p->next_run;
+		while (p != &init_task) {
+			if (can_schedule(p)) {
+				int weight = goodness(p, prev, this_cpu);
+				if (weight > c)
+					c = weight, next = p;
 			}
+			p = p->next_run;
+		}
 
-			/* Do we need to re-calculate counters? */
-			if (!c) {
-				struct task_struct *p;
-				read_lock(&tasklist_lock);
-				for_each_task(p)
-					p->counter = (p->counter >> 1) + p->priority;
-				read_unlock(&tasklist_lock);
-			}
+		/* Do we need to re-calculate counters? */
+		if (!c) {
+			struct task_struct *p;
+			read_lock(&tasklist_lock);
+			for_each_task(p)
+				p->counter = (p->counter >> 1) + p->priority;
+			read_unlock(&tasklist_lock);
 		}
 	}
 
@@ -751,10 +752,8 @@ asmlinkage void schedule(void)
 	 * thus we have to lock the previous process from getting
 	 * rescheduled during switch_to().
 	 */
-	prev->has_cpu = 1;
-
- 	next->has_cpu = 1;
  	next->processor = this_cpu;
+ 	next->has_cpu = 1;
 	spin_unlock(&scheduler_lock);
 #endif /* __SMP__ */
  	if (prev != next) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3c15ea63b3ce..849c2a93cabb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -19,6 +19,7 @@
 #include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/swapctl.h>
+#include <linux/slab.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -39,6 +40,26 @@ struct page * page_hash_table[PAGE_HASH_SIZE];
 
 #define release_page(page) __free_page((page))
 
+/* 
+ * Define a request structure for outstanding page write requests
+ * to the background page io daemon
+ */
+
+struct pio_request 
+{
+	struct pio_request *	next;	/* next queued request; NULL at the tail */
+	struct file *		file;	/* file to write to; kpiod calls fput() on it when done */
+	unsigned long		offset;	/* offset handed to do_write_page() */
+	unsigned long		page;	/* address of the page to write; kpiod calls free_page() on it */
+};
+static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
+static kmem_cache_t *pio_request_cache;
+static struct wait_queue *pio_wait = NULL;
+
+static inline void 
+make_pio_request(struct file *, unsigned long, unsigned long);
+
+
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -1079,8 +1100,9 @@ static inline int do_write_page(struct inode * inode, struct file * file,
 }
 
 static int filemap_write_page(struct vm_area_struct * vma,
-	unsigned long offset,
-	unsigned long page)
+			      unsigned long offset,
+			      unsigned long page,
+			      int wait)
 {
 	int result;
 	struct file * file;
@@ -1098,6 +1120,17 @@ static int filemap_write_page(struct vm_area_struct * vma,
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
+
+	/* 
+	 * If this is a swapping operation rather than msync(), then
+	 * leave the actual IO, and the restoration of the file count,
+	 * to the kpiod thread.  Just queue the request for now.
+	 */
+	if (!wait) {
+		make_pio_request(file, offset, page);
+		return 0;
+	}
+	
 	down(&inode->i_sem);
 	result = do_write_page(inode, file, (const char *) page, offset);
 	up(&inode->i_sem);
@@ -1113,7 +1146,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-	return filemap_write_page(vma, page->offset, page_address(page));
+	return filemap_write_page(vma, page->offset, page_address(page), 0);
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1150,7 +1183,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 			return 0;
 		}
 	}
-	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
+	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
 	free_page(page);
 	return error;
 }
@@ -1569,3 +1602,120 @@ void put_cached_page(unsigned long addr)
 	wake_up(&page->wait);
 	__free_page(page);
 }
+
+
+/* Append page IO request p to the tail of the queue headed by pio_first. */
+
+static inline void put_pio_request(struct pio_request *p)
+{
+	*pio_last = p;		/* pio_last is &pio_first when the queue is empty, else the old tail's next */
+	p->next = NULL;
+	pio_last = &p->next;	/* p is now the tail */
+}
+
+/* Unlink and return the first queued request; pio_first must be non-NULL (it is dereferenced unchecked). */
+
+static inline struct pio_request * get_pio_request(void)
+{
+	struct pio_request * p = pio_first;
+	pio_first = p->next;
+	if (!pio_first)			/* queue drained: point the tail back at the head slot */
+		pio_last = &pio_first;
+	return p;
+}
+
+/* Make a new page IO request and queue it to the kpiod thread; loops until a request can be allocated. */
+
+static inline void make_pio_request(struct file *file,
+				    unsigned long offset,
+				    unsigned long page)
+{
+	struct pio_request *p;
+
+	atomic_inc(&mem_map[MAP_NR(page)].count);	/* extra page reference; presumably paired with free_page() in kpiod */
+
+	/* 
+	 * We need to allocate without causing any recursive IO in the
+	 * current thread's context.  We might currently be swapping out
+	 * as a result of an allocation made while holding a critical
+	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
+	 * the filesystem in this thread.
+	 *
+	 * We can wait for kswapd to free memory, or we can try to free
+	 * pages without actually performing further IO, without fear of
+	 * deadlock.  --sct
+	 */
+
+	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
+		if (try_to_free_pages(__GFP_WAIT))
+			continue;	/* nonzero return: retry the allocation at once */
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(HZ/10);	/* otherwise sleep for up to HZ/10 ticks, then retry */
+	}
+	
+	p->file   = file;
+	p->offset = offset;
+	p->page   = page;
+
+	put_pio_request(p);
+	wake_up(&pio_wait);	/* kpiod waits on pio_wait for new requests */
+}
+
+
+/*
+ * This is the only thread which is allowed to write out filemap pages
+ * while swapping.
+ * 
+ * To avoid deadlock, it is important that we never reenter this thread.
+ * Although recursive memory allocations within this thread may result
+ * in more page swapping, that swapping will always be done by queuing
+ * another IO request to the same thread: we will never actually start
+ * that IO request until we have finished with the current one, and so
+ * we will not deadlock.  
+ */
+
+int kpiod(void * unused)
+{
+	struct wait_queue wait = {current};	/* wait-queue entry for this thread */
+	struct inode * inode;
+	struct dentry * dentry;
+	struct pio_request * p;
+	
+	current->session = 1;
+	current->pgrp = 1;
+	strcpy(current->comm, "kpiod");
+	sigfillset(&current->blocked);	/* block every signal */
+	init_waitqueue(&pio_wait);
+
+	lock_kernel();
+	
+	pio_request_cache = kmem_cache_create("pio_request", 
+					      sizeof(struct pio_request),
+					      0, SLAB_HWCACHE_ALIGN, 
+					      NULL, NULL);
+	if (!pio_request_cache)
+		panic ("Could not create pio_request slab cache");
+	
+	while (1) {	/* no exit path: the function never returns */
+		current->state = TASK_INTERRUPTIBLE;
+		add_wait_queue(&pio_wait, &wait);
+		while (!pio_first)	/* sleep until make_pio_request() queues something */
+			schedule();
+		remove_wait_queue(&pio_wait, &wait);
+		current->state = TASK_RUNNING;
+
+		while (pio_first) {	/* drain the queue, one request at a time */
+			p = get_pio_request();
+			dentry = p->file->f_dentry;
+			inode = dentry->d_inode;
+			
+			down(&inode->i_sem);	/* the write is done holding the inode semaphore */
+			do_write_page(inode, p->file,
+				      (const char *) p->page, p->offset);
+			up(&inode->i_sem);
+			fput(p->file);	/* restore the file count raised by filemap_write_page() */
+			free_page(p->page);	/* presumably pairs with the atomic_inc in make_pio_request() */
+			kmem_cache_free(pio_request_cache, p);
+		}
+	}
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 116096153341..7dbae4cfc3e5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,7 +202,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
+		tsk->mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -274,7 +274,7 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->swap_address;
+	address = p->mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
@@ -296,8 +296,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
-	p->swap_address = 0;
+	p->mm->swap_cnt = 0;
+	p->mm->swap_address = 0;
 	return 0;
 }
 
@@ -345,9 +345,9 @@ static int swap_out(unsigned int priority, int gfp_mask)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
-				p->swap_cnt = p->mm->rss;
-			if (p->swap_cnt > max_cnt) {
-				max_cnt = p->swap_cnt;
+				p->mm->swap_cnt = p->mm->rss;
+			if (p->mm->swap_cnt > max_cnt) {
+				max_cnt = p->mm->swap_cnt;
 				pbest = p;
 			}
 		}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f8990903ee9c..d21b1065308b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -184,6 +184,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
 
 			for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req;
 			     i++, req = req->dl_next) {
+				if (req->sk)
+					continue;
 				pos += 128;
 				if (pos < offset) 
 					continue;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 660e64c44ffd..18a058c3189a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1563,12 +1563,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	}
 #endif /* CONFIG_FILTER */
 
-	/*
-	 *	socket locking is here for SMP purposes as backlog rcv
-	 *	is currently called with bh processing disabled.
-	 */
-	lock_sock(sk); 
-
 	/* 
 	 * This doesn't check if the socket has enough room for the packet.
 	 * Either process the packet _without_ queueing it and then free it,
@@ -1579,7 +1573,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
 			goto reset;
-		release_sock(sk);
 		return 0; 
 	} 
 
@@ -1590,14 +1583,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk) 
 			goto discard;
-		lock_sock(nsk);
-		release_sock(sk);
+
+		/*
+		 * Queue it on the new socket if the new socket is active,
+		 * otherwise we just shortcircuit this and continue with
+		 * the new socket..
+		 */
+		if (atomic_read(&nsk->sock_readers)) {
+			__skb_queue_tail(&nsk->back_log, skb);
+			return 0;
+		}
 		sk = nsk;
 	}
 	
 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
 		goto reset;
-	release_sock(sk); 
 	return 0;
 
 reset:
@@ -1609,7 +1609,6 @@ discard:
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
-	release_sock(sk);  
 	return 0;
 }
 
-- 
2.39.5