Linux 2.2.2-pre2 (tag: 2.2.2pre2)
author     Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:18:11 +0000 (15:18 -0500)
committer  Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:18:11 +0000 (15:18 -0500)
This one contains various small documentation updates and updates to xconfig,
but the important parts (and the smallest part of the actual patch) are:

 - shared file lockup fix by Stephen Tweedie
 - my fix for the TCP bug that Ingo found
 - Ingo's io-apic setup fixes, which should finally get rid of the
   spurious apic interrupts with some motherboards and the ExtINT setup.
 - inode leak thing
 - SMP scheduler potential race condition fix
 - sound driver updates
 - partition and disk fixes (2kB blocksize media and some IDE disk
   geometry and irq detection issues).

None of the fixes are critical to most people, but all of them _can_ be
critical to people who have seen problems in the area. As such, if
you're happy with 2.2.1 there is no pressing reason to test this patch
out, but I hope people will test the pre-patches so that the final 2.2.2
can be left around for a while (CD-ROM manufacturers etc. would certainly
prefer not to see lots of releases).

                Linus

arch/i386/kernel/io_apic.c
drivers/char/vt.c
fs/inode.c
include/linux/sched.h
init/main.c
kernel/ksyms.c
kernel/sched.c
mm/filemap.c
mm/vmscan.c
net/ipv4/proc.c
net/ipv4/tcp_ipv4.c

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 0a24161410879e2ddd0c00003662b8add36df61e..b57259afc1d8a5289b47f38126b855348f36de1c 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -202,7 +202,7 @@ DO_ACTION( enable,  1, |= 0xff000000, )                             /* destination = 0xff */
 DO_ACTION( mask,    0, |= 0x00010000, io_apic_sync())          /* mask = 1 */
 DO_ACTION( unmask,  0, &= 0xfffeffff, )                                /* mask = 0 */
 
-static void __init clear_IO_APIC_pin(unsigned int pin)
+static void clear_IO_APIC_pin(unsigned int pin)
 {
        struct IO_APIC_route_entry entry;
 
@@ -215,6 +215,13 @@ static void __init clear_IO_APIC_pin(unsigned int pin)
        io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1));
 }
 
+static void clear_IO_APIC (void)
+{
+       int pin;
+
+       for (pin = 0; pin < nr_ioapic_registers; pin++)
+               clear_IO_APIC_pin(pin);
+}
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -625,7 +632,7 @@ void __init setup_IO_APIC_irqs(void)
 /*
  * Set up a certain pin as ExtINT delivered interrupt
  */
-void __init setup_ExtINT_pin(unsigned int pin)
+void __init setup_ExtINT_pin(unsigned int pin, int irq)
 {
        struct IO_APIC_route_entry entry;
 
@@ -635,17 +642,16 @@ void __init setup_ExtINT_pin(unsigned int pin)
        memset(&entry,0,sizeof(entry));
 
        entry.delivery_mode = dest_ExtINT;
-       entry.dest_mode = 1;                            /* logical delivery */
+       entry.dest_mode = 0;                            /* physical delivery */
        entry.mask = 0;                                 /* unmask IRQ now */
        /*
-        * Careful with this one. We do not use 'true' logical
-        * delivery, as we set local APICs to LDR == 0. But
-        * 0xff logical destination is special (broadcast).
-        * Any other combination will cause problems.
+        * We use physical delivery to get the timer IRQ
+        * to the boot CPU. 'boot_cpu_id' is the physical
+        * APIC ID of the boot CPU.
         */
-       entry.dest.logical.logical_dest = 0xff;
+       entry.dest.physical.physical_dest = boot_cpu_id;
 
-       entry.vector = 0;                               /* it's ignored */
+       entry.vector = assign_irq_vector(irq);
 
        entry.polarity = 0;
        entry.trigger = 0;
@@ -760,7 +766,7 @@ void __init print_IO_APIC(void)
 
 static void __init init_sym_mode(void)
 {
-       int i, pin;
+       int i;
 
        for (i = 0; i < PIN_MAP_SIZE; i++) {
                irq_2_pin[i].pin = -1;
@@ -790,8 +796,7 @@ static void __init init_sym_mode(void)
        /*
         * Do not trust the IO-APIC being empty at bootup
         */
-       for (pin = 0; pin < nr_ioapic_registers; pin++)
-               clear_IO_APIC_pin(pin);
+       clear_IO_APIC();
 }
 
 /*
@@ -799,6 +804,15 @@ static void __init init_sym_mode(void)
  */
 void init_pic_mode(void)
 {
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       /*
+        * Put it back into PIC mode (has an effect only on
+        * certain boards)
+        */
        printk("disabling symmetric IO mode... ");
                outb_p(0x70, 0x22);
                outb_p(0x00, 0x23);
@@ -1184,7 +1198,7 @@ static inline void check_timer(void)
 
                if (pin2 != -1) {
                        printk(".. (found pin %d) ...", pin2);
-                       setup_ExtINT_pin(pin2);
+                       setup_ExtINT_pin(pin2, 0);
                        make_8259A_irq(0);
                }
 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 6830089f96c2c2236cebe98ce240a45b1ef355b2..97be390b03a1ed60cdb8cbb08480d03783f85ad2 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -165,7 +165,7 @@ do_kdsk_ioctl(int cmd, struct kbentry *user_kbe, int perm, struct kbd_struct *kb
                        val = K_HOLE;
                } else
                    val = (i ? K_HOLE : K_NOSUCHMAP);
-               return __put_user(val, &user_kbe->kb_value);
+               return put_user(val, &user_kbe->kb_value);
        case KDSKBENT:
                if (!perm)
                        return -EPERM;
@@ -244,7 +244,7 @@ do_kbkeycode_ioctl(int cmd, struct kbkeycode *user_kbkc, int perm)
        case KDGETKEYCODE:
                kc = getkeycode(tmp.scancode);
                if (kc >= 0)
-                       kc = __put_user(kc, &user_kbkc->keycode);
+                       kc = put_user(kc, &user_kbkc->keycode);
                break;
        case KDSETKEYCODE:
                if (!perm)
@@ -282,8 +282,8 @@ do_kdgkb_ioctl(int cmd, struct kbsentry *user_kdgkb, int perm)
                p = func_table[i];
                if(p)
                        for ( ; *p && sz; p++, sz--)
-                               __put_user(*p, q++);
-               __put_user('\0', q);
+                               put_user(*p, q++);
+               put_user('\0', q);
                return ((p && *p) ? -EOVERFLOW : 0);
        case KDSKBSENT:
                if (!perm)
@@ -603,12 +603,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
        {
                struct kbdiacrs *a = (struct kbdiacrs *)arg;
 
-               i = verify_area(VERIFY_WRITE, (void *) a, sizeof(struct kbdiacrs));
-               if (i)
-                       return i;
-               __put_user(accent_table_size, &a->kb_cnt);
-               __copy_to_user(a->kbdiacr, accent_table,
-                           accent_table_size*sizeof(struct kbdiacr));
+               if (put_user(accent_table_size, &a->kb_cnt))
+                       return -EFAULT;
+               if (copy_to_user(a->kbdiacr, accent_table, accent_table_size*sizeof(struct kbdiacr)))
+                       return -EFAULT;
                return 0;
        }
 
@@ -619,14 +617,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 
                if (!perm)
                        return -EPERM;
-               i = verify_area(VERIFY_READ, (void *) a, sizeof(struct kbdiacrs));
-               if (i)
-                       return i;
-               __get_user(ct,&a->kb_cnt);
+               if (get_user(ct,&a->kb_cnt))
+                       return -EFAULT;
                if (ct >= MAX_DIACR)
                        return -EINVAL;
                accent_table_size = ct;
-               __copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr));
+               if (copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr)))
+                       return -EFAULT;
                return 0;
        }
 
@@ -717,12 +714,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_WRITE,(void *)vtstat, sizeof(struct vt_stat));
                if (i)
                        return i;
-               __put_user(fg_console + 1, &vtstat->v_active);
+               put_user(fg_console + 1, &vtstat->v_active);
                state = 1;      /* /dev/tty0 is always open */
                for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1)
                        if (VT_IS_IN_USE(i))
                                state |= mask;
-               return __put_user(state, &vtstat->v_state);
+               return put_user(state, &vtstat->v_state);
        }
 
        /*
@@ -856,8 +853,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_READ, (void *)vtsizes, sizeof(struct vt_sizes));
                if (i)
                        return i;
-               __get_user(ll, &vtsizes->v_rows);
-               __get_user(cc, &vtsizes->v_cols);
+               get_user(ll, &vtsizes->v_rows);
+               get_user(cc, &vtsizes->v_cols);
                return vc_resize_all(ll, cc);
        }
 
@@ -870,12 +867,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_READ, (void *)vtconsize, sizeof(struct vt_consize));
                if (i)
                        return i;
-               __get_user(ll, &vtconsize->v_rows);
-               __get_user(cc, &vtconsize->v_cols);
-               __get_user(vlin, &vtconsize->v_vlin);
-               __get_user(clin, &vtconsize->v_clin);
-               __get_user(vcol, &vtconsize->v_vcol);
-               __get_user(ccol, &vtconsize->v_ccol);
+               get_user(ll, &vtconsize->v_rows);
+               get_user(cc, &vtconsize->v_cols);
+               get_user(vlin, &vtconsize->v_vlin);
+               get_user(clin, &vtconsize->v_clin);
+               get_user(vcol, &vtconsize->v_vcol);
+               get_user(ccol, &vtconsize->v_ccol);
                vlin = vlin ? vlin : video_scan_lines;
                if ( clin )
                  {
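The vt.c changes above replace the unchecked __put_user()/__get_user()/__copy_to_user()
calls (which assume a prior verify_area()) with the checking put_user()/get_user()/
copy_to_user() variants, which validate the user pointer themselves and report faults.
A minimal sketch of the resulting idiom follows; the helper name and its arguments are
invented for illustration and are not part of the patch:

#include <linux/errno.h>
#include <asm/uaccess.h>        /* put_user() */

/*
 * Hypothetical ioctl helper: put_user() verifies the user pointer
 * itself and returns non-zero on a fault, so the old
 * verify_area() + __put_user() pair collapses into one checked call.
 */
static int report_state(int *uptr, int state)
{
        if (put_user(state, uptr))
                return -EFAULT;
        return 0;
}
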
diff --git a/fs/inode.c b/fs/inode.c
index 72a23f8584fa4765ca35c95a8656312c63916f41..347e88d37d0143dd05b9863354de1a5dc8267a13 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -232,13 +232,15 @@ void clear_inode(struct inode *inode)
 
 /*
  * Dispose-list gets a local list, so it doesn't need to
- * worry about list corruption.
+ * worry about list corruption. It releases the inode lock
+ * while clearing the inodes.
  */
 static void dispose_list(struct list_head * head)
 {
        struct list_head *next;
        int count = 0;
 
+       spin_unlock(&inode_lock);
        next = head->next;
        for (;;) {
                struct list_head * tmp = next;
@@ -256,7 +258,6 @@ static void dispose_list(struct list_head * head)
        spin_lock(&inode_lock);
        list_splice(head, &inode_unused);
        inodes_stat.nr_free_inodes += count;
-       spin_unlock(&inode_lock);
 }
 
 /*
@@ -305,52 +306,52 @@ int invalidate_inodes(struct super_block * sb)
        spin_lock(&inode_lock);
        busy = invalidate_list(&inode_in_use, sb, &throw_away);
        busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-       spin_unlock(&inode_lock);
-
        dispose_list(&throw_away);
+       spin_unlock(&inode_lock);
 
        return busy;
 }
 
 /*
  * This is called with the inode lock held. It searches
- * the in-use for the specified number of freeable inodes.
- * Freeable inodes are moved to a temporary list and then
- * placed on the unused list by dispose_list.
+ * the in-use for freeable inodes, which are moved to a
+ * temporary list and then placed on the unused list by
+ * dispose_list. 
+ *
+ * We don't expect to have to call this very often.
  *
- * Note that we do not expect to have to search very hard:
- * the freeable inodes will be at the old end of the list.
- * 
- * N.B. The spinlock is released to call dispose_list.
+ * N.B. The spinlock is released during the call to
+ *      dispose_list.
  */
 #define CAN_UNUSE(inode) \
-       (((inode)->i_count == 0) && \
-        (!(inode)->i_state))
+       (((inode)->i_count | (inode)->i_state) == 0)
+#define INODE(entry)   (list_entry(entry, struct inode, i_list))
 
-static int free_inodes(int goal)
+static int free_inodes(void)
 {
-       struct list_head *tmp, *head = &inode_in_use;
-       LIST_HEAD(freeable);
-       int found = 0, depth = goal << 1;
+       struct list_head list, *entry, *freeable = &list;
+       int found = 0;
 
-       while ((tmp = head->prev) != head && depth--) {
-               struct inode * inode = list_entry(tmp, struct inode, i_list);
+       INIT_LIST_HEAD(freeable);
+       entry = inode_in_use.next;
+       while (entry != &inode_in_use) {
+               struct list_head *tmp = entry;
+
+               entry = entry->next;
+               if (!CAN_UNUSE(INODE(tmp)))
+                       continue;
                list_del(tmp);
-               if (CAN_UNUSE(inode)) {
-                       list_del(&inode->i_hash);
-                       INIT_LIST_HEAD(&inode->i_hash);
-                       list_add(tmp, &freeable);
-                       if (++found < goal)
-                               continue;
-                       break;
-               }
-               list_add(tmp, head);
+               list_del(&INODE(tmp)->i_hash);
+               INIT_LIST_HEAD(&INODE(tmp)->i_hash);
+               list_add(tmp, freeable);
+               found = 1;
        }
+
        if (found) {
-               spin_unlock(&inode_lock);
-               dispose_list(&freeable);
-               spin_lock(&inode_lock);
+               dispose_list(freeable);
+               found = 1;      /* silly compiler */
        }
+
        return found;
 }
 
@@ -374,7 +375,7 @@ static void shrink_dentry_inodes(int goal)
 static void try_to_free_inodes(int goal)
 {
        shrink_dentry_inodes(goal);
-       if (!free_inodes(goal))
+       if (!free_inodes())
                shrink_dentry_inodes(goal);
 }
 
@@ -385,7 +386,7 @@ static void try_to_free_inodes(int goal)
 void free_inode_memory(int goal)
 {
        spin_lock(&inode_lock);
-       free_inodes(goal);
+       free_inodes();
        spin_unlock(&inode_lock);
 }
 
@@ -450,7 +451,7 @@ static struct inode * grow_inodes(void)
        inodes_stat.preshrink = 1;
 
        spin_lock(&inode_lock);
-       free_inodes(inodes_stat.nr_inodes >> 2);
+       free_inodes();
        {
                struct list_head *tmp = inode_unused.next;
                if (tmp != &inode_unused) {
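The inode.c changes above move the lock handling into dispose_list(): callers now gather
freeable inodes onto a private list while holding inode_lock, and dispose_list() itself
releases the lock while clearing the inodes and retakes it to splice them onto the unused
list. The general shape of that "collect under the lock, free outside it" pattern, as a
small user-space analogue using pthreads (all names invented, not kernel code):

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; int busy; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_list;

/*
 * Unlink every freeable node while the lock is held, then drop the
 * lock for the (potentially slow) freeing work.  Nothing else can see
 * the private list, so it needs no locking of its own.
 */
static void reap(void)
{
        struct node *freeable = NULL, **pp, *n;

        pthread_mutex_lock(&list_lock);
        pp = &shared_list;
        while ((n = *pp) != NULL) {
                if (n->busy) {
                        pp = &n->next;
                        continue;
                }
                *pp = n->next;          /* unlink from the shared list */
                n->next = freeable;     /* move to the private list */
                freeable = n;
        }
        pthread_mutex_unlock(&list_lock);

        while ((n = freeable) != NULL) {
                freeable = n->next;
                free(n);                /* slow work, done unlocked */
        }
}
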
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ebb9bc2765630cea885979e12504c8399cbb282b..9b97235c83d7da675ffc440cf3a88c456710270d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -174,6 +174,8 @@ struct mm_struct {
        unsigned long rss, total_vm, locked_vm;
        unsigned long def_flags;
        unsigned long cpu_vm_mask;
+       unsigned long swap_cnt; /* number of pages to swap on next pass */
+       unsigned long swap_address;
        /*
         * This is an architecture-specific pointer: the portable
         * part of Linux does not know about any segments.
@@ -191,7 +193,7 @@ struct mm_struct {
                0, 0, 0,                                \
                0, 0, 0, 0,                             \
                0, 0, 0,                                \
-               0, 0, NULL }
+               0, 0, 0, 0, NULL }
 
 struct signal_struct {
        atomic_t                count;
@@ -276,8 +278,6 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
        unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
        int swappable:1;
-       unsigned long swap_address;
-       unsigned long swap_cnt;         /* number of pages to swap on next pass */
 /* process credentials */
        uid_t uid,euid,suid,fsuid;
        gid_t gid,egid,sgid,fsgid;
@@ -361,7 +361,7 @@ struct task_struct {
 /* utime */    {0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */      0,0,0,0,0,0, \
-/* swp */      0,0,0, \
+/* swp */      0, \
 /* process credentials */                                      \
 /* uid etc */  0,0,0,0,0,0,0,0,                                \
 /* suppl grps*/ 0, {0,},                                       \
diff --git a/init/main.c b/init/main.c
index aea2ca978463a078a11197e165d7a484e35f6329..9b37f328ecc1fbb917e0a4f947ca380648403b86 100644
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@ extern int console_loglevel;
 static int init(void *);
 extern int bdflush(void *);
 extern int kswapd(void *);
+extern int kpiod(void *);
 extern void kswapd_setup(void);
 
 extern void init_IRQ(void);
@@ -1271,6 +1272,7 @@ static void __init do_basic_setup(void)
        kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        /* Start the background pageout daemon. */
        kswapd_setup();
+       kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 84c345d8285502f367e7d629203e7e36a227e2b7..492433cde76c76e5259a5e57d3d9296eaa7a4d8b 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(update_vm_cache);
 EXPORT_SYMBOL(vmtruncate);
 EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(in_group_p);
diff --git a/kernel/sched.c b/kernel/sched.c
index add76fbe0bf6f6ca4a74447469552962c6b271bc..513ef16f92e855a557bec2bfdc08a3a59e2687c3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -680,8 +680,18 @@ asmlinkage void schedule(void)
 
        sched_data->prevstate = prev->state;
 
+/* this is the scheduler proper: */
        {
                struct task_struct * p = init_task.next_run;
+               int c = -1000;
+
+               /* Default process to select.. */
+               next = idle_task;
+               if (prev->state == TASK_RUNNING) {
+                       c = goodness(prev, prev, this_cpu);
+                       next = prev;
+               }
+
                /*
                 * This is subtle.
                 * Note how we can enable interrupts here, even
@@ -693,36 +703,27 @@ asmlinkage void schedule(void)
                 * the scheduler lock
                 */
                spin_unlock_irq(&runqueue_lock);
-#ifdef __SMP__
-               prev->has_cpu = 0;
-#endif
-       
 /*
  * Note! there may appear new tasks on the run-queue during this, as
  * interrupts are enabled. However, they will be put on front of the
  * list, so our list starting at "p" is essentially fixed.
  */
-/* this is the scheduler proper: */
-               {
-                       int c = -1000;
-                       next = idle_task;
-                       while (p != &init_task) {
-                               if (can_schedule(p)) {
-                                       int weight = goodness(p, prev, this_cpu);
-                                       if (weight > c)
-                                               c = weight, next = p;
-                               }
-                               p = p->next_run;
+               while (p != &init_task) {
+                       if (can_schedule(p)) {
+                               int weight = goodness(p, prev, this_cpu);
+                               if (weight > c)
+                                       c = weight, next = p;
                        }
+                       p = p->next_run;
+               }
 
-                       /* Do we need to re-calculate counters? */
-                       if (!c) {
-                               struct task_struct *p;
-                               read_lock(&tasklist_lock);
-                               for_each_task(p)
-                                       p->counter = (p->counter >> 1) + p->priority;
-                               read_unlock(&tasklist_lock);
-                       }
+               /* Do we need to re-calculate counters? */
+               if (!c) {
+                       struct task_struct *p;
+                       read_lock(&tasklist_lock);
+                       for_each_task(p)
+                               p->counter = (p->counter >> 1) + p->priority;
+                       read_unlock(&tasklist_lock);
                }
        }
 
@@ -751,10 +752,8 @@ asmlinkage void schedule(void)
         * thus we have to lock the previous process from getting
         * rescheduled during switch_to().
         */
-       prev->has_cpu = 1;
-
-       next->has_cpu = 1;
        next->processor = this_cpu;
+       next->has_cpu = 1;
        spin_unlock(&scheduler_lock);
 #endif /* __SMP__ */
        if (prev != next) {
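The restructured schedule() above folds the previous task into the selection up front: if
it is still runnable it becomes the default candidate with its goodness() precomputed, the
run queue is then scanned for anything with a higher weight, and if every runnable task has
a weight of zero the counters are refreshed. The selection itself is a plain arg-max scan;
here is a compact user-space sketch of that shape (goodness() and the task list are
stand-ins, not the kernel's):

#include <stddef.h>

struct task { struct task *next_run; int counter; int priority; };

/* Stand-in for the kernel's goodness(): higher means run sooner. */
static int goodness(const struct task *t)
{
        return t->counter;
}

static struct task *pick_next(struct task *run_queue, struct task *prev,
                              struct task *idle, int prev_runnable)
{
        struct task *next = idle, *p;
        int c = -1000;

        if (prev_runnable) {            /* previous task is the default */
                c = goodness(prev);
                next = prev;
        }
        for (p = run_queue; p != NULL; p = p->next_run) {
                int weight = goodness(p);
                if (weight > c) {
                        c = weight;
                        next = p;
                }
        }
        if (c == 0)                     /* all slices used up: refresh counters */
                for (p = run_queue; p != NULL; p = p->next_run)
                        p->counter = (p->counter >> 1) + p->priority;
        return next;
}
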
diff --git a/mm/filemap.c b/mm/filemap.c
index 3c15ea63b3ce367ec043993c54e6a4f1acd93c8a..849c2a93cabbb9c0955ed05c0ae7127e3ddb9836 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -19,6 +19,7 @@
 #include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/swapctl.h>
+#include <linux/slab.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -39,6 +40,26 @@ struct page * page_hash_table[PAGE_HASH_SIZE];
 
 #define release_page(page) __free_page((page))
 
+/* 
+ * Define a request structure for outstanding page write requests
+ * to the background page io daemon
+ */
+
+struct pio_request 
+{
+       struct pio_request *    next;
+       struct file *           file;
+       unsigned long           offset;
+       unsigned long           page;
+};
+static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
+static kmem_cache_t *pio_request_cache;
+static struct wait_queue *pio_wait = NULL;
+
+static inline void 
+make_pio_request(struct file *, unsigned long, unsigned long);
+
+
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -1079,8 +1100,9 @@ static inline int do_write_page(struct inode * inode, struct file * file,
 }
 
 static int filemap_write_page(struct vm_area_struct * vma,
-       unsigned long offset,
-       unsigned long page)
+                             unsigned long offset,
+                             unsigned long page,
+                             int wait)
 {
        int result;
        struct file * file;
@@ -1098,6 +1120,17 @@ static int filemap_write_page(struct vm_area_struct * vma,
         * and file could be released ... increment the count to be safe.
         */
        file->f_count++;
+
+       /* 
+        * If this is a swapping operation rather than msync(), then
+        * leave the actual IO, and the restoration of the file count,
+        * to the kpiod thread.  Just queue the request for now.
+        */
+       if (!wait) {
+               make_pio_request(file, offset, page);
+               return 0;
+       }
+       
        down(&inode->i_sem);
        result = do_write_page(inode, file, (const char *) page, offset);
        up(&inode->i_sem);
@@ -1113,7 +1146,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-       return filemap_write_page(vma, page->offset, page_address(page));
+       return filemap_write_page(vma, page->offset, page_address(page), 0);
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1150,7 +1183,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
                        return 0;
                }
        }
-       error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
+       error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
        free_page(page);
        return error;
 }
@@ -1569,3 +1602,120 @@ void put_cached_page(unsigned long addr)
        wake_up(&page->wait);
        __free_page(page);
 }
+
+
+/* Add request for page IO to the queue */
+
+static inline void put_pio_request(struct pio_request *p)
+{
+       *pio_last = p;
+       p->next = NULL;
+       pio_last = &p->next;
+}
+
+/* Take the first page IO request off the queue */
+
+static inline struct pio_request * get_pio_request(void)
+{
+       struct pio_request * p = pio_first;
+       pio_first = p->next;
+       if (!pio_first)
+               pio_last = &pio_first;
+       return p;
+}
+
+/* Make a new page IO request and queue it to the kpiod thread */
+
+static inline void make_pio_request(struct file *file,
+                                   unsigned long offset,
+                                   unsigned long page)
+{
+       struct pio_request *p;
+
+       atomic_inc(&mem_map[MAP_NR(page)].count);
+
+       /* 
+        * We need to allocate without causing any recursive IO in the
+        * current thread's context.  We might currently be swapping out
+        * as a result of an allocation made while holding a critical
+        * filesystem lock.  To avoid deadlock, we *MUST* not reenter
+        * the filesystem in this thread.
+        *
+        * We can wait for kswapd to free memory, or we can try to free
+        * pages without actually performing further IO, without fear of
+        * deadlock.  --sct
+        */
+
+       while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
+               if (try_to_free_pages(__GFP_WAIT))
+                       continue;
+               current->state = TASK_INTERRUPTIBLE;
+               schedule_timeout(HZ/10);
+       }
+       
+       p->file   = file;
+       p->offset = offset;
+       p->page   = page;
+
+       put_pio_request(p);
+       wake_up(&pio_wait);
+}
+
+
+/*
+ * This is the only thread which is allowed to write out filemap pages
+ * while swapping.
+ * 
+ * To avoid deadlock, it is important that we never reenter this thread.
+ * Although recursive memory allocations within this thread may result
+ * in more page swapping, that swapping will always be done by queuing
+ * another IO request to the same thread: we will never actually start
+ * that IO request until we have finished with the current one, and so
+ * we will not deadlock.  
+ */
+
+int kpiod(void * unused)
+{
+       struct wait_queue wait = {current};
+       struct inode * inode;
+       struct dentry * dentry;
+       struct pio_request * p;
+       
+       current->session = 1;
+       current->pgrp = 1;
+       strcpy(current->comm, "kpiod");
+       sigfillset(&current->blocked);
+       init_waitqueue(&pio_wait);
+
+       lock_kernel();
+       
+       pio_request_cache = kmem_cache_create("pio_request", 
+                                             sizeof(struct pio_request),
+                                             0, SLAB_HWCACHE_ALIGN, 
+                                             NULL, NULL);
+       if (!pio_request_cache)
+               panic ("Could not create pio_request slab cache");
+       
+       while (1) {
+               current->state = TASK_INTERRUPTIBLE;
+               add_wait_queue(&pio_wait, &wait);
+               while (!pio_first)
+                       schedule();
+               remove_wait_queue(&pio_wait, &wait);
+               current->state = TASK_RUNNING;
+
+               while (pio_first) {
+                       p = get_pio_request();
+                       dentry = p->file->f_dentry;
+                       inode = dentry->d_inode;
+                       
+                       down(&inode->i_sem);
+                       do_write_page(inode, p->file,
+                                     (const char *) p->page, p->offset);
+                       up(&inode->i_sem);
+                       fput(p->file);
+                       free_page(p->page);
+                       kmem_cache_free(pio_request_cache, p);
+               }
+       }
+}
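The kpiod machinery added above is a single-consumer work queue: make_pio_request()
appends a request through the pio_last tail pointer and wakes the daemon, which drains the
queue and performs the writes in its own context, so the swap-out path never re-enters the
filesystem. A small user-space analogue of that tail-pointer queue, using pthreads (all
names invented for illustration; the mutex and condition variable here replace the
kernel's own synchronization and wait queue):

#include <pthread.h>
#include <stdlib.h>

struct request { struct request *next; long page; };

static struct request *req_first, **req_last = &req_first;
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t req_wait = PTHREAD_COND_INITIALIZER;

/* Producer: append at the tail pointer and wake the worker. */
static void put_request(struct request *p)
{
        pthread_mutex_lock(&req_lock);
        p->next = NULL;
        *req_last = p;
        req_last = &p->next;
        pthread_cond_signal(&req_wait);
        pthread_mutex_unlock(&req_lock);
}

/* Consumer: sleep until work arrives, then drain the queue. */
static void *worker(void *unused)
{
        (void) unused;
        for (;;) {
                struct request *p;

                pthread_mutex_lock(&req_lock);
                while (req_first == NULL)
                        pthread_cond_wait(&req_wait, &req_lock);
                p = req_first;
                req_first = p->next;
                if (req_first == NULL)
                        req_last = &req_first;  /* queue is empty again */
                pthread_mutex_unlock(&req_lock);

                /* ... do the actual page write for p here ... */
                free(p);
        }
        return NULL;
}
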
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 116096153341badcd18421db803da163cd90a517..7dbae4cfc3e5de537af9d7f1f8e769c89203c922 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,7 +202,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 
        do {
                int result;
-               tsk->swap_address = address + PAGE_SIZE;
+               tsk->mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
                if (result)
                        return result;
@@ -274,7 +274,7 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
        /*
         * Go through process' page directory.
         */
-       address = p->swap_address;
+       address = p->mm->swap_address;
 
        /*
         * Find the proper vm-area
@@ -296,8 +296,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
        }
 
        /* We didn't find anything for the process */
-       p->swap_cnt = 0;
-       p->swap_address = 0;
+       p->mm->swap_cnt = 0;
+       p->mm->swap_address = 0;
        return 0;
 }
 
@@ -345,9 +345,9 @@ static int swap_out(unsigned int priority, int gfp_mask)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign)
-                               p->swap_cnt = p->mm->rss;
-                       if (p->swap_cnt > max_cnt) {
-                               max_cnt = p->swap_cnt;
+                               p->mm->swap_cnt = p->mm->rss;
+                       if (p->mm->swap_cnt > max_cnt) {
+                               max_cnt = p->mm->swap_cnt;
                                pbest = p;
                        }
                }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f8990903ee9c5df59d237090b2e3b3e51471f2a6..d21b1065308bf2e16bb2c2235250898c452ac803 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -184,6 +184,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
 
                        for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req;
                             i++, req = req->dl_next) {
+                               if (req->sk)
+                                       continue;
                                pos += 128;
                                if (pos < offset) 
                                        continue;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 660e64c44ffdb1a968d57891172cae8f6a4fbf35..18a058c3189a1213da60206ad97ac0b33984190f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1563,12 +1563,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
        }
 #endif /* CONFIG_FILTER */
 
-       /*
-        *      socket locking is here for SMP purposes as backlog rcv
-        *      is currently called with bh processing disabled.
-        */
-       lock_sock(sk); 
-
        /* 
         * This doesn't check if the socket has enough room for the packet.
         * Either process the packet _without_ queueing it and then free it,
@@ -1579,7 +1573,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
        if (sk->state == TCP_ESTABLISHED) { /* Fast path */
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
-               release_sock(sk);
                return 0; 
        } 
 
@@ -1590,14 +1583,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
                nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk) 
                        goto discard;
-               lock_sock(nsk);
-               release_sock(sk);
+
+               /*
+                * Queue it on the new socket if the new socket is active,
+                * otherwise we just shortcircuit this and continue with
+                * the new socket..
+                */
+               if (atomic_read(&nsk->sock_readers)) {
+                       __skb_queue_tail(&nsk->back_log, skb);
+                       return 0;
+               }
                sk = nsk;
        }
        
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
-       release_sock(sk); 
        return 0;
 
 reset:
@@ -1609,7 +1609,6 @@ discard:
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
-       release_sock(sk);  
        return 0;
 }