Linux 2.2.2-pre2 (tag: 2.2.2pre2)
author     Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:18:11 +0000 (15:18 -0500)
committer  Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:18:11 +0000 (15:18 -0500)
This one contains various small documentation updates and updates to xconfig,
but the important parts (and the smallest part of the actual patch) are:

 - shared file lockup fix by Stephen Tweedie
 - my fix for the TCP bug that Ingo found
 - Ingo's io-apic setup fixes, which should finally get rid of the
   spurious apic interrupts with some motherboards and the ExtINT setup.
 - inode leak thing
 - SMP scheduler potential race condition fix
 - sound driver updates
 - partition and disk fixes (2kB blocksize media and some IDE disk
   geometry and irq detection issues).

None of the fixes are critical to most people, but all of them _can_ be
critical to people who have seen problems in the area. As such, if
you're happy with 2.2.1 there is no pressing reason to test this patch
out, but I hope people will test the pre-patches so that the final 2.2.2
can be left around for a while (CD-ROM manufacturers etc. would certainly
prefer not to see lots of releases).

                Linus

arch/i386/kernel/io_apic.c
drivers/char/vt.c
fs/inode.c
include/linux/sched.h
init/main.c
kernel/ksyms.c
kernel/sched.c
mm/filemap.c
mm/vmscan.c
net/ipv4/proc.c
net/ipv4/tcp_ipv4.c

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 0a24161410879e2ddd0c00003662b8add36df61e..b57259afc1d8a5289b47f38126b855348f36de1c 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -202,7 +202,7 @@ DO_ACTION( enable,  1, |= 0xff000000, )                             /* destination = 0xff */
 DO_ACTION( mask,    0, |= 0x00010000, io_apic_sync())          /* mask = 1 */
 DO_ACTION( unmask,  0, &= 0xfffeffff, )                                /* mask = 0 */
 
-static void __init clear_IO_APIC_pin(unsigned int pin)
+static void clear_IO_APIC_pin(unsigned int pin)
 {
        struct IO_APIC_route_entry entry;
 
@@ -215,6 +215,13 @@ static void __init clear_IO_APIC_pin(unsigned int pin)
        io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1));
 }
 
+static void clear_IO_APIC (void)
+{
+       int pin;
+
+       for (pin = 0; pin < nr_ioapic_registers; pin++)
+               clear_IO_APIC_pin(pin);
+}
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -625,7 +632,7 @@ void __init setup_IO_APIC_irqs(void)
 /*
  * Set up a certain pin as ExtINT delivered interrupt
  */
-void __init setup_ExtINT_pin(unsigned int pin)
+void __init setup_ExtINT_pin(unsigned int pin, int irq)
 {
        struct IO_APIC_route_entry entry;
 
@@ -635,17 +642,16 @@ void __init setup_ExtINT_pin(unsigned int pin)
        memset(&entry,0,sizeof(entry));
 
        entry.delivery_mode = dest_ExtINT;
-       entry.dest_mode = 1;                            /* logical delivery */
+       entry.dest_mode = 0;                            /* physical delivery */
        entry.mask = 0;                                 /* unmask IRQ now */
        /*
-        * Careful with this one. We do not use 'true' logical
-        * delivery, as we set local APICs to LDR == 0. But
-        * 0xff logical destination is special (broadcast).
-        * Any other combination will cause problems.
+        * We use physical delivery to get the timer IRQ
+        * to the boot CPU. 'boot_cpu_id' is the physical
+        * APIC ID of the boot CPU.
         */
-       entry.dest.logical.logical_dest = 0xff;
+       entry.dest.physical.physical_dest = boot_cpu_id;
 
-       entry.vector = 0;                               /* it's ignored */
+       entry.vector = assign_irq_vector(irq);
 
        entry.polarity = 0;
        entry.trigger = 0;
@@ -760,7 +766,7 @@ void __init print_IO_APIC(void)
 
 static void __init init_sym_mode(void)
 {
-       int i, pin;
+       int i;
 
        for (i = 0; i < PIN_MAP_SIZE; i++) {
                irq_2_pin[i].pin = -1;
@@ -790,8 +796,7 @@ static void __init init_sym_mode(void)
        /*
         * Do not trust the IO-APIC being empty at bootup
         */
-       for (pin = 0; pin < nr_ioapic_registers; pin++)
-               clear_IO_APIC_pin(pin);
+       clear_IO_APIC();
 }
 
 /*
@@ -799,6 +804,15 @@ static void __init init_sym_mode(void)
  */
 void init_pic_mode(void)
 {
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       /*
+        * Put it back into PIC mode (has an effect only on
+        * certain boards)
+        */
        printk("disabling symmetric IO mode... ");
                outb_p(0x70, 0x22);
                outb_p(0x00, 0x23);
@@ -1184,7 +1198,7 @@ static inline void check_timer(void)
 
                if (pin2 != -1) {
                        printk(".. (found pin %d) ...", pin2);
-                       setup_ExtINT_pin(pin2);
+                       setup_ExtINT_pin(pin2, 0);
                        make_8259A_irq(0);
                }
 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 6830089f96c2c2236cebe98ce240a45b1ef355b2..97be390b03a1ed60cdb8cbb08480d03783f85ad2 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -165,7 +165,7 @@ do_kdsk_ioctl(int cmd, struct kbentry *user_kbe, int perm, struct kbd_struct *kb
                        val = K_HOLE;
                } else
                    val = (i ? K_HOLE : K_NOSUCHMAP);
-               return __put_user(val, &user_kbe->kb_value);
+               return put_user(val, &user_kbe->kb_value);
        case KDSKBENT:
                if (!perm)
                        return -EPERM;
@@ -244,7 +244,7 @@ do_kbkeycode_ioctl(int cmd, struct kbkeycode *user_kbkc, int perm)
        case KDGETKEYCODE:
                kc = getkeycode(tmp.scancode);
                if (kc >= 0)
-                       kc = __put_user(kc, &user_kbkc->keycode);
+                       kc = put_user(kc, &user_kbkc->keycode);
                break;
        case KDSETKEYCODE:
                if (!perm)
@@ -282,8 +282,8 @@ do_kdgkb_ioctl(int cmd, struct kbsentry *user_kdgkb, int perm)
                p = func_table[i];
                if(p)
                        for ( ; *p && sz; p++, sz--)
-                               __put_user(*p, q++);
-               __put_user('\0', q);
+                               put_user(*p, q++);
+               put_user('\0', q);
                return ((p && *p) ? -EOVERFLOW : 0);
        case KDSKBSENT:
                if (!perm)
@@ -603,12 +603,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
        {
                struct kbdiacrs *a = (struct kbdiacrs *)arg;
 
-               i = verify_area(VERIFY_WRITE, (void *) a, sizeof(struct kbdiacrs));
-               if (i)
-                       return i;
-               __put_user(accent_table_size, &a->kb_cnt);
-               __copy_to_user(a->kbdiacr, accent_table,
-                           accent_table_size*sizeof(struct kbdiacr));
+               if (put_user(accent_table_size, &a->kb_cnt))
+                       return -EFAULT;
+               if (copy_to_user(a->kbdiacr, accent_table, accent_table_size*sizeof(struct kbdiacr)))
+                       return -EFAULT;
                return 0;
        }
 
@@ -619,14 +617,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 
                if (!perm)
                        return -EPERM;
-               i = verify_area(VERIFY_READ, (void *) a, sizeof(struct kbdiacrs));
-               if (i)
-                       return i;
-               __get_user(ct,&a->kb_cnt);
+               if (get_user(ct,&a->kb_cnt))
+                       return -EFAULT;
                if (ct >= MAX_DIACR)
                        return -EINVAL;
                accent_table_size = ct;
-               __copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr));
+               if (copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr)))
+                       return -EFAULT;
                return 0;
        }
 
@@ -717,12 +714,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_WRITE,(void *)vtstat, sizeof(struct vt_stat));
                if (i)
                        return i;
-               __put_user(fg_console + 1, &vtstat->v_active);
+               put_user(fg_console + 1, &vtstat->v_active);
                state = 1;      /* /dev/tty0 is always open */
                for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1)
                        if (VT_IS_IN_USE(i))
                                state |= mask;
-               return __put_user(state, &vtstat->v_state);
+               return put_user(state, &vtstat->v_state);
        }
 
        /*
@@ -856,8 +853,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_READ, (void *)vtsizes, sizeof(struct vt_sizes));
                if (i)
                        return i;
-               __get_user(ll, &vtsizes->v_rows);
-               __get_user(cc, &vtsizes->v_cols);
+               get_user(ll, &vtsizes->v_rows);
+               get_user(cc, &vtsizes->v_cols);
                return vc_resize_all(ll, cc);
        }
 
@@ -870,12 +867,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                i = verify_area(VERIFY_READ, (void *)vtconsize, sizeof(struct vt_consize));
                if (i)
                        return i;
-               __get_user(ll, &vtconsize->v_rows);
-               __get_user(cc, &vtconsize->v_cols);
-               __get_user(vlin, &vtconsize->v_vlin);
-               __get_user(clin, &vtconsize->v_clin);
-               __get_user(vcol, &vtconsize->v_vcol);
-               __get_user(ccol, &vtconsize->v_ccol);
+               get_user(ll, &vtconsize->v_rows);
+               get_user(cc, &vtconsize->v_cols);
+               get_user(vlin, &vtconsize->v_vlin);
+               get_user(clin, &vtconsize->v_clin);
+               get_user(vcol, &vtconsize->v_vcol);
+               get_user(ccol, &vtconsize->v_ccol);
                vlin = vlin ? vlin : video_scan_lines;
                if ( clin )
                  {
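The vt.c changes above replace the unchecked __put_user()/__get_user()/__copy_to_user()
calls (which assume a prior verify_area()) with the checking put_user()/get_user()/
copy_to_user() variants, which validate the user pointer themselves and report faults.
A minimal sketch of the resulting idiom follows; the helper name and its arguments are
invented for illustration and are not part of the patch:

#include <linux/errno.h>
#include <asm/uaccess.h>        /* put_user() */

/*
 * Hypothetical ioctl helper: put_user() verifies the user pointer
 * itself and returns non-zero on a fault, so the old
 * verify_area() + __put_user() pair collapses into one checked call.
 */
static int report_state(int *uptr, int state)
{
        if (put_user(state, uptr))
                return -EFAULT;
        return 0;
}
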
diff --git a/fs/inode.c b/fs/inode.c
index 72a23f8584fa4765ca35c95a8656312c63916f41..347e88d37d0143dd05b9863354de1a5dc8267a13 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -232,13 +232,15 @@ void clear_inode(struct inode *inode)
 
 /*
  * Dispose-list gets a local list, so it doesn't need to
- * worry about list corruption.
+ * worry about list corruption. It releases the inode lock
+ * while clearing the inodes.
  */
 static void dispose_list(struct list_head * head)
 {
        struct list_head *next;
        int count = 0;
 
+       spin_unlock(&inode_lock);
        next = head->next;
        for (;;) {
                struct list_head * tmp = next;
@@ -256,7 +258,6 @@ static void dispose_list(struct list_head * head)
        spin_lock(&inode_lock);
        list_splice(head, &inode_unused);
        inodes_stat.nr_free_inodes += count;
-       spin_unlock(&inode_lock);
 }
 
 /*
@@ -305,52 +306,52 @@ int invalidate_inodes(struct super_block * sb)
        spin_lock(&inode_lock);
        busy = invalidate_list(&inode_in_use, sb, &throw_away);
        busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-       spin_unlock(&inode_lock);
-
        dispose_list(&throw_away);
+       spin_unlock(&inode_lock);
 
        return busy;
 }
 
 /*
  * This is called with the inode lock held. It searches
- * the in-use for the specified number of freeable inodes.
- * Freeable inodes are moved to a temporary list and then
- * placed on the unused list by dispose_list.
+ * the in-use for freeable inodes, which are moved to a
+ * temporary list and then placed on the unused list by
+ * dispose_list. 
+ *
+ * We don't expect to have to call this very often.
  *
- * Note that we do not expect to have to search very hard:
- * the freeable inodes will be at the old end of the list.
- * 
- * N.B. The spinlock is released to call dispose_list.
+ * N.B. The spinlock is released during the call to
+ *      dispose_list.
  */
 #define CAN_UNUSE(inode) \
-       (((inode)->i_count == 0) && \
-        (!(inode)->i_state))
+       (((inode)->i_count | (inode)->i_state) == 0)
+#define INODE(entry)   (list_entry(entry, struct inode, i_list))
 
-static int free_inodes(int goal)
+static int free_inodes(void)
 {
-       struct list_head *tmp, *head = &inode_in_use;
-       LIST_HEAD(freeable);
-       int found = 0, depth = goal << 1;
+       struct list_head list, *entry, *freeable = &list;
+       int found = 0;
 
-       while ((tmp = head->prev) != head && depth--) {
-               struct inode * inode = list_entry(tmp, struct inode, i_list);
+       INIT_LIST_HEAD(freeable);
+       entry = inode_in_use.next;
+       while (entry != &inode_in_use) {
+               struct list_head *tmp = entry;
+
+               entry = entry->next;
+               if (!CAN_UNUSE(INODE(tmp)))
+                       continue;
                list_del(tmp);
-               if (CAN_UNUSE(inode)) {
-                       list_del(&inode->i_hash);
-                       INIT_LIST_HEAD(&inode->i_hash);
-                       list_add(tmp, &freeable);
-                       if (++found < goal)
-                               continue;
-                       break;
-               }
-               list_add(tmp, head);
+               list_del(&INODE(tmp)->i_hash);
+               INIT_LIST_HEAD(&INODE(tmp)->i_hash);
+               list_add(tmp, freeable);
+               found = 1;
        }
+
        if (found) {
-               spin_unlock(&inode_lock);
-               dispose_list(&freeable);
-               spin_lock(&inode_lock);
+               dispose_list(freeable);
+               found = 1;      /* silly compiler */
        }
+
        return found;
 }
 
@@ -374,7 +375,7 @@ static void shrink_dentry_inodes(int goal)
 static void try_to_free_inodes(int goal)
 {
        shrink_dentry_inodes(goal);
-       if (!free_inodes(goal))
+       if (!free_inodes())
                shrink_dentry_inodes(goal);
 }
 
@@ -385,7 +386,7 @@ static void try_to_free_inodes(int goal)
 void free_inode_memory(int goal)
 {
        spin_lock(&inode_lock);
-       free_inodes(goal);
+       free_inodes();
        spin_unlock(&inode_lock);
 }
 
@@ -450,7 +451,7 @@ static struct inode * grow_inodes(void)
        inodes_stat.preshrink = 1;
 
        spin_lock(&inode_lock);
-       free_inodes(inodes_stat.nr_inodes >> 2);
+       free_inodes();
        {
                struct list_head *tmp = inode_unused.next;
                if (tmp != &inode_unused) {
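The inode.c changes above move the lock handling into dispose_list(): callers now gather
freeable inodes onto a private list while holding inode_lock, and dispose_list() itself
releases the lock while clearing the inodes and retakes it to splice them onto the unused
list. The general shape of that "collect under the lock, free outside it" pattern, as a
small user-space analogue using pthreads (all names invented, not kernel code):

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; int busy; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_list;

/*
 * Unlink every freeable node while the lock is held, then drop the
 * lock for the (potentially slow) freeing work.  Nothing else can see
 * the private list, so it needs no locking of its own.
 */
static void reap(void)
{
        struct node *freeable = NULL, **pp, *n;

        pthread_mutex_lock(&list_lock);
        pp = &shared_list;
        while ((n = *pp) != NULL) {
                if (n->busy) {
                        pp = &n->next;
                        continue;
                }
                *pp = n->next;          /* unlink from the shared list */
                n->next = freeable;     /* move to the private list */
                freeable = n;
        }
        pthread_mutex_unlock(&list_lock);

        while ((n = freeable) != NULL) {
                freeable = n->next;
                free(n);                /* slow work, done unlocked */
        }
}
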
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ebb9bc2765630cea885979e12504c8399cbb282b..9b97235c83d7da675ffc440cf3a88c456710270d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -174,6 +174,8 @@ struct mm_struct {
        unsigned long rss, total_vm, locked_vm;
        unsigned long def_flags;
        unsigned long cpu_vm_mask;
+       unsigned long swap_cnt; /* number of pages to swap on next pass */
+       unsigned long swap_address;
        /*
         * This is an architecture-specific pointer: the portable
         * part of Linux does not know about any segments.
@@ -191,7 +193,7 @@ struct mm_struct {
                0, 0, 0,                                \
                0, 0, 0, 0,                             \
                0, 0, 0,                                \
-               0, 0, NULL }
+               0, 0, 0, 0, NULL }
 
 struct signal_struct {
        atomic_t                count;
@@ -276,8 +278,6 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
        unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
        int swappable:1;
-       unsigned long swap_address;
-       unsigned long swap_cnt;         /* number of pages to swap on next pass */
 /* process credentials */
        uid_t uid,euid,suid,fsuid;
        gid_t gid,egid,sgid,fsgid;
@@ -361,7 +361,7 @@ struct task_struct {
 /* utime */    {0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */      0,0,0,0,0,0, \
-/* swp */      0,0,0, \
+/* swp */      0, \
 /* process credentials */                                      \
 /* uid etc */  0,0,0,0,0,0,0,0,                                \
 /* suppl grps*/ 0, {0,},                                       \
diff --git a/init/main.c b/init/main.c
index aea2ca978463a078a11197e165d7a484e35f6329..9b37f328ecc1fbb917e0a4f947ca380648403b86 100644
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@ extern int console_loglevel;
 static int init(void *);
 extern int bdflush(void *);
 extern int kswapd(void *);
+extern int kpiod(void *);
 extern void kswapd_setup(void);
 
 extern void init_IRQ(void);
@@ -1271,6 +1272,7 @@ static void __init do_basic_setup(void)
        kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        /* Start the background pageout daemon. */
        kswapd_setup();
+       kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 84c345d8285502f367e7d629203e7e36a227e2b7..492433cde76c76e5259a5e57d3d9296eaa7a4d8b 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(update_vm_cache);
 EXPORT_SYMBOL(vmtruncate);
 EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(in_group_p);
diff --git a/kernel/sched.c b/kernel/sched.c
index add76fbe0bf6f6ca4a74447469552962c6b271bc..513ef16f92e855a557bec2bfdc08a3a59e2687c3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -680,8 +680,18 @@ asmlinkage void schedule(void)
 
        sched_data->prevstate = prev->state;
 
+/* this is the scheduler proper: */
        {
                struct task_struct * p = init_task.next_run;
+               int c = -1000;
+
+               /* Default process to select.. */
+               next = idle_task;
+               if (prev->state == TASK_RUNNING) {
+                       c = goodness(prev, prev, this_cpu);
+                       next = prev;
+               }
+
                /*
                 * This is subtle.
                 * Note how we can enable interrupts here, even
@@ -693,36 +703,27 @@ asmlinkage void schedule(void)
                 * the scheduler lock
                 */
                spin_unlock_irq(&runqueue_lock);
-#ifdef __SMP__
-               prev->has_cpu = 0;
-#endif
-       
 /*
  * Note! there may appear new tasks on the run-queue during this, as
  * interrupts are enabled. However, they will be put on front of the
  * list, so our list starting at "p" is essentially fixed.
  */
-/* this is the scheduler proper: */
-               {
-                       int c = -1000;
-                       next = idle_task;
-                       while (p != &init_task) {
-                               if (can_schedule(p)) {
-                                       int weight = goodness(p, prev, this_cpu);
-                                       if (weight > c)
-                                               c = weight, next = p;
-                               }
-                               p = p->next_run;
+               while (p != &init_task) {
+                       if (can_schedule(p)) {
+                               int weight = goodness(p, prev, this_cpu);
+                               if (weight > c)
+                                       c = weight, next = p;
                        }
+                       p = p->next_run;
+               }
 
-                       /* Do we need to re-calculate counters? */
-                       if (!c) {
-                               struct task_struct *p;
-                               read_lock(&tasklist_lock);
-                               for_each_task(p)
-                                       p->counter = (p->counter >> 1) + p->priority;
-                               read_unlock(&tasklist_lock);
-                       }
+               /* Do we need to re-calculate counters? */
+               if (!c) {
+                       struct task_struct *p;
+                       read_lock(&tasklist_lock);
+                       for_each_task(p)
+                               p->counter = (p->counter >> 1) + p->priority;
+                       read_unlock(&tasklist_lock);
                }
        }
 
@@ -751,10 +752,8 @@ asmlinkage void schedule(void)
         * thus we have to lock the previous process from getting
         * rescheduled during switch_to().
         */
-       prev->has_cpu = 1;
-
-       next->has_cpu = 1;
        next->processor = this_cpu;
+       next->has_cpu = 1;
        spin_unlock(&scheduler_lock);
 #endif /* __SMP__ */
        if (prev != next) {
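The restructured schedule() above folds the previous task into the selection up front: if
it is still runnable it becomes the default candidate with its goodness() precomputed, the
run queue is then scanned for anything with a higher weight, and if every runnable task has
a weight of zero the counters are refreshed. The selection itself is a plain arg-max scan;
here is a compact user-space sketch of that shape (goodness() and the task list are
stand-ins, not the kernel's):

#include <stddef.h>

struct task { struct task *next_run; int counter; int priority; };

/* Stand-in for the kernel's goodness(): higher means run sooner. */
static int goodness(const struct task *t)
{
        return t->counter;
}

static struct task *pick_next(struct task *run_queue, struct task *prev,
                              struct task *idle, int prev_runnable)
{
        struct task *next = idle, *p;
        int c = -1000;

        if (prev_runnable) {            /* previous task is the default */
                c = goodness(prev);
                next = prev;
        }
        for (p = run_queue; p != NULL; p = p->next_run) {
                int weight = goodness(p);
                if (weight > c) {
                        c = weight;
                        next = p;
                }
        }
        if (c == 0)                     /* all slices used up: refresh counters */
                for (p = run_queue; p != NULL; p = p->next_run)
                        p->counter = (p->counter >> 1) + p->priority;
        return next;
}
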
diff --git a/mm/filemap.c b/mm/filemap.c
index 3c15ea63b3ce367ec043993c54e6a4f1acd93c8a..849c2a93cabbb9c0955ed05c0ae7127e3ddb9836 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -19,6 +19,7 @@
 #include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/swapctl.h>
+#include <linux/slab.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -39,6 +40,26 @@ struct page * page_hash_table[PAGE_HASH_SIZE];
 
 #define release_page(page) __free_page((page))
 
+/* 
+ * Define a request structure for outstanding page write requests
+ * to the background page io daemon
+ */
+
+struct pio_request 
+{
+       struct pio_request *    next;
+       struct file *           file;
+       unsigned long           offset;
+       unsigned long           page;
+};
+static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
+static kmem_cache_t *pio_request_cache;
+static struct wait_queue *pio_wait = NULL;
+
+static inline void 
+make_pio_request(struct file *, unsigned long, unsigned long);
+
+
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -1079,8 +1100,9 @@ static inline int do_write_page(struct inode * inode, struct file * file,
 }
 
 static int filemap_write_page(struct vm_area_struct * vma,
-       unsigned long offset,
-       unsigned long page)
+                             unsigned long offset,
+                             unsigned long page,
+                             int wait)
 {
        int result;
        struct file * file;
@@ -1098,6 +1120,17 @@ static int filemap_write_page(struct vm_area_struct * vma,
         * and file could be released ... increment the count to be safe.
         */
        file->f_count++;
+
+       /* 
+        * If this is a swapping operation rather than msync(), then
+        * leave the actual IO, and the restoration of the file count,
+        * to the kpiod thread.  Just queue the request for now.
+        */
+       if (!wait) {
+               make_pio_request(file, offset, page);
+               return 0;
+       }
+       
        down(&inode->i_sem);
        result = do_write_page(inode, file, (const char *) page, offset);
        up(&inode->i_sem);
@@ -1113,7 +1146,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-       return filemap_write_page(vma, page->offset, page_address(page));
+       return filemap_write_page(vma, page->offset, page_address(page), 0);
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1150,7 +1183,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
                        return 0;
                }
        }
-       error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
+       error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
        free_page(page);
        return error;
 }
@@ -1569,3 +1602,120 @@ void put_cached_page(unsigned long addr)
        wake_up(&page->wait);
        __free_page(page);
 }
+
+
+/* Add request for page IO to the queue */
+
+static inline void put_pio_request(struct pio_request *p)
+{
+       *pio_last = p;
+       p->next = NULL;
+       pio_last = &p->next;
+}
+
+/* Take the first page IO request off the queue */
+
+static inline struct pio_request * get_pio_request(void)
+{
+       struct pio_request * p = pio_first;
+       pio_first = p->next;
+       if (!pio_first)
+               pio_last = &pio_first;
+       return p;
+}
+
+/* Make a new page IO request and queue it to the kpiod thread */
+
+static inline void make_pio_request(struct file *file,
+                                   unsigned long offset,
+                                   unsigned long page)
+{
+       struct pio_request *p;
+
+       atomic_inc(&mem_map[MAP_NR(page)].count);
+
+       /* 
+        * We need to allocate without causing any recursive IO in the
+        * current thread's context.  We might currently be swapping out
+        * as a result of an allocation made while holding a critical
+        * filesystem lock.  To avoid deadlock, we *MUST* not reenter
+        * the filesystem in this thread.
+        *
+        * We can wait for kswapd to free memory, or we can try to free
+        * pages without actually performing further IO, without fear of
+        * deadlock.  --sct
+        */
+
+       while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
+               if (try_to_free_pages(__GFP_WAIT))
+                       continue;
+               current->state = TASK_INTERRUPTIBLE;
+               schedule_timeout(HZ/10);
+       }
+       
+       p->file   = file;
+       p->offset = offset;
+       p->page   = page;
+
+       put_pio_request(p);
+       wake_up(&pio_wait);
+}
+
+
+/*
+ * This is the only thread which is allowed to write out filemap pages
+ * while swapping.
+ * 
+ * To avoid deadlock, it is important that we never reenter this thread.
+ * Although recursive memory allocations within this thread may result
+ * in more page swapping, that swapping will always be done by queuing
+ * another IO request to the same thread: we will never actually start
+ * that IO request until we have finished with the current one, and so
+ * we will not deadlock.  
+ */
+
+int kpiod(void * unused)
+{
+       struct wait_queue wait = {current};
+       struct inode * inode;
+       struct dentry * dentry;
+       struct pio_request * p;
+       
+       current->session = 1;
+       current->pgrp = 1;
+       strcpy(current->comm, "kpiod");
+       sigfillset(&current->blocked);
+       init_waitqueue(&pio_wait);
+
+       lock_kernel();
+       
+       pio_request_cache = kmem_cache_create("pio_request", 
+                                             sizeof(struct pio_request),
+                                             0, SLAB_HWCACHE_ALIGN, 
+                                             NULL, NULL);
+       if (!pio_request_cache)
+               panic ("Could not create pio_request slab cache");
+       
+       while (1) {
+               current->state = TASK_INTERRUPTIBLE;
+               add_wait_queue(&pio_wait, &wait);
+               while (!pio_first)
+                       schedule();
+               remove_wait_queue(&pio_wait, &wait);
+               current->state = TASK_RUNNING;
+
+               while (pio_first) {
+                       p = get_pio_request();
+                       dentry = p->file->f_dentry;
+                       inode = dentry->d_inode;
+                       
+                       down(&inode->i_sem);
+                       do_write_page(inode, p->file,
+                                     (const char *) p->page, p->offset);
+                       up(&inode->i_sem);
+                       fput(p->file);
+                       free_page(p->page);
+                       kmem_cache_free(pio_request_cache, p);
+               }
+       }
+}
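The kpiod machinery added above is a single-consumer work queue: make_pio_request()
appends a request through the pio_last tail pointer and wakes the daemon, which drains the
queue and performs the writes in its own context, so the swap-out path never re-enters the
filesystem. A small user-space analogue of that tail-pointer queue, using pthreads (all
names invented for illustration; the mutex and condition variable here replace the
kernel's own synchronization and wait queue):

#include <pthread.h>
#include <stdlib.h>

struct request { struct request *next; long page; };

static struct request *req_first, **req_last = &req_first;
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t req_wait = PTHREAD_COND_INITIALIZER;

/* Producer: append at the tail pointer and wake the worker. */
static void put_request(struct request *p)
{
        pthread_mutex_lock(&req_lock);
        p->next = NULL;
        *req_last = p;
        req_last = &p->next;
        pthread_cond_signal(&req_wait);
        pthread_mutex_unlock(&req_lock);
}

/* Consumer: sleep until work arrives, then drain the queue. */
static void *worker(void *unused)
{
        (void) unused;
        for (;;) {
                struct request *p;

                pthread_mutex_lock(&req_lock);
                while (req_first == NULL)
                        pthread_cond_wait(&req_wait, &req_lock);
                p = req_first;
                req_first = p->next;
                if (req_first == NULL)
                        req_last = &req_first;  /* queue is empty again */
                pthread_mutex_unlock(&req_lock);

                /* ... do the actual page write for p here ... */
                free(p);
        }
        return NULL;
}
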
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 116096153341badcd18421db803da163cd90a517..7dbae4cfc3e5de537af9d7f1f8e769c89203c922 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,7 +202,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 
        do {
                int result;
-               tsk->swap_address = address + PAGE_SIZE;
+               tsk->mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
                if (result)
                        return result;
@@ -274,7 +274,7 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
        /*
         * Go through process' page directory.
         */
-       address = p->swap_address;
+       address = p->mm->swap_address;
 
        /*
         * Find the proper vm-area
@@ -296,8 +296,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
        }
 
        /* We didn't find anything for the process */
-       p->swap_cnt = 0;
-       p->swap_address = 0;
+       p->mm->swap_cnt = 0;
+       p->mm->swap_address = 0;
        return 0;
 }
 
@@ -345,9 +345,9 @@ static int swap_out(unsigned int priority, int gfp_mask)
                                continue;
                        /* Refresh swap_cnt? */
                        if (assign)
-                               p->swap_cnt = p->mm->rss;
-                       if (p->swap_cnt > max_cnt) {
-                               max_cnt = p->swap_cnt;
+                               p->mm->swap_cnt = p->mm->rss;
+                       if (p->mm->swap_cnt > max_cnt) {
+                               max_cnt = p->mm->swap_cnt;
                                pbest = p;
                        }
                }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f8990903ee9c5df59d237090b2e3b3e51471f2a6..d21b1065308bf2e16bb2c2235250898c452ac803 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -184,6 +184,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
 
                        for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req;
                             i++, req = req->dl_next) {
+                               if (req->sk)
+                                       continue;
                                pos += 128;
                                if (pos < offset) 
                                        continue;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 660e64c44ffdb1a968d57891172cae8f6a4fbf35..18a058c3189a1213da60206ad97ac0b33984190f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1563,12 +1563,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
        }
 #endif /* CONFIG_FILTER */
 
-       /*
-        *      socket locking is here for SMP purposes as backlog rcv
-        *      is currently called with bh processing disabled.
-        */
-       lock_sock(sk); 
-
        /* 
         * This doesn't check if the socket has enough room for the packet.
         * Either process the packet _without_ queueing it and then free it,
@@ -1579,7 +1573,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
        if (sk->state == TCP_ESTABLISHED) { /* Fast path */
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
-               release_sock(sk);
                return 0; 
        } 
 
@@ -1590,14 +1583,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
                nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk) 
                        goto discard;
-               lock_sock(nsk);
-               release_sock(sk);
+
+               /*
+                * Queue it on the new socket if the new socket is active,
+                * otherwise we just shortcircuit this and continue with
+                * the new socket..
+                */
+               if (atomic_read(&nsk->sock_readers)) {
+                       __skb_queue_tail(&nsk->back_log, skb);
+                       return 0;
+               }
                sk = nsk;
        }
        
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
-       release_sock(sk); 
        return 0;
 
 reset:
@@ -1609,7 +1609,6 @@ discard:
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
-       release_sock(sk);  
        return 0;
 }