From 616d8602876cdb9fe13e39179a696b5a69f393a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:18:11 -0500 Subject: [PATCH] Linux 2.2.2-pre2 this one contains various small documentation updates and updates to xconfig, but the important parts (and the smallest part of the actual patch) are: - shared file lockup fix by Stephen Tweedie - my fix for the TCP bug that Ingo found - Ingo's io-apic setup fixes, which should finally get rid of the spurious apic interrupts with some motherboards and the ExtINT setup. - inode leak thing - SMP scheduler potential race condition fix - sound driver updates - partition and disk fixes (2kB blocksize media and some IDE disk geometry and irq detection issues). None of the fixes are critical to most people, but all of them _can_ be critical to people who have seen vulnerabilities in the area. As such, if you're happy with 2.2.1 there is no pressing reason to test this patch out, but I hope to have the pre-patches so that the final 2.2.2 can be left around for a while (CD-ROM manufacturers etc would certainly prefer to not see lots of releases). 
Linus --- arch/i386/kernel/io_apic.c | 40 +++++++--- drivers/char/vt.c | 47 ++++++----- fs/inode.c | 69 ++++++++-------- include/linux/sched.h | 8 +- init/main.c | 2 + kernel/ksyms.c | 1 + kernel/sched.c | 51 ++++++------ mm/filemap.c | 158 ++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 14 ++-- net/ipv4/proc.c | 2 + net/ipv4/tcp_ipv4.c | 21 +++-- 11 files changed, 289 insertions(+), 124 deletions(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 0a2416141087..b57259afc1d8 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -202,7 +202,7 @@ DO_ACTION( enable, 1, |= 0xff000000, ) /* destination = 0xff */ DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync()) /* mask = 1 */ DO_ACTION( unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ -static void __init clear_IO_APIC_pin(unsigned int pin) +static void clear_IO_APIC_pin(unsigned int pin) { struct IO_APIC_route_entry entry; @@ -215,6 +215,13 @@ static void __init clear_IO_APIC_pin(unsigned int pin) io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1)); } +static void clear_IO_APIC (void) +{ + int pin; + + for (pin = 0; pin < nr_ioapic_registers; pin++) + clear_IO_APIC_pin(pin); +} /* * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to @@ -625,7 +632,7 @@ void __init setup_IO_APIC_irqs(void) /* * Set up a certain pin as ExtINT delivered interrupt */ -void __init setup_ExtINT_pin(unsigned int pin) +void __init setup_ExtINT_pin(unsigned int pin, int irq) { struct IO_APIC_route_entry entry; @@ -635,17 +642,16 @@ void __init setup_ExtINT_pin(unsigned int pin) memset(&entry,0,sizeof(entry)); entry.delivery_mode = dest_ExtINT; - entry.dest_mode = 1; /* logical delivery */ + entry.dest_mode = 0; /* physical delivery */ entry.mask = 0; /* unmask IRQ now */ /* - * Careful with this one. We do not use 'true' logical - * delivery, as we set local APICs to LDR == 0. But - * 0xff logical destination is special (broadcast). - * Any other combination will cause problems. 
+ * We use physical delivery to get the timer IRQ + * to the boot CPU. 'boot_cpu_id' is the physical + * APIC ID of the boot CPU. */ - entry.dest.logical.logical_dest = 0xff; + entry.dest.physical.physical_dest = boot_cpu_id; - entry.vector = 0; /* it's ignored */ + entry.vector = assign_irq_vector(irq); entry.polarity = 0; entry.trigger = 0; @@ -760,7 +766,7 @@ void __init print_IO_APIC(void) static void __init init_sym_mode(void) { - int i, pin; + int i; for (i = 0; i < PIN_MAP_SIZE; i++) { irq_2_pin[i].pin = -1; @@ -790,8 +796,7 @@ static void __init init_sym_mode(void) /* * Do not trust the IO-APIC being empty at bootup */ - for (pin = 0; pin < nr_ioapic_registers; pin++) - clear_IO_APIC_pin(pin); + clear_IO_APIC(); } /* @@ -799,6 +804,15 @@ static void __init init_sym_mode(void) */ void init_pic_mode(void) { + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * Put it back into PIC mode (has an effect only on + * certain boards) + */ printk("disabling symmetric IO mode... "); outb_p(0x70, 0x22); outb_p(0x00, 0x23); @@ -1184,7 +1198,7 @@ static inline void check_timer(void) if (pin2 != -1) { printk(".. (found pin %d) ...", pin2); - setup_ExtINT_pin(pin2); + setup_ExtINT_pin(pin2, 0); make_8259A_irq(0); } diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 6830089f96c2..97be390b03a1 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -165,7 +165,7 @@ do_kdsk_ioctl(int cmd, struct kbentry *user_kbe, int perm, struct kbd_struct *kb val = K_HOLE; } else val = (i ? 
K_HOLE : K_NOSUCHMAP); - return __put_user(val, &user_kbe->kb_value); + return put_user(val, &user_kbe->kb_value); case KDSKBENT: if (!perm) return -EPERM; @@ -244,7 +244,7 @@ do_kbkeycode_ioctl(int cmd, struct kbkeycode *user_kbkc, int perm) case KDGETKEYCODE: kc = getkeycode(tmp.scancode); if (kc >= 0) - kc = __put_user(kc, &user_kbkc->keycode); + kc = put_user(kc, &user_kbkc->keycode); break; case KDSETKEYCODE: if (!perm) @@ -282,8 +282,8 @@ do_kdgkb_ioctl(int cmd, struct kbsentry *user_kdgkb, int perm) p = func_table[i]; if(p) for ( ; *p && sz; p++, sz--) - __put_user(*p, q++); - __put_user('\0', q); + put_user(*p, q++); + put_user('\0', q); return ((p && *p) ? -EOVERFLOW : 0); case KDSKBSENT: if (!perm) @@ -603,12 +603,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, { struct kbdiacrs *a = (struct kbdiacrs *)arg; - i = verify_area(VERIFY_WRITE, (void *) a, sizeof(struct kbdiacrs)); - if (i) - return i; - __put_user(accent_table_size, &a->kb_cnt); - __copy_to_user(a->kbdiacr, accent_table, - accent_table_size*sizeof(struct kbdiacr)); + if (put_user(accent_table_size, &a->kb_cnt)) + return -EFAULT; + if (copy_to_user(a->kbdiacr, accent_table, accent_table_size*sizeof(struct kbdiacr))) + return -EFAULT; return 0; } @@ -619,14 +617,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, if (!perm) return -EPERM; - i = verify_area(VERIFY_READ, (void *) a, sizeof(struct kbdiacrs)); - if (i) - return i; - __get_user(ct,&a->kb_cnt); + if (get_user(ct,&a->kb_cnt)) + return -EFAULT; if (ct >= MAX_DIACR) return -EINVAL; accent_table_size = ct; - __copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr)); + if (copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr))) + return -EFAULT; return 0; } @@ -717,12 +714,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, i = verify_area(VERIFY_WRITE,(void *)vtstat, sizeof(struct vt_stat)); if (i) return i; - __put_user(fg_console + 1, &vtstat->v_active); + put_user(fg_console 
+ 1, &vtstat->v_active); state = 1; /* /dev/tty0 is always open */ for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1) if (VT_IS_IN_USE(i)) state |= mask; - return __put_user(state, &vtstat->v_state); + return put_user(state, &vtstat->v_state); } /* @@ -856,8 +853,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, i = verify_area(VERIFY_READ, (void *)vtsizes, sizeof(struct vt_sizes)); if (i) return i; - __get_user(ll, &vtsizes->v_rows); - __get_user(cc, &vtsizes->v_cols); + get_user(ll, &vtsizes->v_rows); + get_user(cc, &vtsizes->v_cols); return vc_resize_all(ll, cc); } @@ -870,12 +867,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, i = verify_area(VERIFY_READ, (void *)vtconsize, sizeof(struct vt_consize)); if (i) return i; - __get_user(ll, &vtconsize->v_rows); - __get_user(cc, &vtconsize->v_cols); - __get_user(vlin, &vtconsize->v_vlin); - __get_user(clin, &vtconsize->v_clin); - __get_user(vcol, &vtconsize->v_vcol); - __get_user(ccol, &vtconsize->v_ccol); + get_user(ll, &vtconsize->v_rows); + get_user(cc, &vtconsize->v_cols); + get_user(vlin, &vtconsize->v_vlin); + get_user(clin, &vtconsize->v_clin); + get_user(vcol, &vtconsize->v_vcol); + get_user(ccol, &vtconsize->v_ccol); vlin = vlin ? vlin : video_scan_lines; if ( clin ) { diff --git a/fs/inode.c b/fs/inode.c index 72a23f8584fa..347e88d37d01 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -232,13 +232,15 @@ void clear_inode(struct inode *inode) /* * Dispose-list gets a local list, so it doesn't need to - * worry about list corruption. + * worry about list corruption. It releases the inode lock + * while clearing the inodes. 
*/ static void dispose_list(struct list_head * head) { struct list_head *next; int count = 0; + spin_unlock(&inode_lock); next = head->next; for (;;) { struct list_head * tmp = next; @@ -256,7 +258,6 @@ static void dispose_list(struct list_head * head) spin_lock(&inode_lock); list_splice(head, &inode_unused); inodes_stat.nr_free_inodes += count; - spin_unlock(&inode_lock); } /* @@ -305,52 +306,52 @@ int invalidate_inodes(struct super_block * sb) spin_lock(&inode_lock); busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); - spin_unlock(&inode_lock); - dispose_list(&throw_away); + spin_unlock(&inode_lock); return busy; } /* * This is called with the inode lock held. It searches - * the in-use for the specified number of freeable inodes. - * Freeable inodes are moved to a temporary list and then - * placed on the unused list by dispose_list. + * the in-use for freeable inodes, which are moved to a + * temporary list and then placed on the unused list by + * dispose_list. + * + * We don't expect to have to call this very often. * - * Note that we do not expect to have to search very hard: - * the freeable inodes will be at the old end of the list. - * - * N.B. The spinlock is released to call dispose_list. + * N.B. The spinlock is released during the call to + * dispose_list. 
*/ #define CAN_UNUSE(inode) \ - (((inode)->i_count == 0) && \ - (!(inode)->i_state)) + (((inode)->i_count | (inode)->i_state) == 0) +#define INODE(entry) (list_entry(entry, struct inode, i_list)) -static int free_inodes(int goal) +static int free_inodes(void) { - struct list_head *tmp, *head = &inode_in_use; - LIST_HEAD(freeable); - int found = 0, depth = goal << 1; + struct list_head list, *entry, *freeable = &list; + int found = 0; - while ((tmp = head->prev) != head && depth--) { - struct inode * inode = list_entry(tmp, struct inode, i_list); + INIT_LIST_HEAD(freeable); + entry = inode_in_use.next; + while (entry != &inode_in_use) { + struct list_head *tmp = entry; + + entry = entry->next; + if (!CAN_UNUSE(INODE(tmp))) + continue; list_del(tmp); - if (CAN_UNUSE(inode)) { - list_del(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_hash); - list_add(tmp, &freeable); - if (++found < goal) - continue; - break; - } - list_add(tmp, head); + list_del(&INODE(tmp)->i_hash); + INIT_LIST_HEAD(&INODE(tmp)->i_hash); + list_add(tmp, freeable); + found = 1; } + if (found) { - spin_unlock(&inode_lock); - dispose_list(&freeable); - spin_lock(&inode_lock); + dispose_list(freeable); + found = 1; /* silly compiler */ } + return found; } @@ -374,7 +375,7 @@ static void shrink_dentry_inodes(int goal) static void try_to_free_inodes(int goal) { shrink_dentry_inodes(goal); - if (!free_inodes(goal)) + if (!free_inodes()) shrink_dentry_inodes(goal); } @@ -385,7 +386,7 @@ static void try_to_free_inodes(int goal) void free_inode_memory(int goal) { spin_lock(&inode_lock); - free_inodes(goal); + free_inodes(); spin_unlock(&inode_lock); } @@ -450,7 +451,7 @@ static struct inode * grow_inodes(void) inodes_stat.preshrink = 1; spin_lock(&inode_lock); - free_inodes(inodes_stat.nr_inodes >> 2); + free_inodes(); { struct list_head *tmp = inode_unused.next; if (tmp != &inode_unused) { diff --git a/include/linux/sched.h b/include/linux/sched.h index ebb9bc276563..9b97235c83d7 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -174,6 +174,8 @@ struct mm_struct { unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; + unsigned long swap_cnt; /* number of pages to swap on next pass */ + unsigned long swap_address; /* * This is an architecture-specific pointer: the portable * part of Linux does not know about any segments. @@ -191,7 +193,7 @@ struct mm_struct { 0, 0, 0, \ 0, 0, 0, 0, \ 0, 0, 0, \ - 0, 0, NULL } + 0, 0, 0, 0, NULL } struct signal_struct { atomic_t count; @@ -276,8 +278,6 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; int swappable:1; - unsigned long swap_address; - unsigned long swap_cnt; /* number of pages to swap on next pass */ /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; @@ -361,7 +361,7 @@ struct task_struct { /* utime */ {0,0,0,0},0, \ /* per CPU times */ {0, }, {0, }, \ /* flt */ 0,0,0,0,0,0, \ -/* swp */ 0,0,0, \ +/* swp */ 0, \ /* process credentials */ \ /* uid etc */ 0,0,0,0,0,0,0,0, \ /* suppl grps*/ 0, {0,}, \ diff --git a/init/main.c b/init/main.c index aea2ca978463..9b37f328ecc1 100644 --- a/init/main.c +++ b/init/main.c @@ -64,6 +64,7 @@ extern int console_loglevel; static int init(void *); extern int bdflush(void *); extern int kswapd(void *); +extern int kpiod(void *); extern void kswapd_setup(void); extern void init_IRQ(void); @@ -1271,6 +1272,7 @@ static void __init do_basic_setup(void) kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); /* Start the background pageout daemon. 
*/ kswapd_setup(); + kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); #if CONFIG_AP1000 diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 84c345d82855..492433cde76c 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -107,6 +107,7 @@ EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(update_vm_cache); EXPORT_SYMBOL(vmtruncate); EXPORT_SYMBOL(find_vma); +EXPORT_SYMBOL(get_unmapped_area); /* filesystem internal functions */ EXPORT_SYMBOL(in_group_p); diff --git a/kernel/sched.c b/kernel/sched.c index add76fbe0bf6..513ef16f92e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -680,8 +680,18 @@ asmlinkage void schedule(void) sched_data->prevstate = prev->state; +/* this is the scheduler proper: */ { struct task_struct * p = init_task.next_run; + int c = -1000; + + /* Default process to select.. */ + next = idle_task; + if (prev->state == TASK_RUNNING) { + c = goodness(prev, prev, this_cpu); + next = prev; + } + /* * This is subtle. * Note how we can enable interrupts here, even @@ -693,36 +703,27 @@ asmlinkage void schedule(void) * the scheduler lock */ spin_unlock_irq(&runqueue_lock); -#ifdef __SMP__ - prev->has_cpu = 0; -#endif - /* * Note! there may appear new tasks on the run-queue during this, as * interrupts are enabled. However, they will be put on front of the * list, so our list starting at "p" is essentially fixed. */ -/* this is the scheduler proper: */ - { - int c = -1000; - next = idle_task; - while (p != &init_task) { - if (can_schedule(p)) { - int weight = goodness(p, prev, this_cpu); - if (weight > c) - c = weight, next = p; - } - p = p->next_run; + while (p != &init_task) { + if (can_schedule(p)) { + int weight = goodness(p, prev, this_cpu); + if (weight > c) + c = weight, next = p; } + p = p->next_run; + } - /* Do we need to re-calculate counters? 
*/ - if (!c) { - struct task_struct *p; - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + p->priority; - read_unlock(&tasklist_lock); - } + /* Do we need to re-calculate counters? */ + if (!c) { + struct task_struct *p; + read_lock(&tasklist_lock); + for_each_task(p) + p->counter = (p->counter >> 1) + p->priority; + read_unlock(&tasklist_lock); } } @@ -751,10 +752,8 @@ asmlinkage void schedule(void) * thus we have to lock the previous process from getting * rescheduled during switch_to(). */ - prev->has_cpu = 1; - - next->has_cpu = 1; next->processor = this_cpu; + next->has_cpu = 1; spin_unlock(&scheduler_lock); #endif /* __SMP__ */ if (prev != next) { diff --git a/mm/filemap.c b/mm/filemap.c index 3c15ea63b3ce..849c2a93cabb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,26 @@ struct page * page_hash_table[PAGE_HASH_SIZE]; #define release_page(page) __free_page((page)) +/* + * Define a request structure for outstanding page write requests + * to the background page io daemon + */ + +struct pio_request +{ + struct pio_request * next; + struct file * file; + unsigned long offset; + unsigned long page; +}; +static struct pio_request *pio_first = NULL, **pio_last = &pio_first; +static kmem_cache_t *pio_request_cache; +static struct wait_queue *pio_wait = NULL; + +static inline void +make_pio_request(struct file *, unsigned long, unsigned long); + + /* * Invalidate the pages of an inode, removing all pages that aren't * locked down (those are sure to be up-to-date anyway, so we shouldn't @@ -1079,8 +1100,9 @@ static inline int do_write_page(struct inode * inode, struct file * file, } static int filemap_write_page(struct vm_area_struct * vma, - unsigned long offset, - unsigned long page) + unsigned long offset, + unsigned long page, + int wait) { int result; struct file * file; @@ -1098,6 +1120,17 @@ static int filemap_write_page(struct vm_area_struct * 
vma, * and file could be released ... increment the count to be safe. */ file->f_count++; + + /* + * If this is a swapping operation rather than msync(), then + * leave the actual IO, and the restoration of the file count, + * to the kpiod thread. Just queue the request for now. + */ + if (!wait) { + make_pio_request(file, offset, page); + return 0; + } + down(&inode->i_sem); result = do_write_page(inode, file, (const char *) page, offset); up(&inode->i_sem); @@ -1113,7 +1146,7 @@ static int filemap_write_page(struct vm_area_struct * vma, */ int filemap_swapout(struct vm_area_struct * vma, struct page * page) { - return filemap_write_page(vma, page->offset, page_address(page)); + return filemap_write_page(vma, page->offset, page_address(page), 0); } static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, @@ -1150,7 +1183,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, return 0; } } - error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page); + error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1); free_page(page); return error; } @@ -1569,3 +1602,120 @@ void put_cached_page(unsigned long addr) wake_up(&page->wait); __free_page(page); } + + +/* Add request for page IO to the queue */ + +static inline void put_pio_request(struct pio_request *p) +{ + *pio_last = p; + p->next = NULL; + pio_last = &p->next; +} + +/* Take the first page IO request off the queue */ + +static inline struct pio_request * get_pio_request(void) +{ + struct pio_request * p = pio_first; + pio_first = p->next; + if (!pio_first) + pio_last = &pio_first; + return p; +} + +/* Make a new page IO request and queue it to the kpiod thread */ + +static inline void make_pio_request(struct file *file, + unsigned long offset, + unsigned long page) +{ + struct pio_request *p; + + atomic_inc(&mem_map[MAP_NR(page)].count); + + /* + * We need to allocate without causing any recursive IO in the + * current 
thread's context. We might currently be swapping out + * as a result of an allocation made while holding a critical + * filesystem lock. To avoid deadlock, we *MUST* not reenter + * the filesystem in this thread. + * + * We can wait for kswapd to free memory, or we can try to free + * pages without actually performing further IO, without fear of + * deadlock. --sct + */ + + while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) { + if (try_to_free_pages(__GFP_WAIT)) + continue; + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + } + + p->file = file; + p->offset = offset; + p->page = page; + + put_pio_request(p); + wake_up(&pio_wait); +} + + +/* + * This is the only thread which is allowed to write out filemap pages + * while swapping. + * + * To avoid deadlock, it is important that we never reenter this thread. + * Although recursive memory allocations within this thread may result + * in more page swapping, that swapping will always be done by queuing + * another IO request to the same thread: we will never actually start + * that IO request until we have finished with the current one, and so + * we will not deadlock. 
+ */ + +int kpiod(void * unused) +{ + struct wait_queue wait = {current}; + struct inode * inode; + struct dentry * dentry; + struct pio_request * p; + + current->session = 1; + current->pgrp = 1; + strcpy(current->comm, "kpiod"); + sigfillset(&current->blocked); + init_waitqueue(&pio_wait); + + lock_kernel(); + + pio_request_cache = kmem_cache_create("pio_request", + sizeof(struct pio_request), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!pio_request_cache) + panic ("Could not create pio_request slab cache"); + + while (1) { + current->state = TASK_INTERRUPTIBLE; + add_wait_queue(&pio_wait, &wait); + while (!pio_first) + schedule(); + remove_wait_queue(&pio_wait, &wait); + current->state = TASK_RUNNING; + + while (pio_first) { + p = get_pio_request(); + dentry = p->file->f_dentry; + inode = dentry->d_inode; + + down(&inode->i_sem); + do_write_page(inode, p->file, + (const char *) p->page, p->offset); + up(&inode->i_sem); + fput(p->file); + free_page(p->page); + kmem_cache_free(pio_request_cache, p); + } + } +} diff --git a/mm/vmscan.c b/mm/vmscan.c index 116096153341..7dbae4cfc3e5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -202,7 +202,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * do { int result; - tsk->swap_address = address + PAGE_SIZE; + tsk->mm->swap_address = address + PAGE_SIZE; result = try_to_swap_out(tsk, vma, address, pte, gfp_mask); if (result) return result; @@ -274,7 +274,7 @@ static int swap_out_process(struct task_struct * p, int gfp_mask) /* * Go through process' page directory. */ - address = p->swap_address; + address = p->mm->swap_address; /* * Find the proper vm-area @@ -296,8 +296,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask) } /* We didn't find anything for the process */ - p->swap_cnt = 0; - p->swap_address = 0; + p->mm->swap_cnt = 0; + p->mm->swap_address = 0; return 0; } @@ -345,9 +345,9 @@ static int swap_out(unsigned int priority, int gfp_mask) continue; /* Refresh swap_cnt? 
*/ if (assign) - p->swap_cnt = p->mm->rss; - if (p->swap_cnt > max_cnt) { - max_cnt = p->swap_cnt; + p->mm->swap_cnt = p->mm->rss; + if (p->mm->swap_cnt > max_cnt) { + max_cnt = p->mm->swap_cnt; pbest = p; } } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f8990903ee9c..d21b1065308b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -184,6 +184,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req; i++, req = req->dl_next) { + if (req->sk) + continue; pos += 128; if (pos < offset) continue; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 660e64c44ffd..18a058c3189a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1563,12 +1563,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } #endif /* CONFIG_FILTER */ - /* - * socket locking is here for SMP purposes as backlog rcv - * is currently called with bh processing disabled. - */ - lock_sock(sk); - /* * This doesn't check if the socket has enough room for the packet. * Either process the packet _without_ queueing it and then free it, @@ -1579,7 +1573,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; - release_sock(sk); return 0; } @@ -1590,14 +1583,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; - lock_sock(nsk); - release_sock(sk); + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if (atomic_read(&nsk->sock_readers)) { + __skb_queue_tail(&nsk->back_log, skb); + return 0; + } sk = nsk; } if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; - release_sock(sk); return 0; reset: @@ -1609,7 +1609,6 @@ discard: * might be destroyed here. 
This current version compiles correctly, * but you have been warned. */ - release_sock(sk); return 0; } -- 2.39.5