From: Linus Torvalds Date: Fri, 23 Nov 2007 20:26:09 +0000 (-0500) Subject: Import 2.3.11pre3 X-Git-Tag: 2.3.11pre3 X-Git-Url: http://git.neil.brown.name/?a=commitdiff_plain;h=5c2f9737e8637fed65bf9efcb5422de1298786b7;p=history.git Import 2.3.11pre3 --- diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile index 6a3fda06dcc7..aef76160275d 100644 --- a/arch/i386/boot/compressed/Makefile +++ b/arch/i386/boot/compressed/Makefile @@ -30,7 +30,7 @@ vmlinux: piggy.o $(OBJECTS) bvmlinux: piggy.o $(OBJECTS) $(LD) $(BZLINKFLAGS) -o bvmlinux $(OBJECTS) piggy.o -head.o: head.S $(TOPDIR)/include/linux/tasks.h +head.o: head.S $(CC) $(AFLAGS) -traditional -c head.S piggy.o: $(SYSTEM) diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 0c3f24889c20..7aba6fdc9948 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -50,7 +50,7 @@ ifdef CONFIG_X86_VISWS_APIC O_OBJS += visws_apic.o endif -head.o: head.S $(TOPDIR)/include/linux/tasks.h +head.o: head.S $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $*.S -o $*.o include $(TOPDIR)/Rules.make diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index acbc3e325000..ac854e721832 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -8,11 +8,12 @@ */ .text -#include +#include #include #include #include #include +#include #define CL_MAGIC_ADDR 0x90020 @@ -330,7 +331,7 @@ ignore_int: * of tasks we can have.. */ #define IDT_ENTRIES 256 -#define GDT_ENTRIES (12+2*NR_TASKS) +#define GDT_ENTRIES (__TSS(NR_CPUS)) .globl SYMBOL_NAME(idt) @@ -519,8 +520,7 @@ ENTRY(empty_zero_page) ALIGN /* - * This contains up to 8192 quadwords depending on NR_TASKS - 64kB of - * gdt entries. Ugh. + * This contains typically 140 quadwords, depending on NR_CPUS. * * NOTE! Make sure the gdt descriptor in head.S matches this if you * change anything. 
@@ -542,7 +542,7 @@ ENTRY(gdt_table) .quad 0x00409a0000000000 /* 0x48 APM CS code */ .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0x58 APM DS data */ - .fill 2*NR_TASKS,8,0 /* space for LDT's and TSS's etc */ + .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ /* * This is to aid debugging, the various locking macros will be putting diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 0faa696a4e3e..537bcd196968 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -4,6 +4,7 @@ #include #include #include +#include static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; @@ -22,4 +23,14 @@ struct mm_struct init_mm = INIT_MM(init_mm); union task_union init_task_union __attribute__((__section__(".data.init_task"))) = { INIT_TASK(init_task_union.task) }; - + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +struct hard_thread_struct init_tss[NR_CPUS] __cacheline_aligned = + { [0 ... 
NR_CPUS-1] = INIT_TSS }; + diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 445a266139b3..9664ebfc1435 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -54,7 +54,8 @@ static void set_bitmap(unsigned long *bitmap, short base, short extent, int new_ */ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - struct thread_struct * t = &current->tss; + struct soft_thread_struct * t = &current->thread; + struct hard_thread_struct * tss = init_tss + smp_processor_id(); if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32)) return -EINVAL; @@ -65,14 +66,24 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ -#define IO_BITMAP_OFFSET offsetof(struct thread_struct,io_bitmap) - - if (t->bitmap != IO_BITMAP_OFFSET) { - t->bitmap = IO_BITMAP_OFFSET; + if (!t->ioperm) { + /* + * just in case ... + */ memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4); + t->ioperm = 1; + /* + * this activates it in the TSS + */ + tss->bitmap = IO_BITMAP_OFFSET; } - - set_bitmap((unsigned long *)t->io_bitmap, from, num, !turn_on); + + /* + * do it in the per-thread copy and in the TSS ... 
+ */ + set_bitmap(t->io_bitmap, from, num, !turn_on); + set_bitmap(tss->io_bitmap, from, num, !turn_on); + return 0; } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 9dc24667bc70..bbfb9e3be2c4 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index 25e8deec44e8..fa2f06254080 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -2,6 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1998 Ingo Molnar */ #include @@ -17,19 +18,31 @@ #include #include +/* + * read_ldt() is not really atomic - this is not a problem since + * synchronization of reads and writes done to the LDT has to be + * assured by user-space anyway. Writes are atomic, to protect + * the security checks done on new descriptors. + */ static int read_ldt(void * ptr, unsigned long bytecount) { - void * address = current->mm->segments; + int err; unsigned long size; + struct mm_struct * mm = current->mm; + + err = 0; + if (!mm->segments) + goto out; - if (!ptr) - return -EINVAL; - if (!address) - return 0; size = LDT_ENTRIES*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; - return copy_to_user(ptr, address, size) ? -EFAULT : size; + + err = size; + if (copy_to_user(ptr, mm->segments, size)) + err = -EFAULT; +out: + return err; } static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) @@ -64,31 +77,29 @@ static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) * you get strange behaviour (the kernel is safe, it's just user * space strangeness). * - * For no good reason except historical, the GDT index of the LDT - * is chosen to follow the index number in the task[] array. 
+ * we have two choices: either we preallocate the LDT descriptor + * and can do a shared modify_ldt(), or we postallocate it and do + * an smp message pass to update it. Currently we are a bit + * un-nice to user-space and reload the LDT only on the next + * schedule. (only an issue on SMP) + * + * the GDT index of the LDT is allocated dynamically, and is + * limited by MAX_LDT_DESCRIPTORS. */ + down(&mm->mmap_sem); if (!mm->segments) { - void * ldt; + error = -ENOMEM; - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - if (!ldt) - goto out; - memset(ldt, 0, LDT_ENTRIES*LDT_ENTRY_SIZE); + mm->segments = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!mm->segments) + goto out_unlock; + + if (atomic_read(&mm->count) > 1) + printk(KERN_WARNING "LDT allocated for cloned task!\n"); /* - * Make sure someone else hasn't allocated it for us ... + * Possibly do an SMP cross-call to other CPUs to reload + * their LDTs */ - if (!mm->segments) { - int i = current->tarray_ptr - &task[0]; - mm->segments = ldt; - set_ldt_desc(i, ldt, LDT_ENTRIES); - current->tss.ldt = _LDT(i); - load_ldt(i); - if (atomic_read(&mm->count) > 1) - printk(KERN_WARNING - "LDT allocated for cloned task!\n"); - } else { - vfree(ldt); - } } lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->segments); @@ -127,6 +138,9 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + +out_unlock: + up(&mm->mmap_sem); out: return error; } @@ -135,7 +149,6 @@ asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) { int ret = -ENOSYS; - lock_kernel(); switch (func) { case 0: ret = read_ldt(ptr, bytecount); @@ -147,6 +160,5 @@ asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) ret = write_ldt(ptr, bytecount, 0); break; } - unlock_kernel(); return ret; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 08dde1ed7c9c..427b5e0883c5 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -405,6 +405,7 @@ void show_regs(struct 
pt_regs * regs) regs->esi, regs->edi, regs->ebp); printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); + __asm__("movl %%cr0, %0": "=r" (cr0)); __asm__("movl %%cr2, %0": "=r" (cr2)); __asm__("movl %%cr3, %0": "=r" (cr3)); @@ -475,11 +476,28 @@ void free_task_struct(struct task_struct *p) free_pages((unsigned long) p, 1); } +/* + * No need to lock the MM as we are the last user + */ void release_segments(struct mm_struct *mm) { - if (mm->segments) { - void * ldt = mm->segments; + void * ldt = mm->segments; + + /* + * free the LDT + */ + if (ldt) { mm->segments = NULL; + /* + * special case, when we release the LDT from under + * the running CPU. Other CPUs cannot possibly use + * this LDT as we were getting here through mmput() ... + */ + if (mm == current->mm) + load_LDT(mm); + /* + * Nobody anymore uses the LDT, we can free it: + */ vfree(ldt); } } @@ -492,10 +510,9 @@ void forget_segments(void) : "r" (0)); /* - * Get the LDT entry from init_task. + * Load the LDT entry of init_task. */ - current->tss.ldt = _LDT(0); - load_ldt(0); + load_LDT(init_task.mm); } /* @@ -537,12 +554,9 @@ void exit_thread(void) void flush_thread(void) { - int i; struct task_struct *tsk = current; - for (i=0 ; i<8 ; i++) - tsk->tss.debugreg[i] = 0; - + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); /* * Forget coprocessor state.. */ @@ -552,33 +566,50 @@ void flush_thread(void) void release_thread(struct task_struct *dead_task) { + void * ldt = dead_task->mm->segments; + + // temporary debugging check + if (ldt) { + printk("WARNING: dead process %8s still has LDT? <%p>\n", + dead_task->comm, ldt); + BUG(); + } } /* - * If new_mm is NULL, we're being called to set up the LDT descriptor - * for a clone task. Each clone must have a separate entry in the GDT. + * If new_mm is NULL, we're being called to set up the LDT for + * a clone task: this is easy since the clone is not running yet. + * otherwise we copy the old segment into a new segment. 
+ * + * we do not have to muck with descriptors here, that is + * done in __switch_to() and get_mmu_context(). */ -void copy_segments(int nr, struct task_struct *p, struct mm_struct *new_mm) +void copy_segments(struct task_struct *p, struct mm_struct *new_mm) { struct mm_struct * old_mm = current->mm; void * old_ldt = old_mm->segments, * ldt = old_ldt; - /* default LDT - use the one from init_task */ - p->tss.ldt = _LDT(0); - if (old_ldt) { - if (new_mm) { - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - new_mm->segments = ldt; - if (!ldt) { - printk(KERN_WARNING "ldt allocation failed\n"); - return; - } - memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); - } - p->tss.ldt = _LDT(nr); - set_ldt_desc(nr, ldt, LDT_ENTRIES); + if (!old_mm->segments) { + /* + * default LDT - use the one from init_task + */ + if (new_mm) + new_mm->segments = NULL; return; } + + if (new_mm) { + /* + * Completely new LDT, we initialize it from the parent: + */ + ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!ldt) + printk(KERN_WARNING "ldt allocation failed\n"); + else + memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); + new_mm->segments = ldt; + } + return; } /* @@ -592,31 +623,21 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, { struct pt_regs * childregs; - childregs = ((struct pt_regs *) (2*PAGE_SIZE + (unsigned long) p)) - 1; + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; *childregs = *regs; childregs->eax = 0; childregs->esp = esp; - p->tss.esp = (unsigned long) childregs; - p->tss.esp0 = (unsigned long) (childregs+1); - p->tss.ss0 = __KERNEL_DS; + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); - p->tss.tr = _TSS(nr); - set_tss_desc(nr,&(p->tss)); - p->tss.eip = (unsigned long) ret_from_fork; + p->thread.eip = (unsigned long) ret_from_fork; - savesegment(fs,p->tss.fs); - savesegment(gs,p->tss.gs); - - /* - * a bitmap offset pointing outside of the TSS limit causes a nicely - * 
controllable SIGSEGV. The first sys_ioperm() call sets up the - * bitmap properly. - */ - p->tss.bitmap = sizeof(struct thread_struct); + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); unlazy_fpu(current); - p->tss.i387 = current->tss.i387; + p->thread.i387 = current->thread.i387; return 0; } @@ -632,7 +653,7 @@ int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); + memcpy(fpu,&tsk->thread.i387.hard,sizeof(*fpu)); } return fpvalid; @@ -654,7 +675,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->tss.debugreg[i]; + dump->u_debugreg[i] = current->thread.debugreg[i]; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; @@ -683,11 +704,10 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* * This special macro can be used to load a debugging register */ -#define loaddebug(tsk,register) \ +#define loaddebug(thread,register) \ __asm__("movl %0,%%db" #register \ : /* no output */ \ - :"r" (tsk->tss.debugreg[register])) - + :"r" (thread->debugreg[register])) /* * switch_to(x,yn) should switch tasks from x to y. @@ -712,60 +732,80 @@ void dump_thread(struct pt_regs * regs, struct user * dump) * More important, however, is the fact that this allows us much * more flexibility. */ -void __switch_to(struct task_struct *prev, struct task_struct *next) +extern int cpus_initialized; +void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - /* Do the FPU save and set TS if it wasn't set before.. 
*/ - unlazy_fpu(prev); + struct soft_thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + struct hard_thread_struct *tss = init_tss + smp_processor_id(); + + unlazy_fpu(prev_p); /* - * Reload TR, LDT and the page table pointers.. - * - * We need TR for the IO permission bitmask (and - * the vm86 bitmasks in case we ever use enhanced - * v86 mode properly). - * - * We may want to get rid of the TR register some - * day, and copy the bitmaps around by hand. Oh, - * well. In the meantime we have to clear the busy - * bit in the TSS entry, ugh. + * Reload esp0, LDT and the page table pointer: */ - gdt_table[next->tss.tr >> 3].b &= 0xfffffdff; - asm volatile("ltr %0": :"g" (*(unsigned short *)&next->tss.tr)); + tss->esp0 = next->esp0; /* * Save away %fs and %gs. No need to save %es and %ds, as * those are always kernel segments while inside the kernel. */ - asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->tss.fs)); - asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->tss.gs)); + asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); + asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* Re-load LDT if necessary */ - if (next->mm->segments != prev->mm->segments) - asm volatile("lldt %0": :"g" (*(unsigned short *)&next->tss.ldt)); + if (prev_p->mm->segments != next_p->mm->segments) + load_LDT(next_p->mm); /* Re-load page tables */ { - unsigned long new_cr3 = next->tss.cr3; - if (new_cr3 != prev->tss.cr3) + unsigned long new_cr3 = next->cr3; + + tss->cr3 = new_cr3; + if (new_cr3 != prev->cr3) asm volatile("movl %0,%%cr3": :"r" (new_cr3)); } /* * Restore %fs and %gs. 
*/ - loadsegment(fs,next->tss.fs); - loadsegment(gs,next->tss.gs); + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); /* * Now maybe reload the debug registers */ - if (next->tss.debugreg[7]){ - loaddebug(next,0); - loaddebug(next,1); - loaddebug(next,2); - loaddebug(next,3); - loaddebug(next,6); - loaddebug(next,7); + if (next->debugreg[7]){ + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (prev->ioperm || next->ioperm) { + if (next->ioperm) { + /* + * 4 cachelines copy ... not good, but not that + * bad either. Anyone got something better? + * This only affects processes which use ioperm(). + * [Putting the TSSs into 4k-tlb mapped regions + * and playing VM tricks to switch the IO bitmap + * is not really acceptable.] + */ + memcpy(tss->io_bitmap, next->io_bitmap, + IO_BITMAP_SIZE*sizeof(unsigned long)); + tss->bitmap = IO_BITMAP_OFFSET; + } else + /* + * a bitmap offset pointing outside of the TSS limit + * causes a nicely controllable SIGSEGV if a process + * tries to use a port IO instruction. The first + * sys_ioperm() call sets up the bitmap properly. 
+ */ + tss->bitmap = INVALID_IO_BITMAP_OFFSET; } } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 9935cdf53fac..e8645129123b 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -45,7 +45,7 @@ static inline int get_stack_long(struct task_struct *task, int offset) { unsigned char *stack; - stack = (unsigned char *)task->tss.esp0; + stack = (unsigned char *)task->thread.esp0; stack += offset; return (*((int *)stack)); } @@ -61,7 +61,7 @@ static inline int put_stack_long(struct task_struct *task, int offset, { unsigned char * stack; - stack = (unsigned char *) task->tss.esp0; + stack = (unsigned char *) task->thread.esp0; stack += offset; *(unsigned long *) stack = data; return 0; @@ -76,12 +76,12 @@ static int putreg(struct task_struct *child, case FS: if (value && (value & 3) != 3) return -EIO; - child->tss.fs = value; + child->thread.fs = value; return 0; case GS: if (value && (value & 3) != 3) return -EIO; - child->tss.gs = value; + child->thread.gs = value; return 0; case DS: case ES: @@ -112,10 +112,10 @@ static unsigned long getreg(struct task_struct *child, switch (regno >> 2) { case FS: - retval = child->tss.fs; + retval = child->thread.fs; break; case GS: - retval = child->tss.gs; + retval = child->thread.gs; break; case DS: case ES: @@ -229,7 +229,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) addr <= (long) &dummy->u_debugreg[7]){ addr -= (long) &dummy->u_debugreg[0]; addr = addr >> 2; - tmp = child->tss.debugreg[addr]; + tmp = child->thread.debugreg[addr]; }; ret = put_user(tmp,(unsigned long *) data); goto out; @@ -278,7 +278,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) addr -= (long) &dummy->u_debugreg; addr = addr >> 2; - child->tss.debugreg[addr] = data; + child->thread.debugreg[addr] = data; ret = 0; goto out; }; @@ -409,18 +409,18 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) ret = 0; if ( !child->used_math ) { /* 
Simulate an empty FPU. */ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + child->thread.i387.hard.cwd = 0xffff037f; + child->thread.i387.hard.swd = 0xffff0000; + child->thread.i387.hard.twd = 0xffffffff; } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->tss.i387.hard, + __copy_to_user((void *)data, &child->thread.i387.hard, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - save_i387_soft(&child->tss.i387.soft, + save_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif @@ -438,11 +438,11 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->tss.i387.hard, (void *)data, + __copy_from_user(&child->thread.i387.hard, (void *)data, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - restore_i387_soft(&child->tss.i387.soft, + restore_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 20160e173ba9..e45e87082eb7 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -49,6 +49,7 @@ #include #include #include +#include /* * Machine setup.. @@ -57,6 +58,8 @@ char ignore_irq13 = 0; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +unsigned long mmu_cr4_features __initdata = 0; + /* * Bus types .. */ @@ -994,3 +997,63 @@ int get_cpuinfo(char * buffer) } return p - buffer; } + +int cpus_initialized = 0; +unsigned long cpu_initialized = 0; + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. 
+ */ +void cpu_init (void) +{ + int nr = smp_processor_id(); + struct hard_thread_struct * t = &init_tss[nr]; + + if (test_and_set_bit(nr,&cpu_initialized)) { + printk("CPU#%d ALREADY INITIALIZED!!!!!!!!!\n", nr); + for (;;) __sti(); + } + cpus_initialized++; + printk("INITIALIZING CPU#%d\n", nr); + + if (boot_cpu_data.x86_capability & X86_FEATURE_PSE) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + __asm__ __volatile__("lgdt %0": "=m" (gdt_descr)); + __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * set up and load the per-CPU TSS and LDT + */ + t->esp0 = current->thread.esp0; + set_tss_desc(nr,t); + gdt_table[__TSS(nr)].b &= 0xfffffdff; + load_TR(nr); + + load_LDT(current->mm); + + /* + * Clear all 6 debug registers: + */ + +#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); + + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + +#undef CD + + /* + * Force FPU initialization: + */ + current->flags &= ~PF_USEDFPU; + current->used_math = 0; + stts(); +} diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 32e7c4c56f4a..e5bc3141b40b 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -155,7 +155,7 @@ static inline int restore_i387_hard(struct _fpstate *buf) { struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); + return __copy_from_user(&tsk->thread.i387.hard, buf, sizeof(*buf)); } static inline int restore_i387(struct _fpstate *buf) @@ -167,7 +167,7 @@ static inline int restore_i387(struct _fpstate *buf) if (boot_cpu_data.hard_math) err = restore_i387_hard(buf); else - err = restore_i387_soft(&current->tss.i387.soft, buf); + err = restore_i387_soft(&current->thread.i387.soft, buf); #endif current->used_math = 1; return err; @@ -308,8 +308,8 @@ static inline int save_i387_hard(struct _fpstate * buf) struct task_struct *tsk = 
current; unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf))) + tsk->thread.i387.hard.status = tsk->thread.i387.hard.swd; + if (__copy_to_user(buf, &tsk->thread.i387.hard, sizeof(*buf))) return -1; return 1; } @@ -328,7 +328,7 @@ static int save_i387(struct _fpstate *buf) return save_i387_hard(buf); #else return boot_cpu_data.hard_math ? save_i387_hard(buf) - : save_i387_soft(&current->tss.i387.soft, buf); + : save_i387_soft(&current->thread.i387.soft, buf); #endif } @@ -354,8 +354,8 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, err |= __put_user(regs->edx, &sc->edx); err |= __put_user(regs->ecx, &sc->ecx); err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->tss.trap_no, &sc->trapno); - err |= __put_user(current->tss.error_code, &sc->err); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->eip, &sc->eip); err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); err |= __put_user(regs->eflags, &sc->eflags); @@ -370,7 +370,7 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, /* non-iBCS2 extensions.. 
*/ err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->tss.cr2, &sc->cr2); + err |= __put_user(current->thread.cr2, &sc->cr2); return err; } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index f092d09059b2..a62937b8631e 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -104,7 +104,7 @@ int smp_found_config=0; /* Have we found an SMP box */ unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */ unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */ -int smp_num_cpus = 1; /* Total count of live CPUs */ +int smp_num_cpus = 0; /* Total count of live CPUs */ int smp_threads_ready=0; /* Set when the idlers are all forked */ volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */ volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */ @@ -225,6 +225,7 @@ static char *mpc_family(int family,int model) return n; } + /* * Read the MPC */ @@ -637,6 +638,8 @@ void __init init_smp_config (void) #endif } + + /* * Trampoline 80x86 program as an array. */ @@ -882,6 +885,7 @@ int __init start_secondary(void *unused) * booting is too fragile that we want to limit the * things done here to the most necessary things. */ + cpu_init(); smp_callin(); while (!atomic_read(&smp_commenced)) /* nothing */ ; @@ -896,15 +900,6 @@ int __init start_secondary(void *unused) */ void __init initialize_secondary(void) { - struct thread_struct * p = &current->tss; - - /* - * Load up the LDT and the task register. - */ - asm volatile("lldt %%ax": :"a" (p->ldt)); - asm volatile("ltr %%ax": :"a" (p->tr)); - stts(); - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. 
@@ -914,7 +909,7 @@ void __init initialize_secondary(void) "movl %0,%%esp\n\t" "jmp *%1" : - :"r" (p->esp),"r" (p->eip)); + :"r" (current->thread.esp),"r" (current->thread.eip)); } extern struct { @@ -937,7 +932,13 @@ static void __init do_boot_cpu(int i) kernel_thread(start_secondary, NULL, CLONE_PID); cpucount++; - idle = task[cpucount]; + /* + * We remove it from the pidhash and the runqueue + * once we got the process: + */ + idle = init_task.prev_task; + + init_tasks[cpucount] = idle; if (!idle) panic("No idle process for CPU %d", i); @@ -945,7 +946,10 @@ static void __init do_boot_cpu(int i) __cpu_logical_map[cpucount] = i; cpu_number_map[i] = cpucount; idle->has_cpu = 1; /* we schedule the first task manually */ - idle->tss.eip = (unsigned long) start_secondary; + idle->thread.eip = (unsigned long) start_secondary; + + del_from_runqueue(idle); + unhash_process(idle); /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -1179,7 +1183,6 @@ void __init smp_boot_cpus(void) /* Must be done before other processors booted */ mtrr_init_boot_cpu (); #endif - init_idle(); /* * Initialize the logical to physical CPU number mapping * and the per-CPU profiling counter/multiplier @@ -1210,6 +1213,8 @@ void __init smp_boot_cpus(void) cpu_number_map[boot_cpu_id] = 0; + init_idle(); + /* * If we couldnt find an SMP configuration at boot time, * get out of here now! 
@@ -1356,31 +1361,33 @@ void __init smp_boot_cpus(void) */ SMP_PRINTK(("Before bogomips.\n")); - if (cpucount==0) - { + if (!cpucount) { printk(KERN_ERR "Error: only one processor found.\n"); cpu_online_map = (1<tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ force_sig(signr, tsk); \ die_if_no_fixup(str,regs,error_code); \ } @@ -80,8 +80,8 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ goto out; \ /* else fall through */ \ } \ - tsk->tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ force_sig(signr, tsk); \ die_if_kernel(str,regs,error_code); \ out: \ @@ -143,10 +143,8 @@ static void show_registers(struct pt_regs *regs) regs->esi, regs->edi, regs->ebp, esp); printk("ds: %04x es: %04x ss: %04x\n", regs->xds & 0xffff, regs->xes & 0xffff, ss); - store_TR(i); - printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", - current->comm, current->pid, 0xffff & i, 4096+(unsigned long)current); - + printk("Process %s (pid: %d, stackpage=%08lx)", + current->comm, current->pid, 4096+(unsigned long)current); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
@@ -249,8 +247,8 @@ asmlinkage void cache_flush_denied(struct pt_regs * regs, long error_code) return; } die_if_kernel("cache flush denied",regs,error_code); - current->tss.error_code = error_code; - current->tss.trap_no = 19; + current->thread.error_code = error_code; + current->thread.trap_no = 19; force_sig(SIGSEGV, current); } @@ -262,8 +260,8 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) if (!(regs->xcs & 3)) goto gp_in_kernel; - current->tss.error_code = error_code; - current->tss.trap_no = 13; + current->thread.error_code = error_code; + current->thread.trap_no = 13; force_sig(SIGSEGV, current); return; @@ -374,9 +372,9 @@ asmlinkage void do_debug(struct pt_regs * regs, long error_code) goto clear_TF; } - /* Mast out spurious debug traps due to lazy DR7 setting */ + /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->tss.debugreg[7]) + if (!tsk->thread.debugreg[7]) goto clear_dr7; } @@ -385,8 +383,8 @@ asmlinkage void do_debug(struct pt_regs * regs, long error_code) goto clear_dr7; /* Ok, finally something we can handle */ - tsk->tss.trap_no = 1; - tsk->tss.error_code = error_code; + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; force_sig(SIGTRAP, tsk); return; @@ -422,8 +420,8 @@ void math_error(void) */ task = current; save_fpu(task); - task->tss.trap_no = 16; - task->tss.error_code = 0; + task->thread.trap_no = 16; + task->thread.error_code = 0; force_sig(SIGFPE, task); } @@ -453,7 +451,7 @@ asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); + __asm__("frstor %0": :"m" (current->thread.i387)); else { /* @@ -486,6 +484,7 @@ __initfunc(void trap_init_f00f_bug(void)) pmd_t * pmd; pte_t * pte; +return; /* * Allocate a new page in virtual address space, * move the IDT into it and write 
protect this page. @@ -570,12 +569,12 @@ __asm__ __volatile__ ("movw %3,0(%2)\n\t" \ void set_tss_desc(unsigned int n, void *addr) { - _set_tssldt_desc(gdt_table+FIRST_TSS_ENTRY+(n<<1), (int)addr, 235, 0x89); + _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89); } void set_ldt_desc(unsigned int n, void *addr, unsigned int size) { - _set_tssldt_desc(gdt_table+FIRST_LDT_ENTRY+(n<<1), (int)addr, ((size << 3) - 1), 0x82); + _set_tssldt_desc(gdt_table+__LDT(n), (int)addr, ((size << 3)-1), 0x82); } #ifdef CONFIG_X86_VISWS_APIC @@ -672,7 +671,7 @@ void __init trap_init(void) { if (readl(0x0FFFD9) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) EISA_bus = 1; - set_call_gate(&default_ldt,lcall7); + set_trap_gate(0,&divide_error); set_trap_gate(1,&debug); set_trap_gate(2,&nmi); @@ -693,14 +692,20 @@ void __init trap_init(void) set_trap_gate(17,&alignment_check); set_system_gate(SYSCALL_VECTOR,&system_call); - /* set up GDT task & ldt entries */ - set_tss_desc(0, &init_task.tss); - set_ldt_desc(0, &default_ldt, 1); + /* + * default LDT is a single-entry callgate to lcall7 + */ + set_call_gate(&default_ldt,lcall7); + + /* + * on SMP we do not yet know which CPU is on which TSS, + * so we delay this until smp_init().
(the CPU is already + * in a reasonable state, otherwise we wouldnt have gotten so far :) + */ +#ifndef __SMP__ + cpu_init(); +#endif - /* Clear NT, so that we won't have troubles with that later on */ - __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); - load_TR(0); - load_ldt(0); #ifdef CONFIG_X86_VISWS_APIC superio_init(); lithium_init(); diff --git a/arch/i386/kernel/visws_apic.c b/arch/i386/kernel/visws_apic.c index c120546894e0..2eda467e34d1 100644 --- a/arch/i386/kernel/visws_apic.c +++ b/arch/i386/kernel/visws_apic.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index d181dc6994bd..d49eed6e83ac 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -48,8 +48,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->tss.v86flags)) -#define VEFLAGS (current->tss.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) +#define VEFLAGS (current->thread.v86flags) #define set_flags(X,new,mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -65,25 +65,27 @@ asmlinkage struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { + struct hard_thread_struct *tss; struct pt_regs *ret; unsigned long tmp; lock_kernel(); - if (!current->tss.vm86_info) { + if (!current->thread.vm86_info) { printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->tss.v86mask); - tmp = copy_to_user(&current->tss.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(&current->tss.vm86_info->regs.VM86_REGS_PART2, + set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); + tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2, &regs->VM86_REGS_PART2, VM86_REGS_SIZE2); - tmp += 
put_user(current->tss.screen_bitmap,&current->tss.vm86_info->screen_bitmap); + tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); do_exit(SIGSEGV); } - current->tss.esp0 = current->tss.saved_esp0; - current->tss.saved_esp0 = 0; + tss = init_tss + smp_processor_id(); + tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; + current->thread.saved_esp0 = 0; ret = KVM86->regs32; unlock_kernel(); return ret; @@ -138,7 +140,7 @@ asmlinkage int sys_vm86old(struct vm86_struct * v86) lock_kernel(); tsk = current; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, @@ -148,7 +150,7 @@ asmlinkage int sys_vm86old(struct vm86_struct * v86) goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); info.regs32 = (struct pt_regs *) &v86; - tsk->tss.vm86_info = v86; + tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -188,7 +190,7 @@ asmlinkage int sys_vm86(unsigned long subfunction, struct vm86plus_struct * v86) /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, @@ -198,7 +200,7 @@ asmlinkage int sys_vm86(unsigned long subfunction, struct vm86plus_struct * v86) goto out; info.regs32 = (struct pt_regs *) &subfunction; info.vm86plus.is_vm86pus = 1; - tsk->tss.vm86_info = (struct vm86_struct *)v86; + tsk->thread.vm86_info = (struct vm86_struct *)v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -209,6 +211,7 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { + struct hard_thread_struct 
*tss; /* * make sure the vm86() system call doesn't try to do anything silly */ @@ -231,16 +234,16 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk switch (info->cpu_type) { case CPU_286: - tsk->tss.v86mask = 0; + tsk->thread.v86mask = 0; break; case CPU_386: - tsk->tss.v86mask = NT_MASK | IOPL_MASK; + tsk->thread.v86mask = NT_MASK | IOPL_MASK; break; case CPU_486: - tsk->tss.v86mask = AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK; break; default: - tsk->tss.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; break; } @@ -248,10 +251,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk * Save old state, set default return value (%eax) to 0 */ info->regs32->eax = 0; - tsk->tss.saved_esp0 = tsk->tss.esp0; - tsk->tss.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.saved_esp0 = tsk->thread.esp0; + tss = init_tss + smp_processor_id(); + tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; - tsk->tss.screen_bitmap = info->screen_bitmap; + tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk); unlock_kernel(); @@ -295,7 +299,7 @@ static inline void clear_TF(struct kernel_vm86_regs * regs) static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->tss.v86mask); + set_flags(VEFLAGS, eflags, current->thread.v86mask); set_flags(regs->eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); @@ -303,7 +307,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { - set_flags(VFLAGS, flags, current->tss.v86mask); + set_flags(VFLAGS, flags, current->thread.v86mask); set_flags(regs->eflags, flags, SAFE_MASK); if (flags & IF_MASK) 
set_IF(regs); @@ -315,7 +319,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) if (VEFLAGS & VIF_MASK) flags |= IF_MASK; - return flags | (VEFLAGS & current->tss.v86mask); + return flags | (VEFLAGS & current->thread.v86mask); } static inline int is_revectored(int nr, struct revectored_struct * bitmap) @@ -447,8 +451,8 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno spin_unlock_irqrestore(&current->sigmask_lock, flags); } send_sig(SIGTRAP, current, 1); - current->tss.trap_no = trapno; - current->tss.error_code = error_code; + current->thread.trap_no = trapno; + current->thread.error_code = error_code; return 0; } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c3e423b216ef..3fbf22a4bc7b 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -177,7 +177,7 @@ good_area: if (regs->eflags & VM_MASK) { unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->tss.screen_bitmap |= 1 << bit; + tsk->thread.screen_bitmap |= 1 << bit; } up(&mm->mmap_sem); return; @@ -191,9 +191,9 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGSEGV, tsk); return; } @@ -243,9 +243,11 @@ no_context: else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); + printk(" printing eip:\n"); + printk("%08lx\n", regs->eip); __asm__("movl %%cr3,%0" : "=r" (page)); - printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n", - tsk->tss.cr3, page); + printk(KERN_ALERT "current->thread.cr3 = %08lx, %%cr3 = %08lx\n", + tsk->thread.cr3, page); page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); if (page & 1) { @@ -275,9 +277,9 @@ do_sigbus: * Send a sigbus, regardless of whether we 
were in kernel * or user mode. */ - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGBUS, tsk); /* Kernel mode? Handle exceptions or die */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 703b8ca87cd5..e59b6d1c55ad 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -181,34 +181,6 @@ extern unsigned long free_area_init(unsigned long, unsigned long); extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c index 881e521ad245..2ad48cd2e370 100644 --- a/drivers/char/ppdev.c +++ b/drivers/char/ppdev.c @@ -342,7 +342,6 @@ static int pp_ioctl(struct inode *inode, struct file *file, return 0; } - port = pp_table[portnum][dev].pdev->port; if (cmd == PPEXCL) { if (pp_table[portnum][dev].pdev) { printk (KERN_DEBUG CHRDEV "%02x: too late for PPEXCL; " @@ -368,6 +367,7 @@ static int pp_ioctl(struct inode *inode, struct file *file, return -EPERM; } + port = pp_table[portnum][dev].pdev->port; switch (cmd) { unsigned char reg; unsigned char mask; @@ -535,6 +535,7 @@ int pp_init (void) return -EIO; } + memset (pp_table, 0, sizeof (pp_table)); printk (KERN_INFO PP_VERSION "\n"); return 0; } diff --git a/drivers/scsi/aic7xxx.c b/drivers/scsi/aic7xxx.c index fe4569c1e65c..a4182e7336e8 100644 --- a/drivers/scsi/aic7xxx.c +++ b/drivers/scsi/aic7xxx.c @@ -242,7 +242,6 @@ #include #include #include -#include #include "sd.h" #include "scsi.h" #include "hosts.h" @@ -270,7 +269,7 @@ struct proc_dir_entry proc_scsi_aic7xxx = { 0, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; -#define AIC7XXX_C_VERSION "5.1.18" +#define AIC7XXX_C_VERSION "5.1.17" #define NUMBER(arr) (sizeof(arr) / sizeof(arr[0])) #define MIN(a,b) (((a) < (b)) ? (a) : (b)) @@ -7186,7 +7185,7 @@ read_284x_seeprom(struct aic7xxx_host *p, struct seeprom_config *sc) static int acquire_seeprom(struct aic7xxx_host *p) { - int count=0; + int wait; /* * Request access of the memory port. When access is @@ -7196,10 +7195,11 @@ acquire_seeprom(struct aic7xxx_host *p) * should be no contention. 
*/ aic_outb(p, SEEMS, SEECTL); - while( ((aic_inb(p, SEECTL) & SEERDY) == 0) && count < 1000) { - mb(); - udelay(1); - count++; + wait = 1000; /* 1000 msec = 1 second */ + while ((wait > 0) && ((aic_inb(p, SEECTL) & SEERDY) == 0)) + { + wait--; + mdelay(1); /* 1 msec */ } if ((aic_inb(p, SEECTL) & SEERDY) == 0) { @@ -7411,78 +7411,73 @@ read_seeprom(struct aic7xxx_host *p, int offset, /*+F************************************************************************* * Function: - * read_brdctl + * write_brdctl * * Description: - * Reads the BRDCTL register. + * Writes a value to the BRDCTL register. *-F*************************************************************************/ -static unsigned char -read_brdctl(struct aic7xxx_host *p) +static void +write_brdctl(struct aic7xxx_host *p, unsigned char value) { unsigned char brdctl; if ((p->chip & AHC_CHIPID_MASK) == AHC_AIC7895) { - brdctl = BRDRW; + brdctl = BRDSTB; if (p->flags & AHC_CHNLB) brdctl |= BRDCS; } else if (p->features & AHC_ULTRA2) - brdctl = BRDRW_ULTRA2; + brdctl = 0; else - brdctl = BRDRW | BRDCS; + brdctl = BRDSTB | BRDCS; + aic_outb(p, brdctl, BRDCTL); + udelay(1); + brdctl |= value; + aic_outb(p, brdctl, BRDCTL); + udelay(1); + if (p->features & AHC_ULTRA2) + brdctl |= BRDSTB_ULTRA2; + else + brdctl &= ~BRDSTB; aic_outb(p, brdctl, BRDCTL); - udelay(10); - return (aic_inb(p, BRDCTL)); + udelay(1); + if (p->features & AHC_ULTRA2) + brdctl = 0; + else + brdctl &= ~BRDCS; + aic_outb(p, brdctl, BRDCTL); + udelay(1); } /*+F************************************************************************* * Function: - * write_brdctl + * read_brdctl * * Description: - * Writes a value to the BRDCTL register. + * Reads the BRDCTL register. 
*-F*************************************************************************/ -static void -write_brdctl(struct aic7xxx_host *p, unsigned char value) +static unsigned char +read_brdctl(struct aic7xxx_host *p) { - unsigned char brdctl; + unsigned char brdctl, value; if ((p->chip & AHC_CHIPID_MASK) == AHC_AIC7895) { - brdctl = BRDSTB; + brdctl = BRDRW; if (p->flags & AHC_CHNLB) brdctl |= BRDCS; - aic_outb(p, brdctl, BRDCTL); - udelay(4); - brdctl |= value; } else if (p->features & AHC_ULTRA2) - { - brdctl = value; - } - else - { - brdctl = BRDSTB | BRDCS; - aic_outb(p, brdctl, BRDCTL); - udelay(4); - brdctl |= value; - } - aic_outb(p, brdctl, BRDCTL); - udelay(4); - if (p->features & AHC_ULTRA2) - brdctl |= BRDSTB_ULTRA2; - else - brdctl &= ~BRDSTB; - aic_outb(p, brdctl, BRDCTL); - udelay(4); - if (p->features & AHC_ULTRA2) - brdctl &= ~BRDSTB_ULTRA2; + brdctl = BRDRW_ULTRA2; else - brdctl &= ~BRDCS; + brdctl = BRDRW | BRDCS; aic_outb(p, brdctl, BRDCTL); - udelay(4); - read_brdctl(p); + udelay(1); + value = aic_inb(p, BRDCTL); + aic_outb(p, 0, BRDCTL); + udelay(1); + return (value); } /*+F************************************************************************* @@ -7499,10 +7494,11 @@ aic785x_cable_detect(struct aic7xxx_host *p, int *int_50, unsigned char brdctl; aic_outb(p, BRDRW | BRDCS, BRDCTL); - udelay(4); + udelay(1); aic_outb(p, 0, BRDCTL); - udelay(4); + udelay(1); brdctl = aic_inb(p, BRDCTL); + udelay(1); *int_50 = !(brdctl & BRDDAT5); *ext_present = !(brdctl & BRDDAT6); *eeprom = (aic_inb(p, SPIOCAP) & EEPROM); @@ -7611,7 +7607,6 @@ configure_termination(struct aic7xxx_host *p) else max_target = 8; aic_outb(p, SEEMS | SEECS, SEECTL); - udelay(4); sxfrctl1 &= ~STPWEN; if ( (p->adapter_control & CFAUTOTERM) || (p->features & AHC_ULTRA2) ) @@ -7736,14 +7731,6 @@ configure_termination(struct aic7xxx_host *p) p->host_no); } - if (enableLVD_high != 0) - { - brddat |= BRDDAT4; - if (aic7xxx_verbose & VERBOSE_PROBE2) - printk(KERN_INFO "(scsi%d) LVD High byte 
termination Enabled\n", - p->host_no); - } - if (enableLVD_low != 0) { sxfrctl1 |= STPWEN; @@ -7752,17 +7739,17 @@ configure_termination(struct aic7xxx_host *p) printk(KERN_INFO "(scsi%d) LVD Low byte termination Enabled\n", p->host_no); } - } - else - { - if (p->adapter_control & CFWSTERM) + + if (enableLVD_high != 0) { - brddat |= BRDDAT6; + brddat |= BRDDAT4; if (aic7xxx_verbose & VERBOSE_PROBE2) - printk(KERN_INFO "(scsi%d) SE High byte termination Enabled\n", + printk(KERN_INFO "(scsi%d) LVD High byte termination Enabled\n", p->host_no); } - + } + else + { if (p->adapter_control & CFSTERM) { if (p->features & AHC_ULTRA2) @@ -7773,10 +7760,18 @@ configure_termination(struct aic7xxx_host *p) printk(KERN_INFO "(scsi%d) SE Low byte termination Enabled\n", p->host_no); } + + if (p->adapter_control & CFWSTERM) + { + brddat |= BRDDAT6; + if (aic7xxx_verbose & VERBOSE_PROBE2) + printk(KERN_INFO "(scsi%d) SE High byte termination Enabled\n", + p->host_no); + } } - aic_outb(p, sxfrctl1, SXFRCTL1); write_brdctl(p, brddat); release_seeprom(p); + aic_outb(p, sxfrctl1, SXFRCTL1); } } @@ -8090,7 +8085,7 @@ aic7xxx_register(Scsi_Host_Template *template, struct aic7xxx_host *p, /* Select channel B */ aic_outb(p, aic_inb(p, SBLKCTL) | SELBUSB, SBLKCTL); - term = (aic_inb(p, SXFRCTL1) & STPWEN); + term = ((p->flags & AHC_TERM_ENB_B) != 0) ? STPWEN : 0; aic_outb(p, p->scsi_id_b, SCSIID); scsi_conf = aic_inb(p, SCSICONF + 1); aic_outb(p, DFON | SPIOEN, SXFRCTL0); @@ -8104,15 +8099,11 @@ aic7xxx_register(Scsi_Host_Template *template, struct aic7xxx_host *p, aic_outb(p, aic_inb(p, SBLKCTL) & ~SELBUSB, SBLKCTL); } + term = ((p->flags & AHC_TERM_ENB_SE_LOW) != 0) ? 
STPWEN : 0; if (p->features & AHC_ULTRA2) - { aic_outb(p, p->scsi_id, SCSIID_ULTRA2); - } else - { aic_outb(p, p->scsi_id, SCSIID); - } - term = (aic_inb(p, SXFRCTL1) & STPWEN); scsi_conf = aic_inb(p, SCSICONF); aic_outb(p, DFON | SPIOEN, SXFRCTL0); aic_outb(p, (scsi_conf & ENSPCHK) | STIMESEL | term | @@ -8802,33 +8793,27 @@ aic7xxx_load_seeprom(struct aic7xxx_host *p, unsigned char *sxfrctl1) } if (p->flags & AHC_NEWEEPROM_FMT) { - if ( !(p->features & AHC_ULTRA2) ) + if ( (sc->device_flags[i] & CFNEWULTRAFORMAT) && + !(p->features & AHC_ULTRA2) ) { /* * I know of two different Ultra BIOSes that do this differently. * One on the Gigabyte 6BXU mb that wants flags[i] & CFXFER to - * be == to 0x03 and SYNCHISULTRA to be true to mean 40MByte/s + * be == to 0x03 and SYNCISULTRA to be true to mean 40MByte/s * while on the IBM Netfinity 5000 they want the same thing * to be something else, while flags[i] & CFXFER == 0x03 and - * SYNCHISULTRA false should be 40MByte/s. So, we set both to + * SYNCISULTRA false should be 40MByte/s. So, we set both to * 40MByte/s and the lower speeds be damned. People will have * to select around the conversely mapped lower speeds in order * to select lower speeds on these boards. 
*/ - if ( (sc->device_flags[i] & CFNEWULTRAFORMAT) && - ((sc->device_flags[i] & CFXFER) == 0x03) ) + if ((sc->device_flags[i] & (CFXFER)) == 0x03) { sc->device_flags[i] &= ~CFXFER; sc->device_flags[i] |= CFSYNCHISULTRA; } - if (sc->device_flags[i] & CFSYNCHISULTRA) - { - p->ultraenb |= mask; - } } - else if ( !(sc->device_flags[i] & CFNEWULTRAFORMAT) && - (p->features & AHC_ULTRA2) && - (sc->device_flags[i] & CFSYNCHISULTRA) ) + if (sc->device_flags[i] & CFSYNCHISULTRA) { p->ultraenb |= mask; } diff --git a/drivers/scsi/aic7xxx/aic7xxx.reg b/drivers/scsi/aic7xxx/aic7xxx.reg index 6cc347d5b82b..7f348aa9393a 100644 --- a/drivers/scsi/aic7xxx/aic7xxx.reg +++ b/drivers/scsi/aic7xxx/aic7xxx.reg @@ -845,7 +845,7 @@ register CRCCONTROL1 { bit CRCENDCHKEN 0x20 /* CRC End Check Enable */ bit CRCREQCHKEN 0x10 bit TARGCRCENDEN 0x08 /* Enable End CRC transfer when target */ - bit TARGCRCCNTEN 0x04 /* Enable CRC transfer when target */ + bit TARGCRCCNTEN 0x40 /* Enable CRC transfer when target */ } /* diff --git a/drivers/scsi/aic7xxx_reg.h b/drivers/scsi/aic7xxx_reg.h index f86e1bec8a26..b42750864288 100644 --- a/drivers/scsi/aic7xxx_reg.h +++ b/drivers/scsi/aic7xxx_reg.h @@ -459,11 +459,11 @@ #define CRCCONTROL1 0x9d #define CRCONSEEN 0x80 +#define TARGCRCCNTEN 0x40 #define CRCVALCHKEN 0x40 #define CRCENDCHKEN 0x20 #define CRCREQCHKEN 0x10 #define TARGCRCENDEN 0x08 -#define TARGCRCCNTEN 0x04 #define SCSIPHASE 0x9e #define SP_STATUS 0x20 diff --git a/drivers/sound/es1370.c b/drivers/sound/es1370.c index 7c2093a43911..c667b2106734 100644 --- a/drivers/sound/es1370.c +++ b/drivers/sound/es1370.c @@ -100,6 +100,7 @@ * Guenter Geiger * 15.06.99 0.23 Fix bad allocation bug. * Thanks to Deti Fliegl + * 28.06.99 0.24 Add pci_set_master * * some important things missing in Ensoniq documentation: * @@ -2320,7 +2321,7 @@ int __init init_es1370(void) if (!pci_present()) /* No PCI bus in this machine! 
*/ return -ENODEV; - printk(KERN_INFO "es1370: version v0.23 time " __TIME__ " " __DATE__ "\n"); + printk(KERN_INFO "es1370: version v0.24 time " __TIME__ " " __DATE__ "\n"); while (index < NR_DEVICE && (pcidev = pci_find_device(PCI_VENDOR_ID_ENSONIQ, PCI_DEVICE_ID_ENSONIQ_ES1370, pcidev))) { if (pcidev->base_address[0] == 0 || @@ -2384,6 +2385,7 @@ int __init init_es1370(void) /* initialize the chips */ outl(s->ctrl, s->io+ES1370_REG_CONTROL); outl(s->sctrl, s->io+ES1370_REG_SERIAL_CONTROL); + pci_set_master(pcidev); /* enable bus mastering */ wrcodec(s, 0x16, 3); /* no RST, PD */ wrcodec(s, 0x17, 0); /* CODEC ADC and CODEC DAC use {LR,B}CLK2 and run off the LRCLK2 PLL; program DAC_SYNC=0!! */ wrcodec(s, 0x18, 0); /* recording source is mixer */ diff --git a/drivers/sound/es1371.c b/drivers/sound/es1371.c index 80812de15371..080ebaca628e 100644 --- a/drivers/sound/es1371.c +++ b/drivers/sound/es1371.c @@ -67,6 +67,7 @@ * other than i386 * 15.06.99 0.12 Fix bad allocation bug. * Thanks to Deti Fliegl + * 28.06.99 0.13 Add pci_set_master * */ @@ -2735,7 +2736,7 @@ int __init init_es1371(void) if (!pci_present()) /* No PCI bus in this machine! 
*/ return -ENODEV; - printk(KERN_INFO "es1371: version v0.12 time " __TIME__ " " __DATE__ "\n"); + printk(KERN_INFO "es1371: version v0.13 time " __TIME__ " " __DATE__ "\n"); while (index < NR_DEVICE && (pcidev = pci_find_device(PCI_VENDOR_ID_ENSONIQ, PCI_DEVICE_ID_ENSONIQ_ES1371, pcidev))) { if (pcidev->base_address[0] == 0 || @@ -2792,6 +2793,7 @@ int __init init_es1371(void) outl(s->ctrl, s->io+ES1371_REG_CONTROL); outl(s->sctrl, s->io+ES1371_REG_SERIAL_CONTROL); outl(0, s->io+ES1371_REG_LEGACY); + pci_set_master(pcidev); /* enable bus mastering */ /* AC97 warm reset to start the bitclk */ outl(s->ctrl | CTRL_SYNCRES, s->io+ES1371_REG_CONTROL); udelay(2); diff --git a/drivers/sound/sonicvibes.c b/drivers/sound/sonicvibes.c index 0e696b1134ae..8b8a2c188ac1 100644 --- a/drivers/sound/sonicvibes.c +++ b/drivers/sound/sonicvibes.c @@ -70,6 +70,7 @@ * Note: dmaio hack might still be wrong on archs other than i386 * 15.06.99 0.15 Fix bad allocation bug. * Thanks to Deti Fliegl + * 28.06.99 0.16 Add pci_set_master * */ @@ -2324,7 +2325,7 @@ int __init init_sonicvibes(void) if (!pci_present()) /* No PCI bus in this machine! */ return -ENODEV; - printk(KERN_INFO "sv: version v0.15 time " __TIME__ " " __DATE__ "\n"); + printk(KERN_INFO "sv: version v0.16 time " __TIME__ " " __DATE__ "\n"); #if 0 if (!(wavetable_mem = __get_free_pages(GFP_KERNEL, 20-PAGE_SHIFT))) printk(KERN_INFO "sv: cannot allocate 1MB of contiguous nonpageable memory for wavetable data\n"); @@ -2450,6 +2451,7 @@ int __init init_sonicvibes(void) goto err_dev3; if ((s->dev_dmfm = register_sound_special(&sv_dmfm_fops, 15 /* ?? 
*/)) < 0) goto err_dev4; + pci_set_master(pcidev); /* enable bus mastering */ /* initialize the chips */ fs = get_fs(); set_fs(KERNEL_DS); diff --git a/drivers/video/vgacon.c b/drivers/video/vgacon.c index 95a758dd1d1c..20348e9bf208 100644 --- a/drivers/video/vgacon.c +++ b/drivers/video/vgacon.c @@ -186,18 +186,21 @@ __initfunc(static const char *vgacon_startup(void)) vga_video_port_val = 0x3b5; if ((ORIG_VIDEO_EGA_BX & 0xff) != 0x10) { + static struct resource ega_console_resource = { "ega", 0x3B0, 0x3BF }; vga_video_type = VIDEO_TYPE_EGAM; vga_vram_end = 0xb8000; display_desc = "EGA+"; - request_region(0x3b0,16,"ega"); + request_resource(&pci_io_resource, &ega_console_resource); } else { + static struct resource mda1_console_resource = { "mda", 0x3B0, 0x3BB }; + static struct resource mda2_console_resource = { "mda", 0x3BF, 0x3BF }; vga_video_type = VIDEO_TYPE_MDA; vga_vram_end = 0xb2000; display_desc = "*MDA"; - request_region(0x3b0,12,"mda"); - request_region(0x3bf, 1,"mda"); + request_resource(&pci_io_resource, &mda1_console_resource); + request_resource(&pci_io_resource, &mda2_console_resource); vga_video_font_height = 14; } } @@ -214,13 +217,15 @@ __initfunc(static const char *vgacon_startup(void)) vga_vram_end = 0xc0000; if (!ORIG_VIDEO_ISVGA) { + static struct resource ega_console_resource = { "ega", 0x3C0, 0x3DF }; vga_video_type = VIDEO_TYPE_EGAC; display_desc = "EGA"; - request_region(0x3c0,32,"ega"); + request_resource(&pci_io_resource, &ega_console_resource); } else { + static struct resource vga_console_resource = { "vga+", 0x3C0, 0x3DF }; vga_video_type = VIDEO_TYPE_VGAC; display_desc = "VGA+"; - request_region(0x3c0,32,"vga+"); + request_resource(&pci_io_resource, &vga_console_resource); #ifdef VGA_CAN_DO_64KB /* @@ -261,10 +266,11 @@ __initfunc(static const char *vgacon_startup(void)) } else { + static struct resource cga_console_resource = { "cga", 0x3D4, 0x3D5 }; vga_video_type = VIDEO_TYPE_CGA; vga_vram_end = 0xba000; display_desc = "*CGA"; - 
request_region(0x3d4,2,"cga"); + request_resource(&pci_io_resource, &cga_console_resource); vga_video_font_height = 8; } } diff --git a/fs/exec.c b/fs/exec.c index be92bb62cbca..887f9145ade3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -366,7 +366,7 @@ end_readexec: static int exec_mmap(void) { struct mm_struct * mm, * old_mm; - int retval, nr; + int retval; if (atomic_read(&current->mm->count) == 1) { flush_cache_mm(current->mm); @@ -386,10 +386,9 @@ static int exec_mmap(void) mm->total_vm = 0; mm->rss = 0; /* - * Make sure we have a private ldt if needed ... + * Make sure we have a private LDT if needed ... */ - nr = current->tarray_ptr - &task[0]; - copy_segments(nr, current, mm); + copy_segments(current, mm); old_mm = current->mm; current->mm = mm; @@ -408,7 +407,7 @@ static int exec_mmap(void) fail_restore: current->mm = old_mm; /* restore the ldt for this task */ - copy_segments(nr, current, NULL); + copy_segments(current, NULL); release_segments(mm); kmem_cache_free(mm_cachep, mm); diff --git a/fs/proc/array.c b/fs/proc/array.c index 66108f9a79f0..e5671587d2a7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -223,7 +223,7 @@ static int get_loadavg(char * buffer) LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_tasks, last_pid); + nr_running, nr_threads, last_pid); } static int get_kstat(char * buffer) @@ -312,7 +312,7 @@ static int get_uptime(char * buffer) unsigned long idle; uptime = jiffies; - idle = task[0]->times.tms_utime + task[0]->times.tms_stime; + idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. 
@@ -495,7 +495,7 @@ static unsigned long get_wchan(struct task_struct *p) int count = 0; stack_page = (unsigned long)p; - esp = p->tss.esp; + esp = p->thread.esp; if (!stack_page || esp < stack_page || esp >= 8188+stack_page) return 0; /* include/asm-i386/system.h:switch_to() pushes ebp last. */ diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 195ca41b8144..4ea8d33cffc6 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -133,7 +133,7 @@ out: static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; - struct task_struct * p, **tarrayp; + struct task_struct *p, *tmp; unsigned int fd, pid, ino; int retval; char buf[NUMBUF]; @@ -157,7 +157,6 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) p = find_task_by_pid(pid); if (!p) goto out_unlock; - tarrayp = p->tarray_ptr; for (fd -= 2 ; p->files && fd < p->files->max_fds; fd++, filp->f_pos++) { @@ -182,8 +181,13 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; read_lock(&tasklist_lock); - /* filldir() might have slept, so we must re-validate "p" */ - if (p != *tarrayp || p->pid != pid) + /* + * filldir() might have slept, so we must + * re-validate "p". This is fast enough due + * to the pidhash + */ + tmp = find_task_by_pid(pid); + if (p != tmp) break; } out_unlock: diff --git a/fs/proc/root.c b/fs/proc/root.c index a6c05e91e955..bab64758c1c3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -849,14 +849,29 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr int len; if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... 
*/ - dir->i_nlink = proc_root.nlink; - - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pid) - dir->i_nlink++; + extern unsigned long total_forks; + static int last_timestamp = 0; + + /* + * this one can be a serious 'ps' performance problem if + * there are many threads running - thus we do 'lazy' + * link-recalculation - we change it only if the number + * of threads has increased. + */ + if (total_forks != last_timestamp) { + int nlink = proc_root.nlink; + + read_lock(&tasklist_lock); + last_timestamp = total_forks; + for_each_task(p) + nlink++; + read_unlock(&tasklist_lock); + /* + * subtract the # of idle threads which + * do not show up in /proc: + */ + dir->i_nlink = nlink - smp_num_cpus; } - read_unlock(&tasklist_lock); } if (!proc_lookup(dir, dentry)) diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index e91580e04ca7..31e267386415 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -1,6 +1,46 @@ #ifndef __ARCH_DESC_H #define __ARCH_DESC_H +#include + +/* + * The layout of the GDT under Linux: + * + * 0 - null + * 1 - not used + * 2 - kernel code segment + * 3 - kernel data segment + * 4 - user code segment <-- new cacheline + * 5 - user data segment + * 6 - not used + * 7 - not used + * 8 - APM BIOS support <-- new cacheline + * 9 - APM BIOS support + * 10 - APM BIOS support + * 11 - APM BIOS support + * + * The TSS+LDT descriptors are spread out a bit so that every CPU + * has an exclusive cacheline for the per-CPU TSS and LDT: + * + * 12 - CPU#0 TSS <-- new cacheline + * 13 - CPU#0 LDT + * 14 - not used + * 15 - not used + * 16 - CPU#1 TSS <-- new cacheline + * 17 - CPU#1 LDT + * 18 - not used + * 19 - not used + * ... NR_CPUS per-CPU TSS+LDT's if on SMP + * + * Entry into gdt where to find first TSS. 
+ */ +#define __FIRST_TSS_ENTRY 12 +#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY+1) + +#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) +#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) + +#ifndef __ASSEMBLY__ struct desc_struct { unsigned long a,b; }; @@ -16,46 +56,33 @@ struct Xgt_desc_struct { #define idt_descr (*(struct Xgt_desc_struct *)((char *)&idt - 2)) #define gdt_descr (*(struct Xgt_desc_struct *)((char *)&gdt - 2)) +#define load_TR(n) __asm__ __volatile__("ltr %%ax"::"a" (__TSS(n)<<3)) + +#define __load_LDT(n) __asm__ __volatile__("lldt %%ax"::"a" (__LDT(n)<<3)) + /* - * Entry into gdt where to find first TSS. GDT layout: - * 0 - null - * 1 - not used - * 2 - kernel code segment - * 3 - kernel data segment - * 4 - user code segment - * 5 - user data segment - * 6 - not used - * 7 - not used - * 8 - APM BIOS support - * 9 - APM BIOS support - * 10 - APM BIOS support - * 11 - APM BIOS support - * 12 - TSS #0 - * 13 - LDT #0 - * 14 - TSS #1 - * 15 - LDT #1 + * This is the ldt that every process will get unless we need + * something other than this. */ -#define FIRST_TSS_ENTRY 12 -#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1) -#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3)) -#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3)) -#define load_TR(n) __asm__ __volatile__("ltr %%ax": /* no output */ :"a" (_TSS(n))) -#define load_ldt(n) __asm__ __volatile__("lldt %%ax": /* no output */ :"a" (_LDT(n))) -#define store_TR(n) \ -__asm__("str %%ax\n\t" \ - "subl %2,%%eax\n\t" \ - "shrl $4,%%eax" \ - :"=a" (n) \ - :"0" (0),"i" (FIRST_TSS_ENTRY<<3)) - +extern struct desc_struct default_ldt; extern void set_intr_gate(unsigned int irq, void * addr); extern void set_ldt_desc(unsigned int n, void *addr, unsigned int size); extern void set_tss_desc(unsigned int n, void *addr); /* - * This is the ldt that every process will get unless we need - * something other than this. 
+ * load one particular LDT into the current CPU */ -extern struct desc_struct default_ldt; +extern inline void load_LDT (struct mm_struct *mm) +{ + int cpu = smp_processor_id(); + + if (mm->segments) + set_ldt_desc(cpu, mm->segments, LDT_ENTRIES); + else + set_ldt_desc(cpu, &default_ldt, 1); + __load_LDT(cpu); +} + +#endif /* !__ASSEMBLY__ */ #endif diff --git a/include/asm-i386/hardirq.h b/include/asm-i386/hardirq.h index 533961343360..25dbbc2c0e91 100644 --- a/include/asm-i386/hardirq.h +++ b/include/asm-i386/hardirq.h @@ -1,7 +1,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include +#include extern unsigned int local_irq_count[NR_CPUS]; diff --git a/include/asm-i386/ldt.h b/include/asm-i386/ldt.h index 55b75ca39a76..9d1110f9847c 100644 --- a/include/asm-i386/ldt.h +++ b/include/asm-i386/ldt.h @@ -11,6 +11,7 @@ /* The size of each LDT entry. */ #define LDT_ENTRY_SIZE 8 +#ifndef __ASSEMBLY__ struct modify_ldt_ldt_s { unsigned int entry_number; unsigned long base_addr; @@ -27,4 +28,5 @@ struct modify_ldt_ldt_s { #define MODIFY_LDT_CONTENTS_STACK 1 #define MODIFY_LDT_CONTENTS_CODE 2 +#endif /* !__ASSEMBLY__ */ #endif diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index e8b812e2f83d..0c9f5691acfb 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h @@ -1,13 +1,19 @@ #ifndef __I386_MMU_CONTEXT_H #define __I386_MMU_CONTEXT_H +#include + /* - * get a new mmu context.. x86's don't know about contexts. + * get a new mmu context.. x86's don't know much about contexts, + * but we have to reload the new LDT in exec(). */ -#define get_mmu_context(x) do { } while (0) +#define get_mmu_context(tsk) do { } while(0) #define init_new_context(mm) do { } while(0) +/* + * possibly do the LDT unload here? 
+ */ #define destroy_context(mm) do { } while(0) -#define activate_context(tsk) do { } while(0) +#define activate_context(x) load_LDT((x)->mm) #endif diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index b4c8d0e99585..1ba8f6766761 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -15,7 +15,7 @@ #ifndef __ASSEMBLY__ #include #include -#include +#include /* Caches aren't brain-dead on the intel. */ #define flush_cache_all() do { } while (0) @@ -306,7 +306,7 @@ extern pte_t * __bad_pagetable(void); #define SET_PAGE_DIR(tsk,pgdir) \ do { \ unsigned long __pgdir = __pa(pgdir); \ - (tsk)->tss.cr3 = __pgdir; \ + (tsk)->thread.cr3 = __pgdir; \ if ((tsk) == current) \ __asm__ __volatile__("movl %0,%%cr3": :"r" (__pgdir)); \ } while (0) @@ -481,9 +481,9 @@ extern __inline__ void free_pmd_slow(pmd_t *pmd) extern void __bad_pte(pmd_t *pmd); extern void __bad_pte_kernel(pmd_t *pmd); -#define pte_free_kernel(pte) free_pte_fast(pte) -#define pte_free(pte) free_pte_fast(pte) -#define pgd_free(pgd) free_pgd_fast(pgd) +#define pte_free_kernel(pte) free_pte_slow(pte) +#define pte_free(pte) free_pte_slow(pte) +#define pgd_free(pgd) free_pgd_slow(pgd) #define pgd_alloc() get_pgd_fast() extern inline pte_t * pte_alloc_kernel(pmd_t * pmd, unsigned long address) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 43c0a3673e25..f56bf6690c39 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * Default implementation of macro that returns current @@ -95,6 +96,7 @@ struct cpuinfo_x86 { #define X86_FEATURE_AMD3D 0x80000000 extern struct cpuinfo_x86 boot_cpu_data; +extern struct hard_thread_struct init_tss[NR_CPUS]; #ifdef __SMP__ extern struct cpuinfo_x86 cpu_data[]; @@ -124,6 +126,48 @@ extern inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) : "cc"); } + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x0001 
/* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. + */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4 (unsigned long mask) +{ + mmu_cr4_features |= mask; + __asm__("movl %%cr4,%%eax\n\t" + "orl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (mask) + :"ax"); +} + +static inline void clear_in_cr4 (unsigned long mask) +{ + mmu_cr4_features &= ~mask; + __asm__("movl %%cr4,%%eax\n\t" + "andl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (~mask) + :"ax"); +} + /* * Cyrix CPU configuration register indexes */ @@ -177,6 +221,8 @@ extern unsigned int mca_pentium_flag; * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. 
*/ #define IO_BITMAP_SIZE 32 +#define IO_BITMAP_OFFSET offsetof(struct hard_thread_struct,io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 struct i387_hard_struct { long cwd; @@ -213,7 +259,7 @@ typedef struct { unsigned long seg; } mm_segment_t; -struct thread_struct { +struct hard_thread_struct { unsigned short back_link,__blh; unsigned long esp0; unsigned short ss0,__ss0h; @@ -238,19 +284,44 @@ struct thread_struct { unsigned short ldt, __ldth; unsigned short trace, bitmap; unsigned long io_bitmap[IO_BITMAP_SIZE+1]; - unsigned long tr; + /* + * pads the TSS to be cacheline-aligned (size is 0x100) + */ + unsigned long __cacheline_filler[5]; +}; + +struct soft_thread_struct { + unsigned long esp0; + unsigned long cr3; + unsigned long eip; + unsigned long esp; + unsigned long fs; + unsigned long gs; +/* Hardware debugging registers */ + unsigned long debugreg[8]; /* %%db0-7 debug registers */ +/* fault info */ unsigned long cr2, trap_no, error_code; - mm_segment_t segment; -/* debug registers */ - long debugreg[8]; /* Hardware debugging registers */ /* floating point info */ - union i387_union i387; + union i387_union i387; /* virtual 86 mode info */ - struct vm86_struct * vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags, v86mask, v86mode, saved_esp0; + struct vm86_struct * vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, v86mode, saved_esp0; +/* IO permissions */ + int ioperm; + unsigned long io_bitmap[IO_BITMAP_SIZE+1]; }; +#define INIT_THREAD { \ + 0,(long) &swapper_pg_dir - PAGE_OFFSET, \ + 0, 0, 0, 0, \ + { [0 ... 
7] = 0 }, /* debugging registers */ \ + 0, 0, 0, \ + { { 0, }, }, /* 387 state */ \ + 0,0,0,0,0,0, \ + 0,{~0,} /* io permissions */ \ +} + #define INIT_MMAP \ { &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } @@ -265,13 +336,9 @@ struct thread_struct { 0,0,0,0, /* esp,ebp,esi,edi */ \ 0,0,0,0,0,0, /* es,cs,ss */ \ 0,0,0,0,0,0, /* ds,fs,gs */ \ - _LDT(0),0, /* ldt */ \ - 0, 0x8000, /* tace, bitmap */ \ - {~0, }, /* ioperm */ \ - _TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */ \ - { 0, }, \ - { { 0, }, }, /* 387 state */ \ - NULL, 0, 0, 0, 0, 0, /* vm86_info */ \ + __LDT(0),0, /* ldt */ \ + 0, INVALID_IO_BITMAP_OFFSET, /* tace, bitmap */ \ + {~0, } /* ioperm */ \ } #define start_thread(regs, new_eip, new_esp) do { \ @@ -291,10 +358,13 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); +/* + * create a kernel thread without removing it from tasklists + */ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); /* Copy and release all segment info associated with a VM */ -extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm); +extern void copy_segments(struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); extern void forget_segments(void); @@ -302,7 +372,7 @@ extern void forget_segments(void); * FPU lazy state save handling.. */ #define save_fpu(tsk) do { \ - asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \ + asm volatile("fnsave %0\n\tfwait":"=m" (tsk->thread.i387)); \ tsk->flags &= ~PF_USEDFPU; \ stts(); \ } while (0) @@ -322,11 +392,12 @@ extern void forget_segments(void); /* * Return saved PC of a blocked thread. 
*/ -extern inline unsigned long thread_saved_pc(struct thread_struct *t) +extern inline unsigned long thread_saved_pc(struct soft_thread_struct *t) { return ((unsigned long *)t->esp)[3]; } +#define THREAD_SIZE (2*PAGE_SIZE) extern struct task_struct * alloc_task_struct(void); extern void free_task_struct(struct task_struct *); diff --git a/include/asm-i386/resource.h b/include/asm-i386/resource.h index e7e2d115996c..1bc94c0e499a 100644 --- a/include/asm-i386/resource.h +++ b/include/asm-i386/resource.h @@ -28,7 +28,7 @@ { _STK_LIM, LONG_MAX }, \ { 0, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ - { MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \ + { 0, 0 }, \ { NR_OPEN, NR_OPEN }, \ { LONG_MAX, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index ec24476aed69..b8a150aa7575 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -16,7 +16,7 @@ #ifdef __SMP__ #ifndef ASSEMBLY -#include +#include #include /* diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 147ad6da4844..34305de8cb17 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -22,9 +22,9 @@ extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *n "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ - :"=m" (prev->tss.esp),"=m" (prev->tss.eip), \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ "=b" (last) \ - :"m" (next->tss.esp),"m" (next->tss.eip), \ + :"m" (next->thread.esp),"m" (next->thread.eip), \ "a" (prev), "d" (next), \ "b" (prev)); \ } while (0) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index cd584c1af250..8c58edf5edf5 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -3,7 +3,7 @@ #include #include -#include +#include /* * 'kernel_stat.h' contains the definitions needed for doing diff --git a/include/linux/list.h b/include/linux/list.h index e77559a68fda..656aacc2a0dd 100644 --- a/include/linux/list.h +++ 
b/include/linux/list.h @@ -17,8 +17,10 @@ struct list_head { struct list_head *next, *prev; }; +#define LIST_HEAD_INIT(name) { &(name), &(name) } + #define LIST_HEAD(name) \ - struct list_head name = { &name, &name } + struct list_head name = LIST_HEAD_INIT(name) #define INIT_LIST_HEAD(ptr) do { \ (ptr)->next = (ptr); (ptr)->prev = (ptr); \ @@ -48,6 +50,14 @@ static __inline__ void list_add(struct list_head *new, struct list_head *head) __list_add(new, head, head->next); } +/* + * Insert a new entry before the specified head.. + */ +static __inline__ void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + /* * Delete a list entry by making the prev/next entries * point to each other. diff --git a/include/linux/sched.h b/include/linux/sched.h index df3753dbed66..294a3f69a2f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -7,7 +7,7 @@ extern unsigned long event; #include #include -#include +#include #include #include #include @@ -63,7 +63,7 @@ extern unsigned long avenrun[]; /* Load averages */ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_tasks; +extern int nr_running, nr_threads; extern int last_pid; #include @@ -119,6 +119,7 @@ extern spinlock_t runqueue_lock; extern void sched_init(void); extern void init_idle(void); extern void show_state(void); +extern void cpu_init (void); extern void trap_init(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -243,7 +244,7 @@ struct task_struct { int last_processor; int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */ struct task_struct *next_task, *prev_task; - struct task_struct *next_run, *prev_run; + struct list_head run_list; /* task state */ struct linux_binfmt *binfmt; @@ -270,9 +271,6 @@ struct task_struct { struct task_struct *pidhash_next; struct task_struct **pidhash_pprev; - /* Pointer to task[] array linkage. 
*/ - struct task_struct **tarray_ptr; - wait_queue_head_t wait_chldexit; /* for wait4() */ struct semaphore *vfork_sem; /* for vfork() */ unsigned long policy, rt_priority; @@ -302,8 +300,8 @@ struct task_struct { /* ipc stuff */ struct sem_undo *semundo; struct sem_queue *semsleeping; -/* tss for this task */ - struct thread_struct tss; +/* CPU-specific state of this task */ + struct soft_thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ @@ -355,13 +353,12 @@ struct task_struct { /* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \ /* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \ /* SMP */ 0,0,0,-1, \ -/* schedlink */ &init_task,&init_task, &init_task, &init_task, \ +/* schedlink */ &init_task,&init_task, LIST_HEAD_INIT(init_task.run_list), \ /* binfmt */ NULL, \ /* ec,brk... */ 0,0,0,0,0,0, \ /* pid etc.. */ 0,0,0,0,0, \ /* proc links*/ &init_task,&init_task,NULL,NULL,NULL, \ /* pidhash */ NULL, NULL, \ -/* tarray */ &task[0], \ /* chld wait */ __WAIT_QUEUE_HEAD_INITIALIZER(name.wait_chldexit), NULL, \ /* timeout */ SCHED_OTHER,0,0,0,0,0,0,0, \ /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \ @@ -379,7 +376,7 @@ struct task_struct { /* comm */ "swapper", \ /* fs info */ 0,NULL, \ /* ipc */ NULL, NULL, \ -/* tss */ INIT_TSS, \ +/* thread */ INIT_THREAD, \ /* fs */ &init_fs, \ /* files */ &init_files, \ /* mm */ &init_mm, \ @@ -398,33 +395,10 @@ union task_union { extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *task[NR_TASKS]; - -extern struct task_struct **tarray_freelist; -extern spinlock_t taskslot_lock; - -extern __inline__ void add_free_taskslot(struct task_struct **t) -{ - spin_lock(&taskslot_lock); - *t = (struct task_struct *) tarray_freelist; - tarray_freelist = t; - spin_unlock(&taskslot_lock); -} - -extern __inline__ struct task_struct **get_free_taskslot(void) -{ - struct task_struct **tslot; - - spin_lock(&taskslot_lock); - if((tslot = 
tarray_freelist) != NULL) - tarray_freelist = (struct task_struct **) *tslot; - spin_unlock(&taskslot_lock); - - return tslot; -} +extern struct task_struct *init_tasks[NR_CPUS]; -/* PID hashing. */ -#define PIDHASH_SZ (NR_TASKS >> 2) +/* PID hashing. (shouldnt this be dynamic?) */ +#define PIDHASH_SZ (4096 >> 2) extern struct task_struct *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) @@ -741,6 +715,29 @@ do { \ #define for_each_task(p) \ for (p = &init_task ; (p = p->next_task) != &init_task ; ) + +static inline void del_from_runqueue(struct task_struct * p) +{ + nr_running--; + list_del(&p->run_list); + p->run_list.next = NULL; +} + +extern inline int task_on_runqueue(struct task_struct *p) +{ + return (p->run_list.next != NULL); +} + +extern inline void unhash_process(struct task_struct *p) +{ + if (task_on_runqueue(p)) BUG(); + nr_threads--; + write_lock_irq(&tasklist_lock); + unhash_pid(p); + REMOVE_LINKS(p); + write_unlock_irq(&tasklist_lock); +} + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 5d9eedc1d12b..8297db00d6a6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -100,7 +100,8 @@ enum KERN_SHMMAX=34, /* int: Maximum shared memory segment */ KERN_MSGMAX=35, /* int: Maximum size of a messege */ KERN_MSGMNB=36, /* int: Maximum message queue size */ - KERN_MSGPOOL=37 /* int: Maximum system message pool size */ + KERN_MSGPOOL=37, /* int: Maximum system message pool size */ + KERN_MAX_THREADS=38 /* int: Maximum nr of threads in the system */ }; diff --git a/include/linux/tasks.h b/include/linux/tasks.h deleted file mode 100644 index 91b758f49681..000000000000 --- a/include/linux/tasks.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _LINUX_TASKS_H -#define _LINUX_TASKS_H - -/* - * This is the maximum nr of tasks - change it if you need to - */ - -#ifdef __SMP__ -#define NR_CPUS 32 /* Max processors that can be running in SMP */ -#else -#define NR_CPUS 1 -#endif 
- -#define NR_TASKS 512 /* On x86 Max 4092, or 4090 w/APM configured. */ - -#define MAX_TASKS_PER_USER (NR_TASKS/2) -#define MIN_TASKS_LEFT_FOR_ROOT 4 - - -/* - * This controls the maximum pid allocated to a process - */ -#define PID_MAX 0x8000 - -#endif diff --git a/include/linux/threads.h b/include/linux/threads.h new file mode 100644 index 000000000000..f1f502eb6614 --- /dev/null +++ b/include/linux/threads.h @@ -0,0 +1,22 @@ +#ifndef _LINUX_THREADS_H +#define _LINUX_THREADS_H + +/* + * The default limit for the nr of threads is now in + * /proc/sys/kernel/max-threads. + */ + +#ifdef __SMP__ +#define NR_CPUS 32 /* Max processors that can be running in SMP */ +#else +#define NR_CPUS 1 +#endif + +#define MIN_THREADS_LEFT_FOR_ROOT 4 + +/* + * This controls the maximum pid allocated to a process + */ +#define PID_MAX 0x8000 + +#endif diff --git a/init/main.c b/init/main.c index 7141a9152608..431407c8b350 100644 --- a/init/main.c +++ b/init/main.c @@ -75,7 +75,7 @@ extern void init_IRQ(void); extern void init_modules(void); extern long console_init(long, long); extern void sock_init(void); -extern void uidcache_init(void); +extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); extern void ppc_init(void); @@ -1187,7 +1187,7 @@ asmlinkage void __init start_kernel(void) #ifdef CONFIG_PROC_FS proc_root_init(); #endif - uidcache_init(); + fork_init(memory_end-memory_start); filescache_init(); dcache_init(); vma_init(); diff --git a/kernel/exit.c b/kernel/exit.c index c4b2c4eb94d9..a0277d6dc977 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -27,29 +27,15 @@ static void release(struct task_struct * p) if (p != current) { #ifdef __SMP__ /* - * Wait to make sure the process isn't active on any - * other CPU + * Wait to make sure the process isn't on the + * runqueue (active on some other CPU still) */ - for (;;) { - int has_cpu; - spin_lock_irq(&runqueue_lock); - has_cpu = p->has_cpu; - spin_unlock_irq(&runqueue_lock); - if 
(!has_cpu) - break; - do { - barrier(); - } while (p->has_cpu); - } + do { + barrier(); + } while (p->has_cpu); #endif free_uid(p); - nr_tasks--; - add_free_taskslot(p->tarray_ptr); - - write_lock_irq(&tasklist_lock); - unhash_pid(p); - REMOVE_LINKS(p); - write_unlock_irq(&tasklist_lock); + unhash_process(p); release_thread(p); current->cmin_flt += p->min_flt + p->cmin_flt; diff --git a/kernel/fork.c b/kernel/fork.c index 12c580852928..e7c2e3b630f4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -22,11 +22,12 @@ #include #include -/* The idle tasks do not count.. */ -int nr_tasks=0; +/* The idle threads do not count.. */ +int nr_threads=0; int nr_running=0; -unsigned long int total_forks=0; /* Handle normal Linux uptimes. */ +int max_threads; +unsigned long total_forks = 0; /* Handle normal Linux uptimes. */ int last_pid=0; /* SLAB cache for mm_struct's. */ @@ -37,9 +38,6 @@ kmem_cache_t *files_cachep; struct task_struct *pidhash[PIDHASH_SZ]; -struct task_struct **tarray_freelist = NULL; -spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED; - /* UID task count cache, to prevent walking entire process list every * single fork() operation. */ @@ -159,7 +157,7 @@ int alloc_uid(struct task_struct *p) return 0; } -void __init uidcache_init(void) +void __init fork_init(unsigned long memsize) { int i; @@ -171,15 +169,16 @@ void __init uidcache_init(void) for(i = 0; i < UIDHASH_SZ; i++) uidhash[i] = 0; -} -static inline struct task_struct ** find_empty_process(void) -{ - struct task_struct **tslot = NULL; + /* + * The default maximum number of threads is set to a safe + * value: the thread structures can take up at most half + * of memory. + */ + max_threads = memsize / THREAD_SIZE / 2; - if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) || !current->uid) - tslot = get_free_taskslot(); - return tslot; + init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } /* Protects next_safe and last_pid. 
*/ @@ -358,7 +357,7 @@ void mmput(struct mm_struct *mm) } } -static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk) +static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { struct mm_struct * mm; int retval; @@ -370,9 +369,10 @@ static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct if (clone_flags & CLONE_VM) { mmget(current->mm); /* - * Set up the LDT descriptor for the clone task. + * No need to worry about the LDT descriptor for the + * cloned task, LDTs get magically loaded at + * __switch_to time if necessary. */ - copy_segments(nr, tsk, NULL); SET_PAGE_DIR(tsk, current->mm->pgd); return 0; } @@ -383,7 +383,11 @@ static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct goto fail_nomem; tsk->mm = mm; - copy_segments(nr, tsk, mm); + /* + * child gets a private LDT (if there was an LDT in the parent) + */ + copy_segments(tsk, mm); + retval = new_page_tables(tsk); if (retval) goto free_mm; @@ -542,7 +546,6 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) */ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { - int nr; int retval = -ENOMEM; struct task_struct *p; DECLARE_MUTEX_LOCKED(sem); @@ -565,15 +568,12 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) atomic_inc(&p->user->count); } - { - struct task_struct **tslot; - tslot = find_empty_process(); - if (!tslot) - goto bad_fork_cleanup_count; - p->tarray_ptr = tslot; - *tslot = p; - nr = tslot - &task[0]; - } + /* + * Counter atomicity is protected by + * the kernel lock + */ + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; if (p->exec_domain && p->exec_domain->module) __MOD_INC_USE_COUNT(p->exec_domain->module); @@ -594,8 +594,8 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) * very end). 
*/ p->state = TASK_RUNNING; - p->next_run = p; - p->prev_run = p; + p->run_list.next = NULL; + p->run_list.prev = NULL; p->p_pptr = p->p_opptr = current; p->p_cptr = NULL; @@ -638,9 +638,9 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) goto bad_fork_cleanup_files; if (copy_sighand(clone_flags, p)) goto bad_fork_cleanup_fs; - if (copy_mm(nr, clone_flags, p)) + if (copy_mm(clone_flags, p)) goto bad_fork_cleanup_sighand; - retval = copy_thread(nr, clone_flags, usp, p, regs); + retval = copy_thread(0, clone_flags, usp, p, regs); if (retval) goto bad_fork_cleanup_sighand; p->semundo = NULL; @@ -666,19 +666,15 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) * Let it rip! */ retval = p->pid; - if (retval) { - write_lock_irq(&tasklist_lock); - SET_LINKS(p); - hash_pid(p); - write_unlock_irq(&tasklist_lock); + write_lock_irq(&tasklist_lock); + SET_LINKS(p); + hash_pid(p); + write_unlock_irq(&tasklist_lock); - nr_tasks++; - - p->next_run = NULL; - p->prev_run = NULL; - wake_up_process(p); /* do this last */ - } + nr_threads++; + wake_up_process(p); /* do this last */ ++total_forks; + bad_fork: unlock_kernel(); up(¤t->mm->mmap_sem); @@ -699,7 +695,7 @@ bad_fork_cleanup: if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); - add_free_taskslot(p->tarray_ptr); + nr_threads--; bad_fork_cleanup_count: if (p->user) free_uid(p); diff --git a/kernel/info.c b/kernel/info.c index 1dffddc7b0e1..199c85f17a41 100644 --- a/kernel/info.c +++ b/kernel/info.c @@ -26,7 +26,7 @@ asmlinkage int sys_sysinfo(struct sysinfo *info) val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_tasks-1; + val.procs = nr_threads-1; sti(); si_meminfo(&val); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 4b4155e47fc0..5d43388c2cd7 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -313,6 +313,8 @@ EXPORT_SYMBOL(release_resource); 
EXPORT_SYMBOL(__request_region); EXPORT_SYMBOL(__check_region); EXPORT_SYMBOL(__release_region); +EXPORT_SYMBOL(pci_io_resource); +EXPORT_SYMBOL(pci_mem_resource); /* process management */ EXPORT_SYMBOL(__wake_up); diff --git a/kernel/panic.c b/kernel/panic.c index a7dbe450341c..f64ccfb3745b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -40,7 +40,7 @@ NORET_TYPE void panic(const char * fmt, ...) vsprintf(buf, fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic: %s\n",buf); - if (current == task[0]) + if (current == init_tasks[0]) printk(KERN_EMERG "In swapper task - not syncing\n"); else if (in_interrupt()) printk(KERN_EMERG "In interrupt handler - not syncing\n"); diff --git a/kernel/resource.c b/kernel/resource.c index b0e5327898b7..ce15021e30e6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -13,7 +13,7 @@ #include struct resource pci_io_resource = { "PCI IO", 0x0000, 0xFFFF }; -struct resource pci_mem_resource = { "PCI mem", 0x00000000, 0xFFFFFFFF }; +struct resource pci_mem_resource = { "PCI mem", 0x00000000, 0xFFFFFFFF }; /* * This generates reports for /proc/ioports and /proc/memory @@ -135,6 +135,7 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n return -EBUSY; release_resource(res); + kfree(res); return 0; } @@ -147,15 +148,16 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon end = start + n - 1; for (;;) { - struct resource *tmp = *p; + struct resource *res = *p; - if (!tmp) + if (!res) break; - if (tmp->start == start && tmp->end == end) { - *p = tmp->sibling; + if (res->start == start && res->end == end) { + *p = res->sibling; + kfree(res); break; } - p = &tmp->sibling; + p = &res->sibling; } } diff --git a/kernel/sched.c b/kernel/sched.c index 95b9b823c01c..3de8d010caee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -94,7 +94,23 @@ unsigned long volatile jiffies=0; * via the SMP irq return path. 
*/ -struct task_struct * task[NR_TASKS] = {&init_task, }; +struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; + +/* + * The tasklist_lock protects the linked list of processes. + * + * The scheduler lock is protecting against multiple entry + * into the scheduling code, and doesn't need to worry + * about interrupts (because interrupts cannot call the + * scheduler). + * + * The run-queue lock locks the parts that actually access + * and change the run-queues, and have to be interrupt-safe. + */ +spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ +rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ + +static LIST_HEAD(runqueue_head); /* * We align per-CPU scheduling data on cacheline boundaries, @@ -114,7 +130,7 @@ struct kernel_stat kstat = { 0 }; #ifdef __SMP__ -#define idle_task(cpu) (task[cpu_number_map[(cpu)]]) +#define idle_task(cpu) (init_tasks[cpu_number_map[(cpu)]]) #define can_schedule(p) (!(p)->has_cpu) #else @@ -366,73 +382,22 @@ static void reschedule_idle(struct task_struct * p) */ static inline void add_to_runqueue(struct task_struct * p) { - struct task_struct *next = init_task.next_run; - - p->prev_run = &init_task; - init_task.next_run = p; - p->next_run = next; - next->prev_run = p; + list_add(&p->run_list, &runqueue_head); nr_running++; } -static inline void del_from_runqueue(struct task_struct * p) -{ - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - - nr_running--; - next->prev_run = prev; - prev->next_run = next; - p->next_run = NULL; - p->prev_run = NULL; -} - static inline void move_last_runqueue(struct task_struct * p) { - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - - /* remove from list */ - next->prev_run = prev; - prev->next_run = next; - /* add back to list */ - p->next_run = &init_task; - prev = init_task.prev_run; - init_task.prev_run = p; - p->prev_run = prev; - prev->next_run = p; + list_del(&p->run_list); + list_add_tail(&p->run_list, 
&runqueue_head); } static inline void move_first_runqueue(struct task_struct * p) { - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - - /* remove from list */ - next->prev_run = prev; - prev->next_run = next; - /* add back to list */ - p->prev_run = &init_task; - next = init_task.next_run; - init_task.next_run = p; - p->next_run = next; - next->prev_run = p; + list_del(&p->run_list); + list_add(&p->run_list, &runqueue_head); } -/* - * The tasklist_lock protects the linked list of processes. - * - * The scheduler lock is protecting against multiple entry - * into the scheduling code, and doesn't need to worry - * about interrupts (because interrupts cannot call the - * scheduler). - * - * The run-queue lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - */ -spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ -rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ - /* * Wake up a process. Put it on the run-queue if it's not * already there. The "current" process is always on the @@ -450,7 +415,7 @@ void wake_up_process(struct task_struct * p) */ spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; - if (p->next_run) + if (task_on_runqueue(p)) goto out; add_to_runqueue(p); spin_unlock_irqrestore(&runqueue_lock, flags); @@ -687,6 +652,7 @@ asmlinkage void schedule(void) { struct schedule_data * sched_data; struct task_struct *prev, *next, *p; + struct list_head *tmp; int this_cpu, c; if (tq_scheduler) @@ -731,42 +697,29 @@ move_rr_back: } prev->need_resched = 0; -repeat_schedule: - /* * this is the scheduler proper: */ - p = init_task.next_run; - /* Default process to select.. */ +repeat_schedule: + /* + * Default process to select.. + */ next = idle_task(this_cpu); c = -1000; if (prev->state == TASK_RUNNING) goto still_running; still_running_back: - /* - * This is subtle. 
- * Note how we can enable interrupts here, even - * though interrupts can add processes to the run- - * queue. This is because any new processes will - * be added to the front of the queue, so "p" above - * is a safe starting point. - * run-queue deletion and re-ordering is protected by - * the scheduler lock - */ -/* - * Note! there may appear new tasks on the run-queue during this, as - * interrupts are enabled. However, they will be put on front of the - * list, so our list starting at "p" is essentially fixed. - */ - while (p != &init_task) { + tmp = runqueue_head.next; + while (tmp != &runqueue_head) { + p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p)) { int weight = goodness(prev, p, this_cpu); if (weight > c) c = weight, next = p; } - p = p->next_run; + tmp = tmp->next; } /* Do we need to re-calculate counters? */ @@ -837,8 +790,8 @@ recalculate: p->counter = (p->counter >> 1) + p->priority; read_unlock(&tasklist_lock); spin_lock_irq(&runqueue_lock); - goto repeat_schedule; } + goto repeat_schedule; still_running: c = prev_goodness(prev, prev, this_cpu); @@ -1760,7 +1713,7 @@ static int setscheduler(pid_t pid, int policy, retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (p->next_run) + if (task_on_runqueue(p)) move_first_runqueue(p); current->need_resched = 1; @@ -1934,13 +1887,13 @@ asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) return 0; } -static void show_task(int nr,struct task_struct * p) +static void show_task(struct task_struct * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; - printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr); + printk("%-8s ", p->comm); state = p->state ? 
ffz(~p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); @@ -1950,12 +1903,12 @@ static void show_task(int nr,struct task_struct * p) if (p == current) printk(" current "); else - printk(" %08lX ", thread_saved_pc(&p->tss)); + printk(" %08lX ", thread_saved_pc(&p->thread)); #else if (p == current) printk(" current task "); else - printk(" %016lx ", thread_saved_pc(&p->tss)); + printk(" %016lx ", thread_saved_pc(&p->thread)); #endif { unsigned long * n = (unsigned long *) (p+1); @@ -2020,7 +1973,7 @@ void show_state(void) #endif read_lock(&tasklist_lock); for_each_task(p) - show_task((p->tarray_ptr - &task[0]),p); + show_task(p); read_unlock(&tasklist_lock); } @@ -2030,6 +1983,11 @@ void __init init_idle(void) struct schedule_data * sched_data; sched_data = &aligned_data[smp_processor_id()].schedule_data; + if (current != &init_task && task_on_runqueue(current)) { + printk("UGH! (%d:%d) was on the runqueue, removing.\n", + smp_processor_id(), current->pid); + del_from_runqueue(current); + } t = get_cycles(); sched_data->curr = current; sched_data->last_schedule = t; @@ -2042,14 +2000,10 @@ void __init sched_init(void) * process right in SMP mode. */ int cpu=hard_smp_processor_id(); - int nr = NR_TASKS; + int nr; init_task.processor=cpu; - /* Init task array free list and pidhash table. 
*/ - while(--nr > 0) - add_free_taskslot(&task[nr]); - for(nr = 0; nr < PIDHASH_SZ; nr++) pidhash[nr] = NULL; @@ -2057,3 +2011,4 @@ void __init sched_init(void) init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); } + diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 195c2cb5bdb5..3b636f898502 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -34,6 +34,7 @@ extern int panic_timeout; extern int console_loglevel, C_A_D; extern int bdf_prm[], bdflush_min[], bdflush_max[]; extern int sysctl_overcommit_memory; +extern int max_threads; extern int nr_queued_signals, max_queued_signals; #ifdef CONFIG_KMOD @@ -207,6 +208,8 @@ static ctl_table kern_table[] = { {KERN_SHMMAX, "shmmax", &shmmax, sizeof (int), 0644, NULL, &proc_dointvec}, #endif + {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), + 0644, NULL, &proc_dointvec}, {0} }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ae052b94a7e..e5c159560ac8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -338,11 +338,11 @@ static int swap_out(unsigned int priority, int gfp_mask) * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = nr_tasks / (priority+1); + counter = nr_threads / (priority+1); if (counter < 1) counter = 1; - if (counter > nr_tasks) - counter = nr_tasks; + if (counter > nr_threads) + counter = nr_threads; for (; counter >= 0; counter--) { assign = 0;