From eb79918f272fe119902db3028e0fbdc752f4942d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:09:04 -0500 Subject: [PATCH] [PATCH] Linux-0.97.2 (August 23, 1992) We're making ready for big processes, and vm86 mode! - Move the kernel to virtual address 0xc0000000 instead of zero. - Allocate per-process page tables We can now have 3GB processes, and more than 64 of them! [Original announcement below] As promised, 0.97.pl2 is out today (well, over here it's already tomorrow, so I guess I'm 35 minutes late. Naughty, naughty). Right now, the patch (and full source for those that don't like to patch up the system) is available at "nic.funet.fi: pub/OS/Linux/testing/Linus", but I'll try to put it on some other sites as well if I'm able and energetic enough. Probably tomorrow - together with a binary for those that aren't willing to compile the kernel on their own. 0.97.2 has mostly my mm/fs patches, along with some relatively minor diffs by others (including file locking by Doug Evans). User-level changes are minor: but the mm has changed a lot, and the vfs routines have been changed to keep track of the error-messages a bit better. Also, the vfs-interface to "follow_link()" changed slightly: people who are making filesystems should look at the changes (but they are relatively minor, and shouldn't result in any problems - both the extended fs and minix fs needed just a simple change in their respective symlink.c files). The mm changes /might/ lower performance slightly, as the paging TLB's are now flushed at every task-switch due to the new system, but I doubt it's noticeable. The other performance changes (dynamic buffers etc) in 0.97(.pl1) should overshadow that particular problem. I hope this release means that these kinds of low-level rewrites aren't needed for a while: the last couple of releases have changed some very fundamental things. Nothing seems to have suffered too badly, but I'd be happier if it all got tested more thoroughly. 
Anyway, discounting the ps/free etc suite of programs, everything I have tried has worked flawlessly despite the big kernel changes. I'm still worried about the reports about messed-up buffers, but have been unable to reproduce the problem, and nobody has so far disillusioned me about my guess that it's a problem with the SCSI code (which at least gives me an excuse for not doing anything about it :-). Other problems include at least one report of spontaneous re-booting, which is totally inexplicable, so I'm blaming hardware once more until I can get better data on the thing. As to patches sent by others: 0.97.2 contains very little of that kind of code. I've been so busy either working, or implementing my own changes that I have simply ignored them for the most part. Remind me (or resend them relative to the new kernel) if you have a patch that is still needed. There is one new system call: 'vm86(struct vm86_struct * info)'. It's not ready for general use yet - it works, but will probably need some tweaking before being practical. But supporting a virtual 86 mode was so easy after the mm rewrite that I felt it was worth implementing: the vm86 code is less than 50 lines of C right now. Linus PS. 
The bright spot of the week goes to "The Oxford Beer Trolls" - all UK inhabitants should probably be locked into some (big) mental institution and TOBT should probably have a wing of their own, but thanks to them linux can now call itself "beerware" :-) --- Makefile | 15 +- boot/head.s | 92 ++++---- fs/Makefile | 8 +- fs/buffer.c | 1 - fs/exec.c | 43 ++-- fs/ext/freelists.c | 1 + fs/ext/inode.c | 1 + fs/ext/symlink.c | 19 +- fs/fcntl.c | 10 +- fs/locks.c | 471 +++++++++++++++++++++++++++++++++++++ fs/minix/bitmap.c | 1 + fs/minix/inode.c | 1 + fs/minix/symlink.c | 19 +- fs/msdos/Makefile | 5 +- fs/namei.c | 137 ++++++----- fs/open.c | 186 +++++++-------- fs/select.c | 15 +- fs/stat.c | 68 +++--- fs/super.c | 20 +- include/asm/system.h | 10 +- include/linux/ext_fs_i.h | 10 + include/linux/fs.h | 32 ++- include/linux/head.h | 2 +- include/linux/limits.h | 1 + include/linux/minix_fs_i.h | 10 + include/linux/mm.h | 10 +- include/linux/msdos_fs_i.h | 10 + include/linux/sched.h | 45 ++-- include/linux/sys.h | 3 +- include/linux/unistd.h | 1 + include/linux/vm86.h | 55 +++++ kernel/Makefile | 8 +- kernel/blk_drv/Makefile | 6 +- kernel/blk_drv/blk.h | 1 + kernel/blk_drv/hd.c | 44 ++-- kernel/blk_drv/ll_rw_blk.c | 11 +- kernel/chr_drv/mem.c | 4 +- kernel/exit.c | 3 +- kernel/fork.c | 9 +- kernel/math/emulate.c | 3 + kernel/ptrace.c | 2 - kernel/sched.c | 8 +- kernel/signal.c | 5 - kernel/sys.c | 51 +++- kernel/sys_call.S | 23 +- kernel/traps.c | 2 +- mm/memory.c | 376 +++++++++++++++-------------- mm/swap.c | 111 +++++---- net/Makefile | 4 +- net/unix.c | 2 +- 50 files changed, 1337 insertions(+), 638 deletions(-) create mode 100644 fs/locks.c create mode 100644 include/linux/ext_fs_i.h create mode 100644 include/linux/minix_fs_i.h create mode 100644 include/linux/msdos_fs_i.h create mode 100644 include/linux/vm86.h diff --git a/Makefile b/Makefile index d143eacc5331..879e95cc5e87 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,7 @@ SUBDIRS =kernel mm fs net lib 
KERNELHDRS =/usr/src/linux/include .c.s: - $(CC) $(CFLAGS) -S $< + $(CC) $(CFLAGS) -S -o $*.s $< .s.o: $(AS) -c -o $*.o $< .c.o: @@ -103,11 +103,11 @@ KERNELHDRS =/usr/src/linux/include all: Version Image linuxsubdirs: dummy - @for i in $(SUBDIRS); do (cd $$i; echo $$i; $(MAKE)) || exit; done + @for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE)) || exit; done Version: @./makever.sh - @echo \#define UTS_RELEASE \"0.97-`cat .version`\" > include/linux/config_rel.h + @echo \#define UTS_RELEASE \"0.97.pl2-`cat .version`\" > include/linux/config_rel.h @echo \#define UTS_VERSION \"`date +%D`\" > include/linux/config_ver.h touch include/linux/config.h @@ -127,6 +127,9 @@ tools/build: tools/build.c boot/head.o: boot/head.s +init/main.o: init/main.c + $(CC) $(CFLAGS) $(PROFILING) -c -o $*.o $< + tools/system: boot/head.o init/main.o linuxsubdirs $(LD) $(LDFLAGS) -M boot/head.o init/main.o \ $(ARCHIVES) \ @@ -157,17 +160,17 @@ clean: rm -f Image System.map tmp_make core boot/bootsect boot/setup \ boot/bootsect.s boot/setup.s init/main.s rm -f init/*.o tools/system tools/build boot/*.o - for i in $(SUBDIRS); do (cd $$i; $(MAKE) clean); done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) clean); done backup: clean - cd .. ; tar cf - linux | compress - > backup.Z + cd .. && tar cf - linux | compress - > backup.Z sync depend dep: sed '/\#\#\# Dependencies/q' < Makefile > tmp_make for i in init/*.c;do echo -n "init/";$(CPP) -M $$i;done >> tmp_make cp tmp_make Makefile - for i in $(SUBDIRS); do (cd $$i; $(MAKE) dep) || exit; done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) dep) || exit; done dummy: diff --git a/boot/head.s b/boot/head.s index 30ef2d861c5e..2444502a08a0 100644 --- a/boot/head.s +++ b/boot/head.s @@ -12,11 +12,11 @@ * the page directory. 
*/ .text -.globl _idt,_gdt,_pg_dir,_tmp_floppy_area,_floppy_track_buffer +.globl _idt,_gdt,_swapper_pg_dir,_tmp_floppy_area,_floppy_track_buffer /* - * pg_dir is the main page directory, address 0x00000000 + * swapper_pg_dir is the main page directory, address 0x00000000 */ -_pg_dir: +_swapper_pg_dir: startup_32: cld movl $0x10,%eax @@ -26,13 +26,6 @@ startup_32: mov %ax,%gs lss _stack_start,%esp call setup_idt - call setup_gdt - movl $0x10,%eax # reload all the segment registers - mov %ax,%ds # after changing gdt. CS was already - mov %ax,%es # reloaded in 'setup_gdt' - mov %ax,%fs - mov %ax,%gs - lss _stack_start,%esp xorl %eax,%eax 1: incl %eax # check that A20 really IS enabled movl %eax,0x000000 # loop forever if it isn't @@ -94,9 +87,9 @@ check_x87: * setup_idt * * sets up a idt with 256 entries pointing to - * ignore_int, interrupt gates. It then loads - * idt. Everything that wants to install itself - * in the idt-table may do so themselves. Interrupts + * ignore_int, interrupt gates. It doesn't actually load + * idt - that can be done only after paging has been enabled + * and the kernel moved to 0xC0000000. Interrupts * are enabled elsewhere, when we can be relatively * sure everything is ok. This routine will be over- * written by the page tables. @@ -115,21 +108,6 @@ rp_sidt: addl $8,%edi dec %ecx jne rp_sidt - lidt idt_descr - ret - -/* - * setup_gdt - * - * This routines sets up a new gdt and loads it. - * Only two entries are currently built, the same - * ones that were built in init.s. The routine - * is VERY complicated at two whole lines, so this - * rather long comment is certainly needed :-). - * This routine will beoverwritten by the page tables. - */ -setup_gdt: - lgdt gdt_descr ret /* @@ -185,6 +163,15 @@ _floppy_track_buffer: after_page_tables: call setup_paging + lgdt gdt_descr + lidt idt_descr + ljmp $0x08,$1f +1: movl $0x10,%eax # reload all the segment registers + mov %ax,%ds # after changing gdt. 
+ mov %ax,%es + mov %ax,%fs + mov %ax,%gs + lss _stack_start,%esp pushl $0 # These are the parameters to main :-) pushl $0 pushl $0 @@ -248,14 +235,17 @@ ignore_int: */ .align 2 setup_paging: - movl $1024*5,%ecx /* 5 pages - pg_dir+4 page tables */ + movl $1024*5,%ecx /* 5 pages - swapper_pg_dir+4 page tables */ xorl %eax,%eax - xorl %edi,%edi /* pg_dir is at 0x000 */ + xorl %edi,%edi /* swapper_pg_dir is at 0x000 */ cld;rep;stosl - movl $pg0+7,_pg_dir /* set present bit/user r/w */ - movl $pg1+7,_pg_dir+4 /* --------- " " --------- */ - movl $pg2+7,_pg_dir+8 /* --------- " " --------- */ - movl $pg3+7,_pg_dir+12 /* --------- " " --------- */ +/* Identity-map the kernel in low 4MB memory for ease of transition */ + movl $pg0+7,_swapper_pg_dir /* set present bit/user r/w */ +/* But the real place is at 0xC0000000 */ + movl $pg0+7,_swapper_pg_dir+3072 /* set present bit/user r/w */ + movl $pg1+7,_swapper_pg_dir+3076 /* --------- " " --------- */ + movl $pg2+7,_swapper_pg_dir+3080 /* --------- " " --------- */ + movl $pg3+7,_swapper_pg_dir+3084 /* --------- " " --------- */ movl $pg3+4092,%edi movl $0xfff007,%eax /* 16Mb - 4096 + 7 (r/w user,p) */ std @@ -263,29 +253,39 @@ setup_paging: subl $0x1000,%eax jge 1b cld - xorl %eax,%eax /* pg_dir is at 0x0000 */ + xorl %eax,%eax /* swapper_pg_dir is at 0x0000 */ movl %eax,%cr3 /* cr3 - page directory start */ movl %cr0,%eax orl $0x80000000,%eax movl %eax,%cr0 /* set paging (PG) bit */ ret /* this also flushes prefetch-queue */ -.align 2 +/* + * The interrupt descriptor table has room for 256 idt's + */ +.align 4 .word 0 idt_descr: .word 256*8-1 # idt contains 256 entries - .long _idt -.align 2 + .long 0xc0000000+_idt + +.align 4 +_idt: + .fill 256,8,0 # idt is uninitialized + +/* + * The real GDT is also 256 entries long - no real reason + */ +.align 4 .word 0 gdt_descr: - .word 256*8-1 # so does gdt (not that that's any - .long _gdt # magic number, but it works for me :^) - - .align 3 -_idt: .fill 256,8,0 # idt is 
uninitialized + .word 256*8-1 + .long 0xc0000000+_gdt -_gdt: .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00c09a0000000fff /* 16Mb */ - .quad 0x00c0920000000fff /* 16Mb */ +.align 4 +_gdt: + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0xc0c09a0000000fff /* 16Mb at 0xC0000000 */ + .quad 0xc0c0920000000fff /* 16Mb */ .quad 0x0000000000000000 /* TEMPORARY - don't use */ .fill 252,8,0 /* space for LDT's and TSS's etc */ diff --git a/fs/Makefile b/fs/Makefile index bfe604b37e05..84591324a2c1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -18,7 +18,7 @@ SUBDIRS =minix ext msdos OBJS= open.o read_write.o inode.o file_table.o buffer.o super.o \ block_dev.o stat.o exec.o pipe.o namei.o fcntl.o ioctl.o \ - select.o fifo.o + select.o fifo.o locks.o all: fs.o fssubdirs @@ -26,18 +26,18 @@ fs.o: $(OBJS) $(LD) -r -o fs.o $(OBJS) fssubdirs: dummy - @for i in $(SUBDIRS); do (cd $$i; echo $$i; $(MAKE)) || exit; done + @for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE)) || exit; done clean: rm -f core *.o *.a tmp_make for i in *.c; do rm -f `basename $$i .c`.s;done - for i in $(SUBDIRS); do (cd $$i; $(MAKE) clean); done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) clean); done depend dep: sed '/\#\#\# Dependencies/q' < Makefile > tmp_make for i in *.c;do $(CPP) -M $$i;done >> tmp_make cp tmp_make Makefile - for i in $(SUBDIRS); do (cd $$i; $(MAKE) dep) || exit; done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) dep) || exit; done dummy: diff --git a/fs/buffer.c b/fs/buffer.c index b8604bebc3a9..2c5b95378bff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -494,7 +494,6 @@ void grow_buffers(int size) tmp = bh; bh->b_data = (char * ) (page+i); bh->b_size = size; - i += size; } tmp = bh; while (1) { diff --git a/fs/exec.c b/fs/exec.c index 2be1aa3580c8..c0f1c21bbbfd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -80,7 +80,7 @@ int core_dump(long signr, struct pt_regs * regs) if(current->rlim[RLIMIT_CORE].rlim_cur < PAGE_SIZE/1024) return 0; __asm__("mov %%fs,%0":"=r" 
(fs)); __asm__("mov %0,%%fs"::"r" ((unsigned short) 0x10)); - if (open_namei("core",O_CREAT | O_WRONLY | O_TRUNC,0600,&inode)) + if (open_namei("core",O_CREAT | O_WRONLY | O_TRUNC,0600,&inode,NULL)) goto end_coredump; if (!S_ISREG(inode->i_mode)) goto end_coredump; @@ -169,17 +169,15 @@ int sys_uselib(const char * library) struct inode * inode; struct buffer_head * bh; struct exec ex; + int error; - if (get_limit(0x17) != TASK_SIZE) + if (!library || get_limit(0x17) != TASK_SIZE) return -EINVAL; if ((libnum >= MAX_SHARED_LIBS) || (libnum < 0)) return -EINVAL; - if (library) - inode = namei(library); - else - inode = NULL; - if (!inode) - return -ENOENT; + error = namei(library,&inode); + if (error) + return error; if (!inode->i_sb || !S_ISREG(inode->i_mode) || !permission(inode,MAY_READ)) { iput(inode); return -EACCES; @@ -203,6 +201,7 @@ int sys_uselib(const char * library) current->libraries[libnum].library = inode; current->libraries[libnum].start = ex.a_entry; current->libraries[libnum].length = (ex.a_data+ex.a_text+0xfff) & 0xfffff000; + current->libraries[libnum].bss = (ex.a_bss+0xfff) & 0xfffff000; #if 0 printk("Loaded library %d at %08x, length %08x\n", libnum, @@ -334,19 +333,19 @@ static unsigned long change_ldt(unsigned long text_size,unsigned long * page) code_limit = TASK_SIZE; data_limit = TASK_SIZE; - code_base = get_base(current->ldt[1]); - data_base = code_base; + code_base = data_base = 0; + current->start_code = code_base; set_base(current->ldt[1],code_base); set_limit(current->ldt[1],code_limit); set_base(current->ldt[2],data_base); set_limit(current->ldt[2],data_limit); /* make sure fs points to the NEW data segment */ __asm__("pushl $0x17\n\tpop %%fs"::); - data_base += data_limit - LIBRARY_SIZE; + data_base += data_limit; for (i=MAX_ARG_PAGES-1 ; i>=0 ; i--) { data_base -= PAGE_SIZE; if (page[i]) - put_dirty_page(page[i],data_base); + put_dirty_page(current,page[i],data_base); } return data_limit; } @@ -405,8 +404,9 @@ int do_execve(unsigned 
long * eip,long tmp,char * filename, panic("execve called from supervisor mode"); for (i=0 ; iclose_on_exec>>i)&1) sys_close(i); current->close_on_exec = 0; - free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); - free_page_tables(get_base(current->ldt[2]),get_limit(0x17)); + clear_page_tables(current); if (last_task_used_math == current) last_task_used_math = NULL; current->used_math = 0; p += change_ldt(ex.a_text,page); - p -= LIBRARY_SIZE + MAX_ARG_PAGES*PAGE_SIZE; + p -= MAX_ARG_PAGES*PAGE_SIZE; p = (unsigned long) create_tables((char *)p,argc,envc); current->brk = ex.a_bss + (current->end_data = ex.a_data + (current->end_code = ex.a_text)); current->start_stack = p; - current->rss = (LIBRARY_OFFSET - p + PAGE_SIZE-1) / PAGE_SIZE; + current->rss = (TASK_SIZE - p + PAGE_SIZE-1) / PAGE_SIZE; current->suid = current->euid = e_uid; current->sgid = current->egid = e_gid; if (N_MAGIC(ex) == OMAGIC) diff --git a/fs/ext/freelists.c b/fs/ext/freelists.c index 454796ca982b..a2f6ed372a1d 100644 --- a/fs/ext/freelists.c +++ b/fs/ext/freelists.c @@ -290,6 +290,7 @@ printk("ext_free_inode: inode empty, skipping to %d\n", efi->next); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_op = NULL; + inode->i_blocks = inode->i_blksize = 0; #ifdef EXTFS_DEBUG printk("ext_new_inode : allocating inode %d\n", inode->i_ino); #endif diff --git a/fs/ext/inode.c b/fs/ext/inode.c index 67ed5233a12f..e8874e53f9e0 100644 --- a/fs/ext/inode.c +++ b/fs/ext/inode.c @@ -295,6 +295,7 @@ void ext_read_inode(struct inode * inode) inode->i_nlink = raw_inode->i_nlinks; inode->i_size = raw_inode->i_size; inode->i_mtime = inode->i_atime = inode->i_ctime = raw_inode->i_time; + inode->i_blocks = inode->i_blksize = 0; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) inode->i_rdev = raw_inode->i_zone[0]; else for (block = 0; block < 12; block++) diff --git a/fs/ext/symlink.c b/fs/ext/symlink.c index 1daf1cb0a23e..96b3b6e1695b 100644 --- 
a/fs/ext/symlink.c +++ b/fs/ext/symlink.c @@ -21,7 +21,7 @@ #include static int ext_readlink(struct inode *, char *, int); -static struct inode * ext_follow_link(struct inode *, struct inode *); +static int ext_follow_link(struct inode *, struct inode *, int, int, struct inode **); /* * symlinks can't do much... @@ -43,8 +43,10 @@ struct inode_operations ext_symlink_inode_operations = { NULL /* truncate */ }; -static struct inode * ext_follow_link(struct inode * dir, struct inode * inode) +static int ext_follow_link(struct inode * dir, struct inode * inode, + int flag, int mode, struct inode ** res_inode) { + int error; unsigned short fs; struct buffer_head * bh; @@ -54,27 +56,30 @@ static struct inode * ext_follow_link(struct inode * dir, struct inode * inode) } if (!inode) { iput(dir); - return NULL; + *res_inode = NULL; + return -ENOENT; } if (!S_ISLNK(inode->i_mode)) { iput(dir); - return inode; + *res_inode = inode; + return 0; } __asm__("mov %%fs,%0":"=r" (fs)); if ((current->link_count > 5) || !inode->i_data[0] || !(bh = bread(inode->i_dev, inode->i_data[0], BLOCK_SIZE))) { iput(dir); iput(inode); - return NULL; + *res_inode = NULL; + return -ELOOP; } iput(inode); __asm__("mov %0,%%fs"::"r" ((unsigned short) 0x10)); current->link_count++; - inode = _namei(bh->b_data,dir,1); + error = open_namei(bh->b_data,flag,mode,res_inode,dir); current->link_count--; __asm__("mov %0,%%fs"::"r" (fs)); brelse(bh); - return inode; + return error; } static int ext_readlink(struct inode * inode, char * buffer, int buflen) diff --git a/fs/fcntl.c b/fs/fcntl.c index ed2decff70c0..fa0d23bd0e79 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,8 @@ #include extern int sys_close(int fd); +extern int fcntl_getlk(unsigned int, struct flock *); +extern int fcntl_setlk(unsigned int, unsigned int, struct flock *); static int dupfd(unsigned int fd, unsigned int arg) { @@ -72,8 +74,12 @@ int sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) filp->f_flags &= ~(O_APPEND | 
O_NONBLOCK); filp->f_flags |= arg & (O_APPEND | O_NONBLOCK); return 0; - case F_GETLK: case F_SETLK: case F_SETLKW: - return -ENOSYS; + case F_GETLK: + return fcntl_getlk(fd, (struct flock *) arg); + case F_SETLK: + return fcntl_setlk(fd, cmd, (struct flock *) arg); + case F_SETLKW: + return fcntl_setlk(fd, cmd, (struct flock *) arg); default: /* sockets need a few special fcntls. */ if (S_ISSOCK (filp->f_inode->i_mode)) diff --git a/fs/locks.c b/fs/locks.c new file mode 100644 index 000000000000..d99821b6ddda --- /dev/null +++ b/fs/locks.c @@ -0,0 +1,471 @@ +/* + * linux/fs/locks.c + * + * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. + * Doug Evans, 92Aug07, dje@sspiff.uucp. + * + * FIXME: two things aren't handled yet: + * - deadlock detection/avoidance (of dubious merit, but since it's in + * the definition, I guess it should be provided eventually) + * - mandatory locks (requires lots of changes elsewhere) + */ + +#include + +#include +#include +#include +#include +#include + +#define OFFSET_MAX 0x7fffffff /* FIXME: move elsewhere? */ + +static int copy_flock(struct file *filp, struct file_lock *fl, struct flock *l); +static int conflict(struct file_lock *caller_fl, struct file_lock *sys_fl); +static int overlap(struct file_lock *fl1, struct file_lock *fl2); +static int lock_it(struct file *filp, struct file_lock *caller); +static int unlock_it(struct file *filp, struct file_lock *caller); +static struct file_lock *alloc_lock(struct file *filp, struct file_lock *template); +static void free_lock(struct file *filp, struct file_lock *fl); + +static struct file_lock file_lock_table[NR_FILE_LOCKS]; +static struct file_lock *file_lock_free_list; + +/* + * Called at boot time to initialize the lock table ... 
+ */ + +void fcntl_init_locks(void) +{ + struct file_lock *fl; + + for (fl = &file_lock_table[0]; fl < file_lock_table + NR_FILE_LOCKS - 1; fl++) { + fl->fl_next = fl + 1; + fl->fl_owner = NULL; + } + file_lock_table[NR_FILE_LOCKS - 1].fl_next = NULL; + file_lock_table[NR_FILE_LOCKS - 1].fl_owner = NULL; + file_lock_free_list = &file_lock_table[0]; +} + +int fcntl_getlk(unsigned int fd, struct flock *l) +{ + struct flock flock; + struct file *filp; + struct file_lock *fl,file_lock; + + if (fd >= NR_OPEN || !(filp = current->filp[fd])) + return -EBADF; + verify_area(l, sizeof(*l)); + memcpy_fromfs(&flock, l, sizeof(flock)); + if (flock.l_type == F_UNLCK) + return -EINVAL; + if (!copy_flock(filp, &file_lock, &flock)) + return -EINVAL; + + for (fl = filp->f_inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (conflict(&file_lock, fl)) { + flock.l_pid = fl->fl_owner->pid; + flock.l_start = fl->fl_start; + flock.l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock.l_whence = fl->fl_whence; + flock.l_type = fl->fl_type; + memcpy_tofs(l, &flock, sizeof(flock)); + return 0; + } + } + + flock.l_type = F_UNLCK; /* no conflict found */ + memcpy_tofs(l, &flock, sizeof(flock)); + return 0; +} + +/* + * This function implements both F_SETLK and F_SETLKW. + */ + +int fcntl_setlk(unsigned int fd, unsigned int cmd, struct flock *l) +{ + struct file *filp; + struct file_lock *fl,file_lock; + struct flock flock; + + /* + * Get arguments and validate them ... + */ + + if (fd >= NR_OPEN || !(filp = current->filp[fd])) + return -EBADF; + verify_area(l, sizeof(*l)); + memcpy_fromfs(&flock, l, sizeof(flock)); + if (!copy_flock(filp, &file_lock, &flock)) + return -EINVAL; + switch (file_lock.fl_type) { + case F_RDLCK : + if (!(filp->f_mode & 1)) + return -EBADF; + break; + case F_WRLCK : + if (!(filp->f_mode & 2)) + return -EBADF; + break; + case F_UNLCK : + break; + } + + /* + * F_UNLCK needs to be handled differently ... 
+ */ + + if (file_lock.fl_type == F_UNLCK) + return unlock_it(filp, &file_lock); + + /* + * Scan for a conflicting lock ... + */ + +repeat: + for (fl = filp->f_inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!conflict(&file_lock, fl)) + continue; + /* + * File is locked by another process. If this is F_SETLKW + * wait for the lock to be released. + * FIXME: We need to check for deadlocks here. + */ + if (cmd == F_SETLKW) { + interruptible_sleep_on(&fl->fl_wait); + goto repeat; + } + return -EAGAIN; + } + + /* + * Lock doesn't conflict with any other lock ... + */ + + return lock_it(filp, &file_lock); +} + +/* + * This function is called when the file is closed. + */ + +void fcntl_remove_locks(struct task_struct *task, struct file *filp) +{ + struct file_lock *fl,*next; + + for (fl = filp->f_inode->i_flock; fl != NULL; ) { + /* + * If this one is freed, {fl_next} gets clobbered when the + * entry is moved to the free list, so grab it now ... + */ + next = fl->fl_next; + if (fl->fl_owner == task) + free_lock(filp, fl); + fl = next; + } +} + +/* + * Verify a "struct flock" and copy it to a "struct file_lock" ... + * Result is a boolean indicating success. + */ + +static int copy_flock(struct file *filp, struct file_lock *fl, struct flock *l) +{ + off_t start; + + if (!filp->f_inode) /* just in case */ + return 0; + if (!S_ISREG(filp->f_inode->i_mode)) + return 0; + if (l->l_type != F_UNLCK && l->l_type != F_RDLCK && l->l_type != F_WRLCK) + return 0; + switch (l->l_whence) { + case 0 /*SEEK_SET*/ : start = 0; break; + case 1 /*SEEK_CUR*/ : start = filp->f_pos; break; + case 2 /*SEEK_END*/ : start = filp->f_inode->i_size; break; + default : return 0; + } + if ((start += l->l_start) < 0 || l->l_len < 0) + return 0; + fl->fl_type = l->l_type; + fl->fl_start = start; /* we record the absolute position */ + fl->fl_whence = 0; /* FIXME: do we record {l_start} as passed? 
*/ + if (l->l_len == 0 || (fl->fl_end = start + l->l_len - 1) < 0) + fl->fl_end = OFFSET_MAX; + fl->fl_owner = current; + fl->fl_wait = NULL; /* just for cleanliness */ + return 1; +} + +/* + * Determine if lock {sys_fl} blocks lock {caller_fl} ... + */ + +static int conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + if (caller_fl->fl_owner == sys_fl->fl_owner) + return 0; + if (!overlap(caller_fl, sys_fl)) + return 0; + switch (caller_fl->fl_type) { + case F_RDLCK : + return sys_fl->fl_type != F_RDLCK; + case F_WRLCK : + return 1; /* overlapping region not owned by caller */ + } + return 0; /* shouldn't get here, but just in case */ +} + +static int overlap(struct file_lock *fl1, struct file_lock *fl2) +{ + if (fl1->fl_start <= fl2->fl_start) { + return fl1->fl_end >= fl2->fl_start; + } else { + return fl2->fl_end >= fl1->fl_start; + } +} + +/* + * Add a lock to a file ... + * Result is 0 for success or -ENOLCK. + * + * We try to be real clever here and always minimize the number of table + * entries we use. For example we merge adjacent locks whenever possible. This + * consumes a bit of cpu and code space, is it really worth it? Beats me. + * + * I've tried to keep the following as small and simple as possible. If you can + * make it smaller or simpler, please do. /dje 92Aug11 + * + * WARNING: We assume the lock doesn't conflict with any other lock. + */ + +static int lock_it(struct file *filp, struct file_lock *caller) +{ + struct file_lock *fl,*new; + + /* + * It's easier if we allocate a slot for the lock first, and then + * release it later if we have to (IE: if it can be merged with + * another). This way the for() loop always knows that {caller} is an + * existing entry. This will cause the routine to fail unnecessarily + * in rare cases, but perfection can be pushed too far. :-) + */ + + if ((caller = alloc_lock(filp, caller)) == NULL) + return -ENOLCK; + + /* + * First scan to see if we are changing/augmenting an existing lock ... 
+ */ + + for (fl = filp->f_inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (caller->fl_owner != fl->fl_owner) + continue; + if (caller == fl) + continue; + if (!overlap(caller, fl)) { + /* + * Detect adjacent regions (if same lock type) ... + */ + if (caller->fl_type != fl->fl_type) + continue; + if (caller->fl_end + 1 == fl->fl_start) { + fl->fl_start = caller->fl_start; + free_lock(filp, caller); + caller = fl; + /* must continue, may overlap others now */ + } else if (caller->fl_start - 1 == fl->fl_end) { + fl->fl_end = caller->fl_end; + free_lock(filp, caller); + caller = fl; + /* must continue, may overlap others now */ + } + continue; + } + /* + * We've found an overlapping region. Is it a change of lock + * type, or are we changing the size of the locked space? + */ + if (caller->fl_type != fl->fl_type) { + if (caller->fl_start > fl->fl_start && caller->fl_end < fl->fl_end) { + /* + * The new lock splits the old one in two ... + * {fl} is the bottom piece, {caller} is the + * new lock, and {new} is the top piece. + */ + if ((new = alloc_lock(filp, fl)) == NULL) { + free_lock(filp, caller); + return -ENOLCK; + } + fl->fl_end = caller->fl_start - 1; + new->fl_start = caller->fl_end + 1; + return 0; + } + if (caller->fl_start <= fl->fl_start && caller->fl_end >= fl->fl_end) { + /* + * The new lock completely replaces old one ... + */ + free_lock(filp, fl); + return 0; + } + if (caller->fl_end < fl->fl_end) { + fl->fl_start = caller->fl_end + 1; + /* must continue, may be more overlaps */ + } else if (caller->fl_start > fl->fl_start) { + fl->fl_end = caller->fl_start - 1; + /* must continue, may be more overlaps */ + } else { + printk("lock_it: program bug: unanticipated overlap\n"); + free_lock(filp, caller); + return -ENOLCK; + } + } else { /* The new lock augments an existing lock ... 
*/ + int grew = 0; + + if (caller->fl_start < fl->fl_start) { + fl->fl_start = caller->fl_start; + grew = 1; + } + if (caller->fl_end > fl->fl_end) { + fl->fl_end = caller->fl_end; + grew = 1; + } + free_lock(filp, caller); + caller = fl; + if (!grew) + return 0; + /* must continue, may be more overlaps */ + } + } + + /* + * New lock doesn't overlap any regions ... + * alloc_lock() has already been called, so we're done! + */ + + return 0; +} + +/* + * Handle F_UNLCK ... + * Result is 0 for success, or -EINVAL or -ENOLCK. + * ENOLCK can happen when a lock is split into two. + */ + +static int unlock_it(struct file *filp, struct file_lock *caller) +{ + int one_unlocked = 0; + struct file_lock *fl,*next; + + for (fl = filp->f_inode->i_flock; fl != NULL; ) { + if (caller->fl_owner != fl->fl_owner || !overlap(caller, fl)) { + fl = fl->fl_next; + continue; + } + one_unlocked = 1; + if (caller->fl_start > fl->fl_start && caller->fl_end < fl->fl_end) { + /* + * Lock is split in two ... + * {fl} is the bottom piece, {next} is the top piece. + */ + if ((next = alloc_lock(filp, fl)) == NULL) + return -ENOLCK; + fl->fl_end = caller->fl_start - 1; + next->fl_start = caller->fl_end + 1; + return 0; + } + /* + * At this point we know there is an overlap and we know the + * lock isn't split into two ... + * + * Unless the lock table is broken, entries will not overlap. + * IE: User X won't have an entry locking bytes 1-3 and another + * entry locking bytes 3-5. Therefore, if the area being + * unlocked is a subset of the total area, we don't need to + * traverse any more of the list. The code is a tad more + * complicated by this optimization. Perhaps it's not worth it. + * + * WARNING: We assume free_lock() does not alter + * {fl_start, fl_end}. + * + * {fl_next} gets clobbered when the entry is moved to + * the free list, so grab it now ... 
+ */ + next = fl->fl_next; + if (caller->fl_start <= fl->fl_start && caller->fl_end >= fl->fl_end) { + free_lock(filp, fl); + } else if (caller->fl_start > fl->fl_start) { + fl->fl_end = caller->fl_start - 1; + } else { + /* caller->fl_end < fl->fl_end */ + fl->fl_start = caller->fl_end + 1; + } + if (caller->fl_start >= fl->fl_start && caller->fl_end <= fl->fl_end) + return 0; /* no more to be found */ + fl = next; + /* must continue, there may be more to unlock */ + } + + return one_unlocked ? 0 : -EINVAL; +} + +static struct file_lock *alloc_lock(struct file *filp, struct file_lock *template) +{ + struct file_lock *new; + + if (file_lock_free_list == NULL) + return NULL; /* no available entry */ + if (file_lock_free_list->fl_owner != NULL) + panic("alloc_lock: broken free list\n"); + + new = file_lock_free_list; /* remove from free list */ + file_lock_free_list = file_lock_free_list->fl_next; + + *new = *template; + + new->fl_next = filp->f_inode->i_flock; /* insert into file's list */ + filp->f_inode->i_flock = new; + + new->fl_owner = current; /* FIXME: needed? */ + new->fl_wait = NULL; + return new; +} + +/* + * Add a lock to the free list ... + * + * WARNING: We must not alter {fl_start, fl_end}. See unlock_it(). + */ + +static void free_lock(struct file *filp, struct file_lock *fl) +{ + struct file_lock **fl_p; + + if (fl->fl_owner == NULL) /* sanity check */ + panic("free_lock: broken lock list\n"); + + /* + * We only use a singly linked list to save some memory space + * (the only place we'd use a doubly linked list is here). 
+ */ + + for (fl_p = &filp->f_inode->i_flock; *fl_p != NULL; fl_p = &(*fl_p)->fl_next) { + if (*fl_p == fl) + break; + } + if (*fl_p == NULL) { + printk("free_lock: lock is not in file's lock list\n"); + } else { + *fl_p = (*fl_p)->fl_next; + } + + fl->fl_next = file_lock_free_list; /* add to free list */ + file_lock_free_list = fl; + fl->fl_owner = NULL; /* for sanity checks */ + + wake_up(&fl->fl_wait); +} diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index ddf74f616528..51082b2bd435 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -230,6 +230,7 @@ struct inode * minix_new_inode(int dev) inode->i_ino = j + i*8192; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_op = NULL; + inode->i_blocks = inode->i_blksize = 0; return inode; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index b90a4e8d7d88..d8ae3bce42c9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -230,6 +230,7 @@ void minix_read_inode(struct inode * inode) inode->i_nlink = raw_inode->i_nlinks; inode->i_size = raw_inode->i_size; inode->i_mtime = inode->i_atime = inode->i_ctime = raw_inode->i_time; + inode->i_blocks = inode->i_blksize = 0; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) inode->i_rdev = raw_inode->i_zone[0]; else for (block = 0; block < 9; block++) diff --git a/fs/minix/symlink.c b/fs/minix/symlink.c index 65263a0e8cc8..b5683ba20dcf 100644 --- a/fs/minix/symlink.c +++ b/fs/minix/symlink.c @@ -15,7 +15,7 @@ #include static int minix_readlink(struct inode *, char *, int); -static struct inode * minix_follow_link(struct inode *, struct inode *); +static int minix_follow_link(struct inode *, struct inode *, int, int, struct inode **); /* * symlinks can't do much... 
@@ -37,8 +37,10 @@ struct inode_operations minix_symlink_inode_operations = { NULL /* truncate */ }; -static struct inode * minix_follow_link(struct inode * dir, struct inode * inode) +static int minix_follow_link(struct inode * dir, struct inode * inode, + int flag, int mode, struct inode ** res_inode) { + int error; unsigned short fs; struct buffer_head * bh; @@ -48,27 +50,30 @@ static struct inode * minix_follow_link(struct inode * dir, struct inode * inode } if (!inode) { iput(dir); - return NULL; + *res_inode = NULL; + return -ENOENT; } if (!S_ISLNK(inode->i_mode)) { iput(dir); - return inode; + *res_inode = inode; + return 0; } __asm__("mov %%fs,%0":"=r" (fs)); if ((current->link_count > 5) || !inode->i_data[0] || !(bh = bread(inode->i_dev, inode->i_data[0], BLOCK_SIZE))) { iput(dir); iput(inode); - return NULL; + *res_inode = NULL; + return -ELOOP; } iput(inode); __asm__("mov %0,%%fs"::"r" ((unsigned short) 0x10)); current->link_count++; - inode = _namei(bh->b_data,dir,1); + error = open_namei(bh->b_data,flag,mode,res_inode,dir); current->link_count--; __asm__("mov %0,%%fs"::"r" (fs)); brelse(bh); - return inode; + return error; } static int minix_readlink(struct inode * inode, char * buffer, int buflen) diff --git a/fs/msdos/Makefile b/fs/msdos/Makefile index 485a62b063c7..0d172a211e8b 100644 --- a/fs/msdos/Makefile +++ b/fs/msdos/Makefile @@ -8,10 +8,9 @@ # Note 2! The CFLAGS definitions are now in the main makefile... 
.c.s: - $(CC) $(CFLAGS) \ - -S -o $*.s $< + $(CC) $(CFLAGS) -S $< .c.o: - $(CC) $(CFLAGS) -c -o $*.o $< + $(CC) $(CFLAGS) -c $< .s.o: $(AS) -o $*.o $< diff --git a/fs/namei.c b/fs/namei.c index 42eb868bd488..17b3b448a3ec 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -19,9 +19,6 @@ #include #include -struct inode * _namei(const char * filename, struct inode * base, - int follow_links); - #define ACC_MODE(x) ("\004\002\006\377"[(x)&O_ACCMODE]) /* @@ -91,18 +88,21 @@ int lookup(struct inode * dir,const char * name, int len, return dir->i_op->lookup(dir,name,len,result); } -struct inode * follow_link(struct inode * dir, struct inode * inode) +int follow_link(struct inode * dir, struct inode * inode, + int flag, int mode, struct inode ** res_inode) { if (!dir || !inode) { iput(dir); iput(inode); - return NULL; + *res_inode = NULL; + return -ENOENT; } if (!inode->i_op || !inode->i_op->follow_link) { iput(dir); - return inode; + *res_inode = inode; + return 0; } - return inode->i_op->follow_link(dir,inode); + return inode->i_op->follow_link(dir,inode,flag,mode,res_inode); } /* @@ -111,14 +111,15 @@ struct inode * follow_link(struct inode * dir, struct inode * inode) * dir_namei() returns the inode of the directory of the * specified name, and the name within that directory. 
*/ -static struct inode * dir_namei(const char * pathname, - int * namelen, const char ** name, struct inode * base) +static int dir_namei(const char * pathname, int * namelen, const char ** name, + struct inode * base, struct inode ** res_inode) { char c; const char * thisname; int len,error; struct inode * inode; + *res_inode = NULL; if (!base) { base = current->pwd; base->i_count++; @@ -139,41 +140,48 @@ static struct inode * dir_namei(const char * pathname, error = lookup(base,thisname,len,&inode); if (error) { iput(base); - return NULL; + return error; } - if (!(base = follow_link(base,inode))) - return NULL; + error = follow_link(base,inode,0,0,&base); + if (error) + return error; } *name = thisname; *namelen = len; - return base; + *res_inode = base; + return 0; } -struct inode * _namei(const char * pathname, struct inode * base, - int follow_links) +static int _namei(const char * pathname, struct inode * base, + int follow_links, struct inode ** res_inode) { const char * basename; int namelen,error; struct inode * inode; - if (!(base = dir_namei(pathname,&namelen,&basename,base))) - return NULL; + *res_inode = NULL; + error = dir_namei(pathname,&namelen,&basename,base,&base); + if (error) + return error; base->i_count++; /* lookup uses up base */ error = lookup(base,basename,namelen,&inode); if (error) { iput(base); - return NULL; + return error; } - if (follow_links) - inode = follow_link(base,inode); - else + if (follow_links) { + error = follow_link(base,inode,0,0,&inode); + if (error) + return error; + } else iput(base); - return inode; + *res_inode = inode; + return 0; } -struct inode * lnamei(const char * pathname) +int lnamei(const char * pathname, struct inode ** res_inode) { - return _namei(pathname, NULL, 0); + return _namei(pathname,NULL,0,res_inode); } /* @@ -183,9 +191,9 @@ struct inode * lnamei(const char * pathname) * Open, link etc use their own routines, but this is enough for things * like 'chmod' etc. 
*/ -struct inode * namei(const char * pathname) +int namei(const char * pathname, struct inode ** res_inode) { - return _namei(pathname,NULL,1); + return _namei(pathname,NULL,1,res_inode); } /* @@ -194,7 +202,7 @@ struct inode * namei(const char * pathname) * namei for open - this is in fact almost the whole open-routine. */ int open_namei(const char * pathname, int flag, int mode, - struct inode ** res_inode) + struct inode ** res_inode, struct inode * base) { const char * basename; int namelen,error,i; @@ -205,8 +213,9 @@ int open_namei(const char * pathname, int flag, int mode, flag |= O_WRONLY; mode &= 07777 & ~current->umask; mode |= I_REGULAR; - if (!(dir = dir_namei(pathname,&namelen,&basename,NULL))) - return -ENOENT; + error = dir_namei(pathname,&namelen,&basename,base,&dir); + if (error) + return error; if (!namelen) { /* special case: '/usr/' etc */ if (!(flag & (O_ACCMODE|O_CREAT|O_TRUNC))) { *res_inode=dir; @@ -241,8 +250,8 @@ int open_namei(const char * pathname, int flag, int mode, iput(inode); return -EEXIST; } - if (!(inode = follow_link(dir,inode))) - return -ELOOP; + if (error = follow_link(dir,inode,flag,mode,&inode)) + return error; if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { if (IS_NODEV(inode)) { iput(inode); @@ -289,11 +298,12 @@ int open_namei(const char * pathname, int flag, int mode, int do_mknod(const char * filename, int mode, int dev) { const char * basename; - int namelen; + int namelen, error; struct inode * dir; - - if (!(dir = dir_namei(filename,&namelen,&basename, NULL))) - return -ENOENT; + + error = dir_namei(filename,&namelen,&basename, NULL, &dir); + if (error) + return error; if (!namelen) { iput(dir); return -ENOENT; @@ -323,11 +333,12 @@ int sys_mknod(const char * filename, int mode, int dev) int sys_mkdir(const char * pathname, int mode) { const char * basename; - int namelen; + int namelen, error; struct inode * dir; - if (!(dir = dir_namei(pathname,&namelen,&basename, NULL))) - return -ENOENT; + error = 
dir_namei(pathname,&namelen,&basename,NULL,&dir); + if (error) + return error; if (!namelen) { iput(dir); return -ENOENT; @@ -350,11 +361,12 @@ int sys_mkdir(const char * pathname, int mode) int sys_rmdir(const char * name) { const char * basename; - int namelen; + int namelen, error; struct inode * dir; - if (!(dir = dir_namei(name,&namelen,&basename, NULL))) - return -ENOENT; + error = dir_namei(name,&namelen,&basename,NULL,&dir); + if (error) + return error; if (!namelen) { iput(dir); return -ENOENT; @@ -377,11 +389,12 @@ int sys_rmdir(const char * name) int sys_unlink(const char * name) { const char * basename; - int namelen; + int namelen, error; struct inode * dir; - if (!(dir = dir_namei(name,&namelen,&basename, NULL))) - return -ENOENT; + error = dir_namei(name,&namelen,&basename,NULL,&dir); + if (error) + return error; if (!namelen) { iput(dir); return -EPERM; @@ -405,11 +418,11 @@ int sys_symlink(const char * oldname, const char * newname) { struct inode * dir; const char * basename; - int namelen; + int namelen, error; - dir = dir_namei(newname,&namelen,&basename, NULL); - if (!dir) - return -ENOENT; + error = dir_namei(newname,&namelen,&basename,NULL,&dir); + if (error) + return error; if (!namelen) { iput(dir); return -ENOENT; @@ -433,15 +446,15 @@ int sys_link(const char * oldname, const char * newname) { struct inode * oldinode, * dir; const char * basename; - int namelen; + int namelen, error; - oldinode = namei(oldname); - if (!oldinode) - return -ENOENT; - dir = dir_namei(newname,&namelen,&basename, NULL); - if (!dir) { + error = namei(oldname, &oldinode); + if (error) + return error; + error = dir_namei(newname,&namelen,&basename,NULL,&dir); + if (error) { iput(oldinode); - return -EACCES; + return error; } if (!namelen) { iput(oldinode); @@ -475,11 +488,11 @@ int sys_rename(const char * oldname, const char * newname) { struct inode * old_dir, * new_dir; const char * old_base, * new_base; - int old_len, new_len; + int old_len, new_len, error; - 
old_dir = dir_namei(oldname,&old_len,&old_base, NULL); - if (!old_dir) - return -ENOENT; + error = dir_namei(oldname,&old_len,&old_base,NULL,&old_dir); + if (error) + return error; if (!permission(old_dir,MAY_WRITE)) { iput(old_dir); return -EACCES; @@ -490,10 +503,10 @@ int sys_rename(const char * oldname, const char * newname) iput(old_dir); return -EPERM; } - new_dir = dir_namei(newname,&new_len,&new_base, NULL); - if (!new_dir) { + error = dir_namei(newname,&new_len,&new_base,NULL,&new_dir); + if (error) { iput(old_dir); - return -ENOENT; + return error; } if (!permission(new_dir,MAY_WRITE)) { iput(old_dir); diff --git a/fs/open.c b/fs/open.c index 183b10cbe74f..3ee6c987f757 100644 --- a/fs/open.c +++ b/fs/open.c @@ -17,6 +17,8 @@ #include #include +extern void fcntl_remove_locks(struct task_struct *, struct file *); + struct file_operations * chrdev_fops[MAX_CHRDEV] = { NULL, }; @@ -33,10 +35,12 @@ int sys_ustat(int dev, struct ustat * ubuf) int sys_statfs(const char * path, struct statfs * buf) { struct inode * inode; + int error; verify_area(buf, sizeof(struct statfs)); - if (!(inode = namei(path))) - return -ENOENT; + error = namei(path,&inode); + if (error) + return error; if (!inode->i_sb->s_op->statfs) { iput(inode); return -ENOSYS; @@ -65,9 +69,11 @@ int sys_fstatfs(unsigned int fd, struct statfs * buf) int sys_truncate(const char * path, unsigned int length) { struct inode * inode; + int error; - if (!(inode = namei(path))) - return -ENOENT; + error = namei(path,&inode); + if (error) + return error; if (S_ISDIR(inode->i_mode) || !permission(inode,MAY_WRITE)) { iput(inode); return -EACCES; @@ -112,9 +118,11 @@ int sys_utime(char * filename, struct utimbuf * times) { struct inode * inode; long actime,modtime; + int error; - if (!(inode=namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; if (IS_RDONLY(inode)) { iput(inode); return -EROFS; @@ -151,8 +159,9 @@ int sys_access(const char * filename,int mode) int 
res, i_mode; mode &= 0007; - if (!(inode=namei(filename))) - return -EACCES; + res = namei(filename,&inode); + if (res) + return res; i_mode = res = inode->i_mode & 0777; iput(inode); if (current->uid == inode->i_uid) @@ -176,9 +185,11 @@ int sys_access(const char * filename,int mode) int sys_chdir(const char * filename) { struct inode * inode; + int error; - if (!(inode = namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) { iput(inode); return -ENOTDIR; @@ -195,9 +206,11 @@ int sys_chdir(const char * filename) int sys_chroot(const char * filename) { struct inode * inode; + int error; - if (!(inode=namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) { iput(inode); return -ENOTDIR; @@ -232,9 +245,11 @@ int sys_fchmod(unsigned int fd, mode_t mode) int sys_chmod(const char * filename, mode_t mode) { struct inode * inode; + int error; - if (!(inode = namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; if ((current->euid != inode->i_uid) && !suser()) { iput(inode); return -EPERM; @@ -274,9 +289,11 @@ int sys_fchown(unsigned int fd, uid_t user, gid_t group) int sys_chown(const char * filename, uid_t user, gid_t group) { struct inode * inode; + int error; - if (!(inode = lnamei(filename))) - return -ENOENT; + error = lnamei(filename,&inode); + if (error) + return error; if (IS_RDONLY(inode)) { iput(inode); return -EROFS; @@ -310,7 +327,7 @@ int sys_open(const char * filename,int flag,int mode) if (!f) return -ENFILE; current->filp[fd] = f; - if ((i = open_namei(filename,flag,mode,&inode))<0) { + if ((i = open_namei(filename,flag,mode,&inode,NULL))<0) { current->filp[fd]=NULL; f->f_count--; return i; @@ -338,25 +355,23 @@ int sys_creat(const char * pathname, int mode) return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } -static int -close_fp (struct file *filp) 
+static int close_fp(struct file *filp) { - struct inode *inode; + struct inode *inode; if (filp->f_count == 0) { printk("Close: file count is 0\n"); return 0; } - + inode = filp->f_inode; + if (S_ISREG(inode->i_mode)) + fcntl_remove_locks(current, filp); if (filp->f_count > 1) { filp->f_count--; return 0; } - - inode = filp->f_inode; if (filp->f_op && filp->f_op->release) filp->f_op->release(inode,filp); - filp->f_count--; filp->f_inode = NULL; iput(inode); @@ -376,94 +391,75 @@ int sys_close(unsigned int fd) return (close_fp (filp)); } -/* This routine looks through all the process's and closes any - references to the current processes tty. To avoid problems with - process sleeping on an inode which has already been iput, anyprocess - which is sleeping on the tty is sent a sigkill (It's probably a rogue - process.) Also no process should ever have /dev/console as it's - controlling tty, or have it open for reading. So we don't have to - worry about messing with all the daemons abilities to write messages - to the console. (Besides they should be using syslog.) */ - -int -sys_vhangup(void) +/* + * This routine looks through all the process's and closes any + * references to the current processes tty. To avoid problems with + * process sleeping on an inode which has already been iput, anyprocess + * which is sleeping on the tty is sent a sigkill (It's probably a rogue + * process.) Also no process should ever have /dev/console as it's + * controlling tty, or have it open for reading. So we don't have to + * worry about messing with all the daemons abilities to write messages + * to the console. (Besides they should be using syslog.) + */ +int sys_vhangup(void) { - int i; - int j; - struct file *filep; - struct tty_struct *tty; - extern void kill_wait (struct wait_queue **q, int signal); - extern int kill_pg (int pgrp, int sig, int priv); - - if (!suser()) return (-EPERM); - - /* send the SIGHUP signal. 
*/ - kill_pg (current->pgrp, SIGHUP, 0); + int i,j; + struct file *filep; + struct tty_struct *tty; + extern void kill_wait (struct wait_queue **q, int signal); + extern int kill_pg (int pgrp, int sig, int priv); - /* See if there is a controlling tty. */ - if (current->tty < 0) return (0); - - for (i = 0; i < NR_TASKS; i++) - { - if (task[i] == NULL) continue; - for (j = 0; j < NR_OPEN; j++) - { - filep = task[i]->filp[j]; - - if (filep == NULL) continue; - - /* now we need to check to see if this file points to the - device we are trying to close. */ - - if (!S_ISCHR (filep->f_inode->i_mode)) continue; - - /* This will catch both /dev/tty and the explicit terminal - device. However, we must make sure that f_rdev is - defined and correct. */ - - if ((MAJOR(filep->f_inode->i_rdev) == 5 || - MAJOR(filep->f_inode->i_rdev) == 4 ) && - (MAJOR(filep->f_rdev) == 4 && - MINOR(filep->f_rdev) == MINOR (current->tty))) - { - task[i]->filp[j] = NULL; + if (!suser()) + return -EPERM; + /* send the SIGHUP signal. */ + kill_pg(current->pgrp, SIGHUP, 0); + /* See if there is a controlling tty. */ + if (current->tty < 0) + return 0; + for (i = 0; i < NR_TASKS; i++) { + if (task[i] == NULL) + continue; + for (j = 0; j < NR_OPEN; j++) { + filep = task[i]->filp[j]; + if (!filep) + continue; + if (!S_ISCHR(filep->f_inode->i_mode)) + continue; + if ((MAJOR(filep->f_inode->i_rdev) == 5 || + MAJOR(filep->f_inode->i_rdev) == 4 ) && + (MAJOR(filep->f_rdev) == 4 && + MINOR(filep->f_rdev) == MINOR (current->tty))) { /* so now we have found something to close. We need to kill every process waiting on the inode. */ - - kill_wait (&filep->f_inode->i_wait, SIGKILL); + task[i]->filp[j] = NULL; + kill_wait (&filep->f_inode->i_wait, SIGKILL); /* now make sure they are awake before we close the file. */ - wake_up (&filep->f_inode->i_wait); + wake_up (&filep->f_inode->i_wait); /* finally close the file. */ - current->close_on_exec &= ~(1<close_on_exec &= ~(1<tty until after the loop is complete. 
*/ - if (task[i]->tty == current->tty && task[i] != current) - { - task[i]->tty = -1; - } - } - + if (task[i]->tty == current->tty && task[i] != current) { + task[i]->tty = -1; + } + } /* need to do tty->session = 0 */ - tty = TTY_TABLE(MINOR(current->tty)); - tty->session = 0; - tty->pgrp = -1; - current->tty = -1; - - - return (0); + tty = TTY_TABLE(MINOR(current->tty)); + tty->session = 0; + tty->pgrp = -1; + current->tty = -1; + return 0; } diff --git a/fs/select.c b/fs/select.c index 5cd8e0f01e92..4dc1682b50eb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -51,6 +51,8 @@ static int check_in(select_table * wait, struct inode * inode, struct file * fil { if (file->f_op && file->f_op->select) return file->f_op->select(inode,file,SEL_IN,wait); + if (inode && S_ISREG(inode->i_mode)) + return 1; return 0; } @@ -58,6 +60,8 @@ static int check_out(select_table * wait, struct inode * inode, struct file * fi { if (file->f_op && file->f_op->select) return file->f_op->select(inode,file,SEL_OUT,wait); + if (inode && S_ISREG(inode->i_mode)) + return 1; return 0; } @@ -65,6 +69,8 @@ static int check_ex(select_table * wait, struct inode * inode, struct file * fil { if (file->f_op && file->f_op->select) return file->f_op->select(inode,file,SEL_EX,wait); + if (inode && S_ISREG(inode->i_mode)) + return 1; return 0; } @@ -85,15 +91,6 @@ int do_select(fd_set in, fd_set out, fd_set ex, return -EBADF; if (!current->filp[i]->f_inode) return -EBADF; - if (current->filp[i]->f_inode->i_pipe) - continue; - if (S_ISCHR(current->filp[i]->f_inode->i_mode)) - continue; - if (S_ISFIFO(current->filp[i]->f_inode->i_mode)) - continue; - if (S_ISSOCK(current->filp[i]->f_inode->i_mode)) - continue; - return -EBADF; } repeat: wait_table.nr = 0; diff --git a/fs/stat.c b/fs/stat.c index 225f9d3bbecf..459f418a6e1b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -56,35 +56,41 @@ static void cp_new_stat(struct inode * inode, struct new_stat * statbuf) tmp.st_mtime = inode->i_mtime; tmp.st_ctime = 
inode->i_ctime; /* - * Right now we fake the st_blocks numbers: we'll eventually have to - * add st_blocks to the inode, and let the vfs routines keep track of - * it all. This algorithm doesn't guarantee correct block numbers, but - * at least it tries to come up with a plausible answer... - * - * In fact, the minix fs doesn't use these numbers (it uses 7 and 512 - * instead of 10 and 256), but who cares... It's not that exact anyway. + * st_blocks and st_blksize are approximated with a simple algorithm if + * they aren't supported directly by the filesystem. The minix and msdos + * filesystems don't keep track of blocks, so they would either have to + * be counted explicitly (by delving into the file itself), or by using + * this simple algorithm to get a reasonable (although not 100% accurate) + * value. */ - blocks = (tmp.st_size + 1023) / 1024; - if (blocks > 10) { - indirect = (blocks - 11)/256+1; - if (blocks > 10+256) { - indirect += (blocks - 267)/(256*256)+1; - if (blocks > 10+256+256*256) - indirect++; + if (!inode->i_blksize) { + blocks = (tmp.st_size + 511) / 512; + if (blocks > 10) { + indirect = (blocks - 11)/256+1; + if (blocks > 10+256) { + indirect += (blocks - 267)/(256*256)+1; + if (blocks > 10+256+256*256) + indirect++; + } + blocks += indirect; } - blocks += indirect; + tmp.st_blksize = 512; + tmp.st_blocks = blocks; + } else { + tmp.st_blksize = inode->i_blksize; + tmp.st_blocks = inode->i_blocks; } - tmp.st_blksize = 1024; - tmp.st_blocks = blocks; memcpy_tofs(statbuf,&tmp,sizeof(tmp)); } int sys_stat(char * filename, struct old_stat * statbuf) { struct inode * inode; + int error; - if (!(inode=namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; cp_old_stat(inode,statbuf); iput(inode); return 0; @@ -93,9 +99,11 @@ int sys_stat(char * filename, struct old_stat * statbuf) int sys_newstat(char * filename, struct new_stat * statbuf) { struct inode * inode; + int error; - if 
(!(inode=namei(filename))) - return -ENOENT; + error = namei(filename,&inode); + if (error) + return error; cp_new_stat(inode,statbuf); iput(inode); return 0; @@ -104,9 +112,11 @@ int sys_newstat(char * filename, struct new_stat * statbuf) int sys_lstat(char * filename, struct old_stat * statbuf) { struct inode * inode; + int error; - if (!(inode = lnamei(filename))) - return -ENOENT; + error = lnamei(filename,&inode); + if (error) + return error; cp_old_stat(inode,statbuf); iput(inode); return 0; @@ -115,9 +125,11 @@ int sys_lstat(char * filename, struct old_stat * statbuf) int sys_newlstat(char * filename, struct new_stat * statbuf) { struct inode * inode; + int error; - if (!(inode = lnamei(filename))) - return -ENOENT; + error = lnamei(filename,&inode); + if (error) + return error; cp_new_stat(inode,statbuf); iput(inode); return 0; @@ -148,12 +160,14 @@ int sys_newfstat(unsigned int fd, struct new_stat * statbuf) int sys_readlink(const char * path, char * buf, int bufsiz) { struct inode * inode; + int error; if (bufsiz <= 0) return -EINVAL; verify_area(buf,bufsiz); - if (!(inode = lnamei(path))) - return -ENOENT; + error = lnamei(path,&inode); + if (error) + return error; if (!inode->i_op || !inode->i_op->readlink) { iput(inode); return -EINVAL; diff --git a/fs/super.c b/fs/super.c index 4d0e0303faf5..c3c47dca4165 100644 --- a/fs/super.c +++ b/fs/super.c @@ -21,6 +21,7 @@ int sync_dev(int dev); void wait_for_keypress(void); +void fcntl_init_locks(void); /* set_bit uses setb, as gas doesn't recognize setc */ #define set_bit(bitnr,addr) ({ \ @@ -178,8 +179,9 @@ int sys_umount(char * dev_name) if (!suser()) return -EPERM; - if (!(inode = namei(dev_name))) - return -ENOENT; + retval = namei(dev_name,&inode); + if (retval) + return retval; dev = inode->i_rdev; if (!S_ISBLK(inode->i_mode)) { iput(inode); @@ -208,9 +210,11 @@ static int do_mount(int dev, const char * dir, char * type, int flags, void * da { struct inode * inode, * dir_i; struct super_block * sb; + int 
error; - if (!(dir_i = namei(dir))) - return -ENOENT; + error = namei(dir,&dir_i); + if (error) + return error; if (dir_i->i_count != 1 || dir_i->i_mount) { iput(dir_i); return -EBUSY; @@ -256,7 +260,7 @@ int sys_mount(char * dev_name, char * dir_name, char * type, { struct inode * inode; int dev; - int retval = 0; + int retval; char tmp[100],*t; int i; unsigned long flags = 0; @@ -264,8 +268,9 @@ int sys_mount(char * dev_name, char * dir_name, char * type, if (!suser()) return -EPERM; - if (!(inode = namei(dev_name))) - return -ENOENT; + retval = namei(dev_name,&inode); + if (retval) + return retval; dev = inode->i_rdev; if (!S_ISBLK(inode->i_mode)) retval = -EPERM; @@ -314,6 +319,7 @@ void mount_root(void) panic("bad i-node size"); for(i=0;i +#include +#include + struct inode { dev_t i_dev; unsigned long i_ino; @@ -123,11 +127,14 @@ struct inode { time_t i_atime; time_t i_mtime; time_t i_ctime; + unsigned long i_blksize; + unsigned long i_blocks; unsigned long i_data[16]; struct inode_operations * i_op; struct super_block * i_sb; struct wait_queue * i_wait; struct wait_queue * i_wait2; /* for pipes */ + struct file_lock *i_flock; unsigned short i_count; unsigned short i_flags; unsigned char i_lock; @@ -136,6 +143,11 @@ struct inode { unsigned char i_mount; unsigned char i_seek; unsigned char i_update; + union { + struct minix_inode_info minix_i; + struct ext_inode_info ext_i; + struct msdos_inode_info msdos_i; + } u; }; struct file { @@ -149,6 +161,16 @@ struct file { off_t f_pos; }; +struct file_lock { + struct file_lock *fl_next; /* singly linked list */ + struct task_struct *fl_owner; /* NULL if on free list, for sanity checks */ + struct wait_queue *fl_wait; + char fl_type; + char fl_whence; + off_t fl_start; + off_t fl_end; +}; + #include #include #include @@ -196,7 +218,7 @@ struct inode_operations { int (*mknod) (struct inode *,const char *,int,int,int); int (*rename) (struct inode *,const char *,int,struct inode *,const char *,int); int (*readlink) 
(struct inode *,char *,int); - struct inode * (*follow_link) (struct inode *, struct inode *); + int (*follow_link) (struct inode *, struct inode *, int flag, int mode, struct inode ** res_inode); int (*bmap) (struct inode *,int); void (*truncate) (struct inode *); }; @@ -239,13 +261,11 @@ extern void floppy_off(unsigned int dev); extern void sync_inodes(void); extern void wait_on(struct inode * inode); extern int bmap(struct inode * inode,int block); -extern struct inode * namei(const char * pathname); -extern struct inode * lnamei(const char * pathname); +extern int namei(const char * pathname, struct inode ** res_inode); +extern int lnamei(const char * pathname, struct inode ** res_inode); extern int permission(struct inode * inode,int mask); -extern struct inode * _namei(const char * filename, struct inode * base, - int follow_links); extern int open_namei(const char * pathname, int flag, int mode, - struct inode ** res_inode); + struct inode ** res_inode, struct inode * base); extern int do_mknod(const char * filename, int mode, int dev); extern void iput(struct inode * inode); extern struct inode * iget(int dev,int nr); diff --git a/include/linux/head.h b/include/linux/head.h index b871742accb7..8911a68198de 100644 --- a/include/linux/head.h +++ b/include/linux/head.h @@ -5,7 +5,7 @@ typedef struct desc_struct { unsigned long a,b; } desc_table[256]; -extern unsigned long pg_dir[1024]; +extern unsigned long swapper_pg_dir[1024]; extern desc_table idt,gdt; #define GDT_NUL 0 diff --git a/include/linux/limits.h b/include/linux/limits.h index 1de038822b99..f3912fa3d32a 100644 --- a/include/linux/limits.h +++ b/include/linux/limits.h @@ -8,6 +8,7 @@ #define NR_FILE 128 #define NR_SUPER 8 #define NR_HASH 997 +#define NR_FILE_LOCKS 32 #define BLOCK_SIZE 1024 #define BLOCK_SIZE_BITS 10 #define MAX_CHRDEV 16 diff --git a/include/linux/minix_fs_i.h b/include/linux/minix_fs_i.h new file mode 100644 index 000000000000..dabe5afa617c --- /dev/null +++ 
b/include/linux/minix_fs_i.h @@ -0,0 +1,10 @@ +#ifndef _MINIX_FS_I +#define _MINIX_FS_I + +/* + * minix fs inode data in memory + */ +struct minix_inode_info { +}; + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index a7b6af23b65a..9a0ff3e478a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -57,10 +57,12 @@ extern void rw_swap_page(int rw, unsigned int nr, char * buf); /* memory.c */ extern unsigned long get_free_page(int priority); -extern unsigned long put_dirty_page(unsigned long page,unsigned long address); +extern unsigned long put_dirty_page(struct task_struct * tsk,unsigned long page, + unsigned long address); extern void free_page(unsigned long addr); -extern int free_page_tables(unsigned long from,unsigned long size); -extern int copy_page_tables(unsigned long from,unsigned long to,long size); +extern void free_page_tables(struct task_struct * tsk); +extern void clear_page_tables(struct task_struct * tsk); +extern int copy_page_tables(struct task_struct * new); extern int unmap_page_range(unsigned long from, unsigned long size); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int permiss); @@ -82,7 +84,7 @@ extern void swap_free(unsigned int page_nr); extern void swap_in(unsigned long *table_ptr); #define invalidate() \ -__asm__("movl %%eax,%%cr3"::"a" (0)) +__asm__ __volatile__("movl %%cr3,%%eax\n\tmovl %%eax,%%cr3":::"ax") extern unsigned long low_memory; extern unsigned long high_memory; diff --git a/include/linux/msdos_fs_i.h b/include/linux/msdos_fs_i.h new file mode 100644 index 000000000000..bd900c02c188 --- /dev/null +++ b/include/linux/msdos_fs_i.h @@ -0,0 +1,10 @@ +#ifndef _MSDOS_FS_I +#define _MSDOS_FS_I + +/* + * msdos file system inode data in memory + */ +struct msdos_inode_info { +}; + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 21a3fbbe4606..fab929f41df8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3,33 +3,22 @@ #define HZ 
100 +/* + * This is the maximum nr of tasks - change it if you need to + */ #define NR_TASKS 64 -#define TASK_SIZE 0x04000000 -#define LIBRARY_SIZE 0x00400000 + +/* + * User space process size: 3GB. This is hardcoded into a few places, + * so don't change it unless you know what you are doing. + */ +#define TASK_SIZE 0xc0000000 /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. */ #define IO_BITMAP_SIZE 32 -#if (TASK_SIZE & 0x3fffff) -#error "TASK_SIZE must be multiple of 4M" -#endif - -#if (LIBRARY_SIZE & 0x3fffff) -#error "LIBRARY_SIZE must be a multiple of 4M" -#endif - -#if (LIBRARY_SIZE >= (TASK_SIZE/2)) -#error "LIBRARY_SIZE too damn big!" -#endif - -#if (((TASK_SIZE>>16)*NR_TASKS) != 0x10000) -#error "TASK_SIZE*NR_TASKS must be 4GB" -#endif - -#define LIBRARY_OFFSET (TASK_SIZE - LIBRARY_SIZE) - #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) @@ -43,6 +32,7 @@ #include #include #include +#include #if (NR_OPEN > 32) #error "Currently the close-on-exec-flags and select masks are in one long, max 32 files/proc" @@ -115,6 +105,7 @@ struct task_struct { long signal; struct sigaction sigaction[32]; long blocked; /* bitmap of masked signals */ + unsigned long saved_kernel_stack; /* various fields */ int exit_code; int dumpable:1; @@ -146,6 +137,7 @@ struct task_struct { unsigned short used_math; unsigned short rss; /* number of resident pages */ char comm[8]; + struct vm86_struct * vm86_info; /* file system info */ int link_count; int tty; /* -1 if no tty, so it must be signed */ @@ -157,6 +149,7 @@ struct task_struct { struct inode * library; unsigned long start; unsigned long length; + unsigned long bss; } libraries[MAX_SHARED_LIBS]; int numlibraries; struct file * filp[NR_OPEN]; @@ -173,9 +166,6 @@ struct task_struct { #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ /* Not implemented yet, only for 486*/ #define PF_PTRACED 0x00000010 /* set if ptrace (0) has been called. 
*/ -#define PF_VM86 0x00000020 /* set if process can execute a vm86 */ - /* task. */ - /* not impelmented. */ /* * INIT_TASK is used to set up the first task table, touch at @@ -183,7 +173,7 @@ struct task_struct { */ #define INIT_TASK \ /* state etc */ { 0,15,15, \ -/* signals */ 0,{{},},0, \ +/* signals */ 0,{{},},0,0, \ /* ec,brk... */ 0,0,0,0,0,0,0,0, \ /* pid etc.. */ 0,0,0,0, \ /* suppl grps*/ {NOGROUP,}, \ @@ -199,15 +189,16 @@ struct task_struct { /* math */ 0, \ /* rss */ 2, \ /* comm */ "swapper", \ +/* vm86_info */ NULL, \ /* fs info */ 0,-1,0022,NULL,NULL,NULL, \ /* libraries */ { { NULL, 0, 0}, }, 0, \ /* filp */ {NULL,}, 0, \ { \ {0,0}, \ -/* ldt */ {0x9f,0xc0fa00}, \ - {0x9f,0xc0f200} \ +/* ldt */ {0x9f,0xc0c0fa00}, \ + {0x9f,0xc0c0f200} \ }, \ -/*tss*/ {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\ +/*tss*/ {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&swapper_pg_dir,\ 0,0,0,0,0,0,0,0, \ 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \ _LDT(0),0x80000000,{0xffffffff}, \ diff --git a/include/linux/sys.h b/include/linux/sys.h index a479faa9b05a..885fe2b1e25f 100644 --- a/include/linux/sys.h +++ b/include/linux/sys.h @@ -115,6 +115,7 @@ extern int sys_newuname(); extern int sys_iopl(); extern int sys_vhangup(); extern int sys_idle(); +extern int sys_vm86(); fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read, sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link, @@ -137,7 +138,7 @@ sys_truncate, sys_ftruncate, sys_fchmod, sys_fchown, sys_getpriority, sys_setpriority, sys_profil, sys_statfs, sys_fstatfs, sys_ioperm, sys_socketcall, sys_syslog, sys_setitimer, sys_getitimer, sys_newstat, sys_newlstat, sys_newfstat, sys_newuname, sys_iopl, sys_vhangup, -sys_idle }; +sys_idle, sys_vm86 }; /* So we don't have to do any more manual updating.... 
*/ int NR_syscalls = sizeof(sys_call_table)/sizeof(fn_ptr); diff --git a/include/linux/unistd.h b/include/linux/unistd.h index 569c552922f8..a15853fc695a 100644 --- a/include/linux/unistd.h +++ b/include/linux/unistd.h @@ -119,6 +119,7 @@ #define __NR_iopl 110 #define __NR_vhangup 111 #define __NR_idle 112 +#define __NR_vm86 113 extern int errno; diff --git a/include/linux/vm86.h b/include/linux/vm86.h new file mode 100644 index 000000000000..96b8959c5fcd --- /dev/null +++ b/include/linux/vm86.h @@ -0,0 +1,55 @@ +#ifndef _LINUX_VM86_H +#define _LINUX_VM86_H + +#define VM_MASK 0x00020000 + +/* + * This is the stack-layout when we have done a "SAVE_ALL" from vm86 + * mode - the main change is that the old segment descriptors aren't + * useful any more and are forced to be zero by the kernel (and the + * hardware when a trap occurs), and the real segment descriptors are + * at the end of the structure. Look at ptrace.h to see the "normal" + * setup. + */ + +struct vm86_regs { +/* + * normal regs, with special meaning for the segment descriptors.. + */ + long ebx; + long ecx; + long edx; + long esi; + long edi; + long ebp; + long eax; + long __null_ds; + long __null_es; + long __null_fs; + long __null_gs; + long orig_eax; + long eip; + long cs; + long eflags; + long esp; + long ss; +/* + * these are specific to v86 mode: + */ + long es; + long ds; + long fs; + long gs; +}; + +/* + * flags isn't even used yet: it's just there as an example of + * what kind of information we might want to give sys_vm86() (or + * want it to return to us). 
+ */ +struct vm86_struct { + struct vm86_regs regs; + unsigned long flags; +}; + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 695e77c5a352..3bb280ed71d9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,25 +29,25 @@ kernel.o: $(OBJS) sync kernelsubdirs: dummy - @for i in $(SUBDIRS); do (cd $$i; echo $$i; $(MAKE)) || exit; done + @for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE)) || exit; done sys_call.s: sys_call.S sys_call.o: sys_call.s sched.o: sched.c - $(CC) $(CFLAGS) -fno-omit-frame-pointer -c $< + $(CC) $(CFLAGS) $(PROFILING) -fno-omit-frame-pointer -c $< clean: rm -f core *.o *.a tmp_make sys_call.s for i in *.c;do rm -f `basename $$i .c`.s;done - for i in $(SUBDIRS); do (cd $$i; $(MAKE) clean); done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) clean); done dep: sed '/\#\#\# Dependencies/q' < Makefile > tmp_make for i in *.c;do $(CPP) -M $$i;done >> tmp_make cp tmp_make Makefile - for i in $(SUBDIRS); do (cd $$i; $(MAKE) dep) || exit; done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) dep) || exit; done dummy: diff --git a/kernel/blk_drv/Makefile b/kernel/blk_drv/Makefile index 232a0c8d8924..55d48ca645ee 100644 --- a/kernel/blk_drv/Makefile +++ b/kernel/blk_drv/Makefile @@ -28,18 +28,18 @@ blk_drv.a: $(OBJS) sync scsisubdirs: dummy - @for i in $(SUBDIRS); do (cd $$i; echo $$i; $(MAKE)) || exit; done + @for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE)) || exit; done clean: rm -f core *.o *.a tmp_make for i in *.c;do rm -f `basename $$i .c`.s;done - for i in $(SUBDIRS); do (cd $$i; $(MAKE) clean); done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) clean); done dep: sed '/\#\#\# Dependencies/q' < Makefile > tmp_make for i in *.c;do $(CPP) -M $$i;done >> tmp_make cp tmp_make Makefile - for i in $(SUBDIRS); do (cd $$i; $(MAKE) dep); done + for i in $(SUBDIRS); do (cd $$i && $(MAKE) dep); done dummy: diff --git a/kernel/blk_drv/blk.h b/kernel/blk_drv/blk.h index 70dab1bfdfc7..2288c57fc1e7 100644 --- a/kernel/blk_drv/blk.h +++ 
b/kernel/blk_drv/blk.h @@ -69,6 +69,7 @@ extern struct wait_queue * wait_for_request; extern int * blk_size[NR_BLK_DEV]; +extern unsigned long hd_init(unsigned long mem_start, unsigned long mem_end); extern int is_read_only(int dev); extern void set_device_ro(int dev,int flag); diff --git a/kernel/blk_drv/hd.c b/kernel/blk_drv/hd.c index f8d7d2102d28..77c76fcbbf88 100644 --- a/kernel/blk_drv/hd.c +++ b/kernel/blk_drv/hd.c @@ -53,7 +53,8 @@ static inline unsigned char CMOS_READ(unsigned char addr) static void recal_intr(void); static void bad_rw_intr(void); -static int recalibrate = 0; +static char recalibrate[ MAX_HD ] = { 0, }; + static int reset = 0; #if (HD_DELAY > 0) @@ -221,6 +222,8 @@ void unexpected_hd_interrupt(void) static void bad_rw_intr(void) { + int i; + if (!CURRENT) return; if (++CURRENT->errors >= MAX_ERRORS) @@ -228,7 +231,8 @@ static void bad_rw_intr(void) else if (CURRENT->errors > MAX_ERRORS/2) reset = 1; else - recalibrate = 1; + for (i=0; i < NR_HD; i++) + recalibrate[i] = 1; } static inline int wait_DRQ(void) @@ -378,7 +382,7 @@ static void hd_times_out(void) static void do_hd_request(void) { unsigned int block,dev; - unsigned int sec,head,cyl; + unsigned int sec,head,cyl,track; unsigned int nsect; repeat: @@ -399,24 +403,26 @@ repeat: } block += hd[dev].start_sect; dev >>= 6; - sec = block % hd_info[dev].sect; - block /= hd_info[dev].sect; - head = block % hd_info[dev].head; - cyl = block / hd_info[dev].head; - sec++; + sec = block % hd_info[dev].sect + 1; + track = block / hd_info[dev].sect; + head = track % hd_info[dev].head; + cyl = track / hd_info[dev].head; #ifdef DEBUG printk("hd%d : cyl = %d, head = %d, sector = %d, buffer = %08x\n", dev, cyl, head, sec, CURRENT->buffer); #endif cli(); if (reset) { - recalibrate = 1; + int i; + + for (i=0; i < NR_HD; i++) + recalibrate[i] = 1; reset_hd(); sti(); return; } - if (recalibrate) { - recalibrate = 0; + if (recalibrate[dev]) { + recalibrate[dev] = 0; 
hd_out(dev,hd_info[dev].sect,0,0,0,WIN_RESTORE,&recal_intr); if (reset) goto repeat; @@ -434,13 +440,16 @@ repeat: } port_write(HD_DATA,CURRENT->buffer,256); sti(); - } else if (CURRENT->cmd == READ) { + return; + } + if (CURRENT->cmd == READ) { hd_out(dev,nsect,sec,head,cyl,WIN_READ,&read_intr); if (reset) goto repeat; sti(); - } else - panic("unknown hd-command"); + return; + } + panic("unknown hd-command"); } static int hd_ioctl(struct inode * inode, struct file * file, @@ -481,7 +490,6 @@ static void hd_release(struct inode * inode, struct file * file) sync_dev(inode->i_rdev); } - static void hd_geninit(); static struct gendisk hd_gendisk = { @@ -500,11 +508,11 @@ static struct gendisk hd_gendisk = { static void hd_geninit(void) { - int drive; + int drive, i; #ifndef HD_TYPE extern struct drive_info drive_info; void *BIOS = (void *) &drive_info; - int cmos_disks, i; + int cmos_disks; for (drive=0 ; drive<2 ; drive++) { hd_info[drive].cyl = *(unsigned short *) BIOS; @@ -593,7 +601,7 @@ static struct sigaction hd_sigaction = { NULL }; -unsigned long hd_init(unsigned long mem_start) +unsigned long hd_init(unsigned long mem_start, unsigned long mem_end) { blk_dev[MAJOR_NR].request_fn = DEVICE_REQUEST; blkdev_fops[MAJOR_NR] = &hd_fops; diff --git a/kernel/blk_drv/ll_rw_blk.c b/kernel/blk_drv/ll_rw_blk.c index 1a5f6404c0a9..058900966c73 100644 --- a/kernel/blk_drv/ll_rw_blk.c +++ b/kernel/blk_drv/ll_rw_blk.c @@ -102,9 +102,6 @@ void set_device_ro(int dev,int flag) * add-request adds a request to the linked list. * It disables interrupts so that it can muck with the * request-lists in peace. - * - * Note that swapping requests always go before other requests, - * and are done in the order they appear. 
*/ static void add_request(struct blk_dev_struct * dev, struct request * req) { @@ -121,11 +118,6 @@ static void add_request(struct blk_dev_struct * dev, struct request * req) return; } for ( ; tmp->next ; tmp = tmp->next) { - if (!req->bh) - if (tmp->next->bh) - break; - else - continue; if ((IN_ORDER(tmp,req) || !IN_ORDER(tmp,tmp->next)) && IN_ORDER(req,tmp->next)) @@ -208,9 +200,10 @@ repeat: sti(); goto repeat; -found: sti(); +found: /* fill up the request-info, and add it to the queue */ req->dev = bh->b_dev; + sti(); req->cmd = rw; req->errors = 0; req->sector = sector; diff --git a/kernel/chr_drv/mem.c b/kernel/chr_drv/mem.c index 14962393b921..6caba5b6ea83 100644 --- a/kernel/chr_drv/mem.c +++ b/kernel/chr_drv/mem.c @@ -38,7 +38,7 @@ static int read_mem(struct inode * inode, struct file * file,char * buf, int cou while (count > 0) { if (current->signal & ~current->blocked) break; - pde = (unsigned long) pg_dir + (addr >> 20 & 0xffc); + pde = current->tss.cr3 + (addr >> 20 & 0xffc); pte = *(unsigned long *) pde; if (!(pte & PAGE_PRESENT)) break; @@ -75,7 +75,7 @@ static int write_mem(struct inode * inode, struct file * file,char * buf, int co while (count > 0) { if (current->signal & ~current->blocked) break; - pde = (unsigned long) pg_dir + (addr >> 20 & 0xffc); + pde = current->tss.cr3 + (addr >> 20 & 0xffc); pte = *(unsigned long *) pde; if (!(pte & PAGE_PRESENT)) break; diff --git a/kernel/exit.c b/kernel/exit.c index 8cc5451a52ef..11ec282459a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -321,8 +321,7 @@ volatile void do_exit(long code) int i; fake_volatile: - free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); - free_page_tables(get_base(current->ldt[2]),get_limit(0x17)); + free_page_tables(current); for (i=0 ; ifilp[i]) sys_close(i); diff --git a/kernel/fork.c b/kernel/fork.c index e80abe8f7050..8024f29b58a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -56,15 +56,12 @@ int copy_mem(int nr,struct task_struct * p) } if (data_limit < 
code_limit) panic("Bad data_limit"); - new_data_base = new_code_base = nr * TASK_SIZE; + new_data_base = old_data_base; + new_code_base = old_code_base; p->start_code = new_code_base; set_base(p->ldt[1],new_code_base); set_base(p->ldt[2],new_data_base); - if (copy_page_tables(old_data_base,new_data_base,data_limit)) { - free_page_tables(new_data_base,data_limit); - return -ENOMEM; - } - return 0; + return copy_page_tables(p); } static int find_empty_process(void) diff --git a/kernel/math/emulate.c b/kernel/math/emulate.c index 1df0691ba030..9c86a6b3b082 100644 --- a/kernel/math/emulate.c +++ b/kernel/math/emulate.c @@ -62,6 +62,9 @@ static void do_emu(struct info * info) else I387.swd &= 0x7fff; ORIG_EIP = EIP; +/* We cannot handle emulation in v86-mode */ + if (EFLAGS & 0x00020000) + math_abort(info,SIGILL); /* 0x0007 means user code space */ if (CS != 0x000F) { printk("math_emulate: %04x:%08x\n\r",CS,EIP); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 07f3a8d5f48c..73575abf94af 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -88,7 +88,6 @@ static unsigned long get_long(struct task_struct * tsk, { unsigned long page; - addr += tsk->start_code; repeat: page = tsk->tss.cr3 + ((addr >> 20) & 0xffc); page = *(unsigned long *) page; @@ -117,7 +116,6 @@ static void put_long(struct task_struct * tsk, unsigned long addr, { unsigned long page; - addr += tsk->start_code; repeat: page = tsk->tss.cr3 + ((addr >> 20) & 0xffc); page = *(unsigned long *) page; diff --git a/kernel/sched.c b/kernel/sched.c index 1e8a710a8663..9a4baea2447e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -393,7 +393,7 @@ struct timer_struct timer_table[32]; * irq uses this to decide if it should update the user or system * times. 
*/ -static void do_timer(int regs) +static void do_timer(struct pt_regs * regs) { unsigned long mask; struct timer_struct *tp = timer_table+0; @@ -401,7 +401,7 @@ static void do_timer(int regs) static int avg_cnt = 0; jiffies++; - if (3 & ((struct pt_regs *) regs)->cs) { + if ((VM_MASK & regs->eflags) || (3 & regs->cs)) { current->utime++; /* Update ITIMER_VIRT for current task if not in a system call */ if (current->it_virt_value && !(--current->it_virt_value)) { @@ -412,7 +412,7 @@ static void do_timer(int regs) current->stime++; #ifdef PROFILE_SHIFT if (prof_buffer && current != task[0]) { - unsigned long eip = ((struct pt_regs *) regs)->eip; + unsigned long eip = regs->eip; eip >>= PROFILE_SHIFT; if (eip < prof_len) prof_buffer[eip]++; @@ -543,5 +543,5 @@ void sched_init(void) outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff , 0x40); /* LSB */ outb(LATCH >> 8 , 0x40); /* MSB */ - request_irq(TIMER_IRQ,do_timer); + request_irq(TIMER_IRQ,(void (*)(int)) do_timer); } diff --git a/kernel/signal.c b/kernel/signal.c index ca850fa7b121..85b248b95f97 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -135,11 +135,6 @@ int do_signal(long signr,struct pt_regs * regs) int longs; unsigned long * tmp_esp; -#ifdef notdef - printk("pid: %d, signr: %x, eax=%d, oeax = %d, int=%d\n", - current->pid, signr, regs->eax, regs->orig_eax, - sa->sa_flags & SA_INTERRUPT); -#endif sa_handler = (unsigned long) sa->sa_handler; if ((regs->orig_eax != -1) && ((regs->eax == -ERESTARTSYS) || (regs->eax == -ERESTARTNOINTR))) { diff --git a/kernel/sys.c b/kernel/sys.c index 94a8de54e765..7368805f1c39 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,12 +13,14 @@ #include #include #include +#include #include +#include #include /* - * this indicates wether you can reboot with ctrl-alt-del: the deault is yes + * this indicates wether you can reboot with ctrl-alt-del: the default is yes */ static int C_A_D = 1; @@ -128,6 +130,53 @@ int sys_prof() return -ENOSYS; } 
+unsigned long save_v86_state(int signr,struct vm86_regs * regs) +{ + unsigned long stack; + + if (!current->vm86_info) { + printk("no vm86_info: BAD\n"); + do_exit(SIGSEGV); + } + memcpy_tofs(&(current->vm86_info->regs),regs,sizeof(*regs)); + stack = current->tss.esp0; + current->tss.esp0 = current->saved_kernel_stack; + current->saved_kernel_stack = 0; + return stack; +} + +int sys_vm86(struct vm86_struct * v86) +{ + struct vm86_struct info; + struct pt_regs * pt_regs = (struct pt_regs *) &v86; + + if (current->saved_kernel_stack) + return -EPERM; + memcpy_fromfs(&info,v86,sizeof(info)); +/* + * make sure the vm86() system call doesn't try to do anything silly + */ + info.regs.__null_ds = 0; + info.regs.__null_es = 0; + info.regs.__null_fs = 0; + info.regs.__null_gs = 0; +/* + * The eflags register is also special: we cannot trust that the user + * has set it up safely, so this makes sure interrupt etc flags are + * inherited from protected mode. + */ + info.regs.eflags &= 0x00000dd5; + info.regs.eflags |= 0xfffff22a & pt_regs->eflags; + info.regs.eflags |= VM_MASK; + current->saved_kernel_stack = current->tss.esp0; + current->tss.esp0 = (unsigned long) pt_regs; + current->vm86_info = v86; + __asm__ __volatile__("movl %0,%%esp\n\t" + "pushl $ret_from_sys_call\n\t" + "ret"::"g" ((long) &(info.regs)),"a" (info.regs.eax)); + return 0; +} + extern void hard_reset_now(void); /* diff --git a/kernel/sys_call.S b/kernel/sys_call.S index 44c072326c55..916455ef628a 100644 --- a/kernel/sys_call.S +++ b/kernel/sys_call.S @@ -58,6 +58,10 @@ EFLAGS = 0x38 OLDESP = 0x3C OLDSS = 0x40 +IF_MASK = 0x00000200 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + /* * these are offsets into the task-struct. 
*/ @@ -67,6 +71,7 @@ priority = 8 signal = 12 sigaction = 16 # MUST be 16 (=len of sigaction) blocked = (33*16) +saved_kernel_stack = ((33*16)+4) /* * offsets within sigaction @@ -121,11 +126,17 @@ _system_call: movl %eax,EAX(%esp) # save the return value .align 4,0x90 ret_from_sys_call: + movl EFLAGS(%esp),%eax + testl $VM_MASK,%eax + jne 1f cmpw $0x0f,CS(%esp) # was old code segment supervisor ? jne 2f cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ? jne 2f -1: cmpl $0,_need_resched +1: orl $IF_MASK,%eax # these just try to make sure + andl $~NT_MASK,%eax # the program doesn't do anything + movl %eax,EFLAGS(%esp) # stupid + cmpl $0,_need_resched jne reschedule movl _current,%eax cmpl _task,%eax # task[0] cannot have signals @@ -141,10 +152,18 @@ ret_from_sys_call: bsfl %ecx,%ecx je 2f btrl %ecx,%ebx + incl %ecx movl %ebx,signal(%eax) movl %esp,%ebx + testl $VM_MASK,EFLAGS(%esp) + je 3f pushl %ebx - incl %ecx + pushl %ecx + call _save_v86_state + popl %ecx + movl %eax,%ebx + movl %eax,%esp +3: pushl %ebx pushl %ecx call _do_signal popl %ecx diff --git a/kernel/traps.c b/kernel/traps.c index 8d7d0397f122..9a5f086c6885 100644 --- a/kernel/traps.c +++ b/kernel/traps.c @@ -63,7 +63,7 @@ static void die_if_kernel(char * str,long esp_ptr,long nr) long * esp = (long *) esp_ptr; int i; - if ((0xffff & esp[1]) == 0xf) + if ((esp[2] & VM_MASK) || ((0xffff & esp[1]) == 0xf)) return; printk("%s: %04x\n\r",str,nr&0xffff); printk("EIP: %04x:%p\nEFLAGS: %p\n", 0xffff & esp[1],esp[0],esp[2]); diff --git a/mm/memory.c b/mm/memory.c index 1595b4f1d949..5fd804b63bdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -34,11 +34,9 @@ #include #include #include +#include #include -#define CODE_SPACE(addr) ((((addr)+4095)&~4095) < \ -current->start_code + current->end_code) - unsigned long low_memory = 0; unsigned long high_memory = 0; unsigned long free_page_list = 0; @@ -85,45 +83,92 @@ void free_page(unsigned long addr) printk("trying to free free page (%08x): memory probably 
corrupted\n",addr); } +static void free_one_table(unsigned long * page_dir) +{ + int j; + unsigned long pg_table = *page_dir; + unsigned long * page_table; + + if (!pg_table) + return; + if (!(pg_table & 1)) { + printk("Bad page table: [%08x]=%08x\n",page_dir,pg_table); + *page_dir = 0; + return; + } + *page_dir = 0; + if (pg_table < low_memory) + return; + page_table = (unsigned long *) (pg_table & 0xfffff000); + for (j = 0 ; j < 1024 ; j++,page_table++) { + unsigned long pg = *page_table; + + if (!pg) + continue; + *page_table = 0; + if (1 & pg) + free_page(0xfffff000 & pg); + else + swap_free(pg >> 1); + } + free_page(0xfffff000 & pg_table); +} + /* - * This function frees a continuos block of page tables, as needed - * by 'exit()'. As does copy_page_tables(), this handles only 4Mb blocks. + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. Note that + * unlike 'free_page_tables()', this function still leaves a valid + * page-table-tree in memory: it just removes the user pages. The two + * functions are similar, but there is a fundamental difference. */ -int free_page_tables(unsigned long from,unsigned long size) +void clear_page_tables(struct task_struct * tsk) { - unsigned long page; - unsigned long page_dir; - unsigned long *pg_table; - unsigned long * dir, nr; + int i; + unsigned long * page_dir; - if (from & 0x3fffff) - panic("free_page_tables called with wrong alignment"); - if (!from) + if (!tsk) + return; + if (tsk == task[0]) + panic("task[0] (swapper) doesn't support exec() yet\n"); + page_dir = (unsigned long *) tsk->tss.cr3; + if (!page_dir) { + printk("Trying to clear kernel page-directory: not good\n"); + return; + } + for (i = 0 ; i < 768 ; i++,page_dir++) + free_one_table(page_dir); + invalidate(); + return; +} + +/* + * This function frees up all page tables of a process when it exits. 
+ */ +void free_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) { + printk("task[0] (swapper) killed: unable to recover\n"); panic("Trying to free up swapper memory space"); - size = (size + 0x3fffff) >> 22; - dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ - for ( ; size-->0 ; dir++) { - if (!(page_dir = *dir)) - continue; - *dir = 0; - if (!(page_dir & 1)) { - printk("free_page_tables: bad page directory."); - continue; - } - pg_table = (unsigned long *) (0xfffff000 & page_dir); - for (nr=0 ; nr<1024 ; nr++,pg_table++) { - if (!(page = *pg_table)) - continue; - *pg_table = 0; - if (1 & page) - free_page(0xfffff000 & page); - else - swap_free(page >> 1); - } - free_page(0xfffff000 & page_dir); } + pg_dir = tsk->tss.cr3; + if (!pg_dir) { + printk("Trying to free kernel page-directory: not good\n"); + return; + } + tsk->tss.cr3 = (unsigned long) swapper_pg_dir; + if (tsk == current) + __asm__ __volatile__("movl %0,%%cr3"::"a" (tsk->tss.cr3)); + page_dir = (unsigned long *) pg_dir; + for (i = 0 ; i < 1024 ; i++,page_dir++) + free_one_table(page_dir); + free_page(pg_dir); invalidate(); - return 0; } /* @@ -143,66 +188,80 @@ int free_page_tables(unsigned long from,unsigned long size) * 1 Mb-range, so the pages can be shared with the kernel. Thus the * special case for nr=xxxx. 
*/ -int copy_page_tables(unsigned long from,unsigned long to,long size) +int copy_page_tables(struct task_struct * tsk) { - unsigned long * from_page_table; - unsigned long * to_page_table; - unsigned long this_page; - unsigned long * from_dir, * to_dir; - unsigned long new_page; - unsigned long nr; - - if ((from&0x3fffff) || (to&0x3fffff)) - panic("copy_page_tables called with wrong alignment"); - from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ - to_dir = (unsigned long *) ((to>>20) & 0xffc); - size = ((unsigned) (size+0x3fffff)) >> 22; - for( ; size-->0 ; from_dir++,to_dir++) { - if (*to_dir) - printk("copy_page_tables: already exist, " - "probable memory corruption\n"); - if (!*from_dir) + int i; + unsigned long temp_page = 0; + unsigned long old_pg_dir, *old_page_dir; + unsigned long new_pg_dir, *new_page_dir; + + old_pg_dir = current->tss.cr3; + new_pg_dir = get_free_page(GFP_KERNEL); + if (!new_pg_dir) + return -ENOMEM; + tsk->tss.cr3 = new_pg_dir; + old_page_dir = (unsigned long *) old_pg_dir; + new_page_dir = (unsigned long *) new_pg_dir; + for (i = 0 ; i < 1024 ; i++,old_page_dir++,new_page_dir++) { + int j; + unsigned long old_pg_table, *old_page_table; + unsigned long new_pg_table, *new_page_table; + + old_pg_table = *old_page_dir; + if (!old_pg_table) continue; - if (!(1 & *from_dir)) { + if (!(1 & old_pg_table)) { printk("copy_page_tables: page table swapped out, " "probable memory corruption"); - *from_dir = 0; + *old_page_dir = 0; + continue; + } + if (old_pg_table < low_memory) { + *new_page_dir = old_pg_table; continue; } - from_page_table = (unsigned long *) (0xfffff000 & *from_dir); - if (!(to_page_table = (unsigned long *) get_free_page(GFP_KERNEL))) - return -1; /* Out of memory, see freeing */ - *to_dir = ((unsigned long) to_page_table) | PAGE_ACCESSED | 7; - nr = (from==0)?0xA0:1024; - for ( ; nr-- > 0 ; from_page_table++,to_page_table++) { + new_pg_table = get_free_page(GFP_KERNEL); + if (!new_pg_table) { + 
free_page_tables(tsk); + free_page(temp_page); + return -ENOMEM; + } + *new_page_dir = new_pg_table | PAGE_ACCESSED | 7; + old_page_table = (unsigned long *) (0xfffff000 & old_pg_table); + new_page_table = (unsigned long *) (0xfffff000 & new_pg_table); + for (j = 0 ; j < 1024 ; j++,old_page_table++,new_page_table++) { + unsigned long pg; repeat: - this_page = *from_page_table; - if (!this_page) + pg = *old_page_table; + if (!pg) continue; - if (!(1 & this_page)) { - if (!(new_page = get_free_page(GFP_KERNEL))) - return -1; - ++current->rss; - read_swap_page(this_page>>1, (char *) new_page); - if (*from_page_table != this_page) { - free_page(new_page); - goto repeat; - } - *to_page_table = this_page; - *from_page_table = new_page | (PAGE_DIRTY | PAGE_ACCESSED | 7); + if (pg & 1) { + pg &= ~2; + *new_page_table = pg; + if (pg < low_memory) + continue; + *old_page_table = pg; + mem_map[(pg-low_memory)>>12]++; continue; } - this_page &= ~2; - *to_page_table = this_page; - if (this_page > low_memory) { - *from_page_table = this_page; - this_page -= low_memory; - this_page >>= 12; - if (!mem_map[this_page]++) - --nr_free_pages; + if (!temp_page) { + temp_page = get_free_page(GFP_KERNEL); + if (!temp_page) { + free_page_tables(tsk); + return -ENOMEM; + } + goto repeat; } + ++current->rss; + read_swap_page(pg>>1, (char *) temp_page); + if (*old_page_table != pg) + goto repeat; + *new_page_table = pg; + *old_page_table = temp_page | (PAGE_DIRTY | PAGE_ACCESSED | 7); + temp_page = 0; } } + free_page(temp_page); invalidate(); return 0; } @@ -222,7 +281,7 @@ int unmap_page_range(unsigned long from, unsigned long size) if (!from) panic("unmap_page_range trying to free swapper memory space"); size = (size + 0xfff) >> 12; - dir = (unsigned long *) ((from >> 20) & 0xffc); /* _pg_dir = 0 */ + dir = (unsigned long *) (current->tss.cr3 + ((from >> 20) & 0xffc)); poff = (from >> 12) & 0x3ff; if ((pcnt = 1024 - poff) > size) pcnt = size; @@ -284,7 +343,7 @@ int 
remap_page_range(unsigned long from, unsigned long to, unsigned long size, if ((from & 0xfff) || (to & 0xfff)) panic("remap_page_range called with wrong alignment"); - dir = (unsigned long *) ((from >> 20) & 0xffc); /* _pg_dir = 0 */ + dir = (unsigned long *) (current->tss.cr3 + ((from >> 20) & 0xffc)); size = (size + 0xfff) >> 12; poff = (from >> 12) & 0x3ff; if ((pcnt = 1024 - poff) > size) @@ -363,7 +422,7 @@ int remap_page_range(unsigned long from, unsigned long to, unsigned long size, * out of memory (either when trying to access page-table or * page.) */ -static unsigned long put_page(unsigned long page,unsigned long address) +static unsigned long put_page(struct task_struct * tsk,unsigned long page,unsigned long address) { unsigned long tmp, *page_table; @@ -377,13 +436,13 @@ static unsigned long put_page(unsigned long page,unsigned long address) printk("put_page: mem_map disagrees with %p at %p\n",page,address); return 0; } - page_table = (unsigned long *) ((address>>20) & 0xffc); + page_table = (unsigned long *) (tsk->tss.cr3 + ((address>>20) & 0xffc)); if ((*page_table)&1) page_table = (unsigned long *) (0xfffff000 & *page_table); else { tmp = get_free_page(GFP_KERNEL); if (!tmp) { - oom(current); + oom(tsk); tmp = BAD_PAGETABLE; } *page_table = tmp | PAGE_ACCESSED | 7; @@ -406,7 +465,7 @@ static unsigned long put_page(unsigned long page,unsigned long address) * and we want the dirty-status to be correct (for VM). Thus the same * routine, but this time we mark it dirty too. 
*/ -unsigned long put_dirty_page(unsigned long page, unsigned long address) +unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) { unsigned long tmp, *page_table; @@ -416,7 +475,7 @@ unsigned long put_dirty_page(unsigned long page, unsigned long address) printk("put_dirty_page: trying to put page %p at %p\n",page,address); if (mem_map[(page-low_memory)>>12] != 1) printk("mem_map disagrees with %p at %p\n",page,address); - page_table = (unsigned long *) ((address>>20) & 0xffc); + page_table = (unsigned long *) (tsk->tss.cr3 + ((address>>20) & 0xffc)); if ((*page_table)&1) page_table = (unsigned long *) (0xfffff000 & *page_table); else { @@ -491,7 +550,7 @@ void do_wp_page(unsigned long error_code, unsigned long address, { unsigned long pde, pte, page; - pde = (address>>20) & 0xffc; + pde = tsk->tss.cr3 + ((address>>20) & 0xffc); pte = *(unsigned long *) pde; if ((pte & 3) != 3) { printk("do_wp_page: bogus page-table at address %08x (%08x)\n",address,pte); @@ -499,12 +558,6 @@ void do_wp_page(unsigned long error_code, unsigned long address, send_sig(SIGSEGV, tsk, 1); return; } - if (address < TASK_SIZE) { - printk("do_wp_page: kernel WP error at address %08x (%08x)\n",address,pte); - *(unsigned long *) pde = BAD_PAGETABLE | 7; - send_sig(SIGSEGV, tsk, 1); - return; - } pte &= 0xfffff000; pte += (address>>10) & 0xffc; page = *(unsigned long *) pte; @@ -514,7 +567,7 @@ void do_wp_page(unsigned long error_code, unsigned long address, send_sig(SIGSEGV, tsk, 1); return; } - ++current->min_flt; + tsk->min_flt++; un_wp_page((unsigned long *) pte, tsk); } @@ -522,7 +575,7 @@ void write_verify(unsigned long address) { unsigned long page; - page = *(unsigned long *) ((address>>20) & 0xffc); + page = *(unsigned long *) (current->tss.cr3 + ((address>>20) & 0xffc)); if (!(page & PAGE_PRESENT)) return; page &= 0xfffff000; @@ -532,16 +585,16 @@ void write_verify(unsigned long address) return; } -static void get_empty_page(unsigned long 
address) +static void get_empty_page(struct task_struct * tsk, unsigned long address) { unsigned long tmp; tmp = get_free_page(GFP_KERNEL); if (!tmp) { - oom(current); + oom(tsk); tmp = BAD_PAGE; } - if (!put_page(tmp,address)) + if (!put_page(tsk,tmp,address)) free_page(tmp); } @@ -553,7 +606,8 @@ static void get_empty_page(unsigned long address) * NOTE! This assumes we have checked that p != current, and that they * share the same executable or library. */ -static int try_to_share(unsigned long address, struct task_struct * p) +static int try_to_share(unsigned long address, struct task_struct * tsk, + struct task_struct * p) { unsigned long from; unsigned long to; @@ -561,9 +615,8 @@ static int try_to_share(unsigned long address, struct task_struct * p) unsigned long to_page; unsigned long phys_addr; - from_page = to_page = ((address>>20) & 0xffc); - from_page += ((p->start_code>>20) & 0xffc); - to_page += ((current->start_code>>20) & 0xffc); + from_page = p->tss.cr3 + ((address>>20) & 0xffc); + to_page = tsk->tss.cr3 + ((address>>20) & 0xffc); /* is there a page-directory at from? */ from = *(unsigned long *) from_page; if (!(from & 1)) @@ -607,7 +660,7 @@ static int try_to_share(unsigned long address, struct task_struct * p) * We first check if it is at all feasible by checking executable->i_count. * It should be >1 if there are other tasks sharing this inode. 
*/ -static int share_page(struct inode * inode, unsigned long address) +static int share_page(struct task_struct * tsk, struct inode * inode, unsigned long address) { struct task_struct ** p; int i; @@ -617,19 +670,16 @@ static int share_page(struct inode * inode, unsigned long address) for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) { if (!*p) continue; - if (current == *p) + if (tsk == *p) continue; - if (address < LIBRARY_OFFSET) { - if (inode != (*p)->executable) - continue; - } else { + if (inode != (*p)->executable) { for (i=0; i < (*p)->numlibraries; i++) if (inode == (*p)->libraries[i].library) break; if (i >= (*p)->numlibraries) continue; } - if (try_to_share(address,*p)) + if (try_to_share(address,tsk,*p)) return 1; } return 0; @@ -671,15 +721,7 @@ void do_no_page(unsigned long error_code, unsigned long address, unsigned int block,i; struct inode * inode; - if (address < TASK_SIZE) { - printk("\n\rBAD!! KERNEL PAGE MISSING\n\r"); - do_exit(SIGSEGV); - } - if (address - tsk->start_code >= TASK_SIZE) { - printk("Bad things happen: nonexistent page error in do_no_page\n\r"); - do_exit(SIGSEGV); - } - page = get_empty_pgtable((unsigned long *) ((address >> 20) & 0xffc)); + page = get_empty_pgtable((unsigned long *) (tsk->tss.cr3 + ((address >> 20) & 0xffc))); if (!page) return; page &= 0xfffff000; @@ -696,53 +738,56 @@ void do_no_page(unsigned long error_code, unsigned long address, return; } address &= 0xfffff000; - tmp = address - tsk->start_code; inode = NULL; block = 0; - if (tmp < tsk->end_data) { + if (address < tsk->end_data) { inode = tsk->executable; - block = 1 + tmp / BLOCK_SIZE; + block = 1 + address / BLOCK_SIZE; } else { i = tsk->numlibraries; while (i-- > 0) { - if (tmp < tsk->libraries[i].start) + if (address < tsk->libraries[i].start) continue; - block = tmp - tsk->libraries[i].start; - if (block >= tsk->libraries[i].length) + block = address - tsk->libraries[i].start; + if (block >= tsk->libraries[i].length + tsk->libraries[i].bss) continue; 
inode = tsk->libraries[i].library; - block = 1 + block / BLOCK_SIZE; + if (block < tsk->libraries[i].length) + block = 1 + block / BLOCK_SIZE; + else + block = 0; break; } } if (!inode) { ++tsk->min_flt; - get_empty_page(address); + get_empty_page(tsk,address); if (tsk != current) return; - if (tmp >= LIBRARY_OFFSET || tmp < tsk->brk) + if (address < tsk->brk) return; - if (tmp+8192 >= (user_esp & 0xfffff000)) + if (address+8192 >= (user_esp & 0xfffff000)) return; send_sig(SIGSEGV,tsk,1); return; } - if (tsk == current) - if (share_page(inode,tmp)) { - ++tsk->min_flt; - return; - } + if (share_page(tsk,inode,address)) { + ++tsk->min_flt; + return; + } ++tsk->maj_flt; page = get_free_page(GFP_KERNEL); if (!page) { oom(current); - put_page(BAD_PAGE,address); + put_page(tsk,BAD_PAGE,address); return; } - for (i=0 ; i<4 ; block++,i++) - nr[i] = bmap(inode,block); - bread_page(page,inode->i_dev,nr); - i = tmp + 4096 - tsk->end_data; + if (block) { + for (i=0 ; i<4 ; block++,i++) + nr[i] = bmap(inode,block); + bread_page(page,inode->i_dev,nr); + } + i = address + 4096 - tsk->end_data; if (i>4095) i = 0; tmp = page + 4096; @@ -750,7 +795,7 @@ void do_no_page(unsigned long error_code, unsigned long address, tmp--; *(char *)tmp = 0; } - if (put_page(page,address)) + if (put_page(tsk,page,address)) return; free_page(page); oom(current); @@ -758,9 +803,8 @@ void do_no_page(unsigned long error_code, unsigned long address, void show_mem(void) { - int i,j,k,free=0,total=0; + int i,free=0,total=0; int shared = 0; - unsigned long * pg_tbl; printk("Mem-info:\n\r"); printk("Free pages: %6d\n",nr_free_pages); @@ -776,41 +820,14 @@ void show_mem(void) } printk("%d free pages of %d\n\r",free,total); printk("%d pages shared\n\r",shared); - printk("%d free pages via nr_free_pages\n\r", nr_free_pages); - k = 0; - for(i=4 ; i<1024 ;) { - if (1&pg_dir[i]) { - if (pg_dir[i]>high_memory) { - printk("page directory[%d]: %08X\n\r", - i,pg_dir[i]); - i++; - continue; - } - if 
(pg_dir[i]>low_memory) - free++,k++; - pg_tbl=(unsigned long *) (0xfffff000 & pg_dir[i]); - for(j=0 ; j<1024 ; j++) - if ((pg_tbl[j]&1) && pg_tbl[j]>low_memory) - if (pg_tbl[j]>high_memory) - printk("page_dir[%d][%d]: %08X\n\r", - i,j, pg_tbl[j]); - else - k++,free++; - } - i++; - if (!(i&15) && k) { - k++,free++; /* one page/process for task_struct */ - printk("Process %d: %d pages\n\r",(i>>4)-1,k); - k = 0; - } - } - printk("Memory found: %d (%d)\n\r",free-shared,total); } -/* This routine handles page faults. It determines the address, - and the problem then passes it off to one of the appropriate - routines. */ +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ void do_page_fault(unsigned long *esp, unsigned long error_code) { unsigned long address; @@ -822,13 +839,10 @@ void do_page_fault(unsigned long *esp, unsigned long error_code) user_esp = 0; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); - if (!(error_code & 1)) { + if (!(error_code & 1)) do_no_page(error_code, address, current, user_esp); - return; - } else { + else do_wp_page(error_code, address, current, user_esp); - return; - } } unsigned long mem_init(unsigned long start_mem, unsigned long end_mem) diff --git a/mm/swap.c b/mm/swap.c index cc95a72125fd..ce3d7982eba0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,16 +179,9 @@ int try_to_swap_out(unsigned long * table_ptr) return 1; } -/* - * We never page the pages in task[0] - kernel memory. - * We page all other pages. 
- */ -#define FIRST_VM_PAGE (TASK_SIZE>>12) -#define LAST_VM_PAGE (1024*1024) -#define VM_PAGES (LAST_VM_PAGE - FIRST_VM_PAGE) - -static unsigned int dir_entry = 1024; -static unsigned int page_entry = 0; +static int swap_task = 1; +static int swap_table = 0; +static int swap_page = 0; /* * sys_idle() does nothing much: it just searches for likely candidates for @@ -201,23 +194,32 @@ int sys_idle(void) unsigned long page; need_resched = 1; - if (dir_entry >= 1024) - dir_entry = FIRST_VM_PAGE>>10; - p = task[dir_entry >> 4]; - page = pg_dir[dir_entry]; - if (!(page & 1) || !p || !p->swappable) { - dir_entry++; + if (swap_task >= NR_TASKS) + swap_task = 1; + p = task[swap_task]; + if (!p || !p->swappable) { + swap_task++; + return 0; + } + if (swap_table >= 1024) { + swap_task++; + swap_table = 0; + return 0; + } + page = ((unsigned long *) p->tss.cr3)[swap_table]; + if (!(page & 1) || (page < low_memory)) { + swap_table++; return 0; } page &= 0xfffff000; - if (page_entry >= 1024) { - page_entry = 0; - dir_entry++; + if (swap_page >= 1024) { + swap_page = 0; + swap_table++; return 0; } - page = *(page_entry + (unsigned long *) page); + page = *(swap_page + (unsigned long *) page); if ((page < low_memory) || !(page & PAGE_PRESENT) || (page & PAGE_ACCESSED)) - page_entry++; + swap_page++; return 0; } @@ -231,48 +233,54 @@ int sys_idle(void) */ int swap_out(unsigned int priority) { - int counter = VM_PAGES / 2; + int counter = NR_TASKS; int pg_table; struct task_struct * p; + counter <<= priority; +check_task: + if (counter-- < 0) + return 0; + if (swap_task >= NR_TASKS) { + swap_task = 1; + goto check_task; + } + p = task[swap_task]; + if (!p || !p->swappable) { + swap_task++; + goto check_task; + } check_dir: - if (counter < 0) - goto no_swap; - if (dir_entry >= 1024) - dir_entry = FIRST_VM_PAGE>>10; - if (!(p = task[dir_entry >> 4]) || !p->swappable) { - counter -= 1024; - dir_entry++; + if (swap_table >= 1024) { + swap_table = 0; + swap_task++; + goto check_task; + 
} + pg_table = ((unsigned long *) p->tss.cr3)[swap_table]; + if (pg_table < low_memory) { + swap_table++; goto check_dir; } - if (!(1 & (pg_table = pg_dir[dir_entry]))) { - if (pg_table) { - printk("bad page-table at pg_dir[%d]: %08x\n\r", - dir_entry,pg_table); - pg_dir[dir_entry] = 0; - } - counter -= 1024; - dir_entry++; + if (!(1 & pg_table)) { + printk("bad page-table at pg_dir[%d]: %08x\n\r", + swap_table,pg_table); + ((unsigned long *) p->tss.cr3)[swap_table] = 0; + swap_table++; goto check_dir; } pg_table &= 0xfffff000; check_table: - if (counter < 0) - goto no_swap; - if (page_entry >= 1024) { - page_entry = 0; - dir_entry++; + if (swap_page >= 1024) { + swap_page = 0; + swap_table++; goto check_dir; } - if (try_to_swap_out(page_entry + (unsigned long *) pg_table)) { + if (try_to_swap_out(swap_page + (unsigned long *) pg_table)) { p->rss--; return 1; } - page_entry++; - counter--; + swap_page++; goto check_table; -no_swap: - return 0; } static int try_to_free_page(void) @@ -335,10 +343,8 @@ repeat: } if (priority <= GFP_BUFFER) return 0; - if (try_to_free_page()) { - schedule(); + if (try_to_free_page()) goto repeat; - } return 0; } @@ -355,8 +361,9 @@ int sys_swapon(const char * specialfile) if (!suser()) return -EPERM; - if (!(swap_inode = namei(specialfile))) - return -ENOENT; + i = namei(specialfile,&swap_inode); + if (i) + return i; if (swap_file || swap_device || swap_bitmap || swap_lockmap) { iput(swap_inode); return -EBUSY; diff --git a/net/Makefile b/net/Makefile index 72a28e1a3dca..b61a843e83bd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -27,7 +27,7 @@ net.o: $(OBJS) subdirs subdirs: dummy - for i in $(SUBDIRS); do (cd $$i; echo $$i; $(MAKE)) || exit; done + for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE)) || exit; done clean: rm -f core *.o *.a tmp_make @@ -37,7 +37,7 @@ dep: sed '/\#\#\# Dependencies/q' < Makefile > tmp_make for i in *.c;do $(CPP) -M $$i;done >> tmp_make cp tmp_make Makefile - @for i in $(SUBDIRS); do (cd $$i; echo 
$$i; $(MAKE) dep || exit; done + @for i in $(SUBDIRS); do (cd $$i && echo $$i && $(MAKE) dep) || exit; done dummy: diff --git a/net/unix.c b/net/unix.c index b0a2f10a7eea..26bc918d2bd3 100644 --- a/net/unix.c +++ b/net/unix.c @@ -351,7 +351,7 @@ unix_proto_bind(struct socket *sock, struct sockaddr *umyaddr, set_fs(get_ds()); i = do_mknod(fname, S_IFSOCK | 0777, 0); if (i == 0) - i = open_namei(fname, 0, S_IFSOCK, &upd->inode); + i = open_namei(fname, 0, S_IFSOCK, &upd->inode, NULL); set_fs(old_fs); if (i < 0) { printk("unix_proto_bind: can't open socket %s\n", fname); -- 2.39.5