From 197e67c54abad8c3633bc57d10ddbff917ec344a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:26:29 -0500 Subject: [PATCH] Import 2.3.12pre7 --- arch/alpha/kernel/process.c | 1 - arch/arm/kernel/init_task.c | 1 - arch/i386/kernel/init_task.c | 1 - arch/m68k/kernel/process.c | 1 - arch/mips/kernel/init_task.c | 1 - arch/ppc/kernel/process.c | 1 - arch/sparc/kernel/init_task.c | 1 - arch/sparc64/kernel/init_task.c | 1 - arch/sparc64/solaris/timod.c | 4 +- drivers/block/ll_rw_blk.c | 6 +- drivers/char/Makefile | 2 +- drivers/char/mem.c | 2 + drivers/char/raw.c | 384 ++++++++++++++++++++++++++++++++ drivers/usb/hub.c | 38 +--- fs/Makefile | 2 +- fs/buffer.c | 216 ++++++++++++++++++ fs/exec.c | 4 +- fs/fcntl.c | 118 ++++++++-- fs/file.c | 240 ++++++++++++++++++++ fs/iobuf.c | 136 +++++++++++ fs/ioctl.c | 4 +- fs/open.c | 57 ++++- fs/proc/array.c | 4 +- fs/select.c | 10 +- include/asm-alpha/resource.h | 2 +- include/asm-arm/resource.h | 2 +- include/asm-i386/resource.h | 2 +- include/asm-m68k/resource.h | 2 +- include/asm-mips/resource.h | 2 +- include/asm-ppc/resource.h | 2 +- include/asm-sparc/resource.h | 2 +- include/asm-sparc64/resource.h | 2 +- include/linux/file.h | 34 ++- include/linux/fs.h | 15 +- include/linux/iobuf.h | 80 +++++++ include/linux/limits.h | 2 +- include/linux/major.h | 2 + include/linux/raw.h | 23 ++ include/linux/sched.h | 68 +++++- init/main.c | 2 + kernel/exit.c | 18 +- kernel/fork.c | 110 +++++---- mm/memory.c | 176 +++++++++++++++ 43 files changed, 1616 insertions(+), 165 deletions(-) create mode 100644 drivers/char/raw.c create mode 100644 fs/file.c create mode 100644 fs/iobuf.c create mode 100644 include/linux/iobuf.h create mode 100644 include/linux/raw.h diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 6a0262b8cc72..faaa147f8457 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -55,7 +55,6 @@ unsigned long init_user_stack[1024] = { STACK_MAGIC, }; static 
struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index 5d09ea54009c..e3853f3d58e2 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -6,7 +6,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 364be7c16858..e3ca30fd0bcd 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -8,7 +8,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index cb5e5f781565..a57def1937d7 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -40,7 +40,6 @@ */ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 2ce7885c5cc9..fbf5bf9f09cb 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ 
-6,7 +6,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index 58b6dd04752e..d6f271609469 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -47,7 +47,6 @@ extern unsigned long _get_SP(void); struct task_struct *last_task_used_math = NULL; static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index dc2bc917196f..daa07bb78f83 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -6,7 +6,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sparc64/kernel/init_task.c b/arch/sparc64/kernel/init_task.c index 66869404da95..d256b57610bb 100644 --- a/arch/sparc64/kernel/init_task.c +++ b/arch/sparc64/kernel/init_task.c @@ -6,7 +6,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sparc64/solaris/timod.c 
b/arch/sparc64/solaris/timod.c index cba34c36d9d0..04c9f114e114 100644 --- a/arch/sparc64/solaris/timod.c +++ b/arch/sparc64/solaris/timod.c @@ -866,7 +866,7 @@ asmlinkage int solaris_getmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3) SOLD("entry"); lock_kernel(); - if(fd >= NR_OPEN) goto out; + if(fd >= current->files->max_fds) goto out; filp = current->files->fd[fd]; if(!filp) goto out; @@ -933,7 +933,7 @@ asmlinkage int solaris_putmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3) SOLD("entry"); lock_kernel(); - if(fd >= NR_OPEN) goto out; + if(fd >= current->files->max_fds) goto out; filp = current->files->fd[fd]; if(!filp) goto out; diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index cdfd315b7172..50254c9d8f3c 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -428,6 +428,9 @@ void make_request(int major,int rw, struct buffer_head * bh) kstat.pgpgin++; max_req = NR_REQUEST; /* reads take precedence */ break; + case WRITERAW: + rw = WRITE; + goto do_write; /* Skip the buffer refile */ case WRITEA: rw_ahead = 1; rw = WRITE; /* drop into WRITE */ @@ -435,6 +438,7 @@ void make_request(int major,int rw, struct buffer_head * bh) if (!test_and_clear_bit(BH_Dirty, &bh->b_state)) goto end_io; /* Hmmph! 
Nothing to write */ refile_buffer(bh); + do_write: /* * We don't allow the write-requests to fill up the * queue completely: we want some room for reads, @@ -641,7 +645,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) #endif } - if ((rw == WRITE || rw == WRITEA) && is_read_only(bh[0]->b_dev)) { + if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) { printk(KERN_NOTICE "Can't write to read-only device %s\n", kdevname(bh[0]->b_dev)); goto sorry; diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 050571932956..23af66b2eb79 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -20,7 +20,7 @@ FONTMAPFILE = cp437.uni L_TARGET := char.a M_OBJS := -L_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o +L_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o raw.o LX_OBJS := pty.o misc.o ifdef CONFIG_VT diff --git a/drivers/char/mem.c b/drivers/char/mem.c index d83775f7c46b..986f9007bd84 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -602,6 +603,7 @@ __initfunc(int chr_dev_init(void)) if (register_chrdev(MEM_MAJOR,"mem",&memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); rand_initialize(); + raw_init(); #ifdef CONFIG_USB usb_init(); #endif diff --git a/drivers/char/raw.c b/drivers/char/raw.c new file mode 100644 index 000000000000..a3d0d0278649 --- /dev/null +++ b/drivers/char/raw.c @@ -0,0 +1,384 @@ +/* + * linux/drivers/char/raw.c + * + * Front-end raw character devices. These can be bound to any block + * devices to provide genuine Unix raw character device semantics. + * + * We reserve minor number 0 for a control interface. ioctl()s on this + * device are used to bind the other minor numbers to block devices. + */ + +#include +#include +#include +#include +#include +#include + +#define dprintk(x...) 
+ +static kdev_t raw_device_bindings[256] = {}; +static int raw_device_inuse[256] = {}; +static int raw_device_sector_size[256] = {}; +static int raw_device_sector_bits[256] = {}; + +extern struct file_operations * get_blkfops(unsigned int major); + +static ssize_t rw_raw_dev(int rw, struct file *, char *, size_t, loff_t *); + +ssize_t raw_read(struct file *, char *, size_t, loff_t *); +ssize_t raw_write(struct file *, const char *, size_t, loff_t *); +int raw_open(struct inode *, struct file *); +int raw_release(struct inode *, struct file *); +int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); + + +static struct file_operations raw_fops = { + NULL, /* llseek */ + raw_read, /* read */ + raw_write, /* write */ + NULL, /* readdir */ + NULL, /* poll */ + NULL, /* ioctl */ + NULL, /* mmap */ + raw_open, /* open */ + NULL, /* flush */ + raw_release, /* release */ + NULL /* fsync */ +}; + +static struct file_operations raw_ctl_fops = { + NULL, /* llseek */ + NULL, /* read */ + NULL, /* write */ + NULL, /* readdir */ + NULL, /* poll */ + raw_ctl_ioctl, /* ioctl */ + NULL, /* mmap */ + raw_open, /* open */ + NULL, /* flush */ + NULL, /* no special release code */ + NULL /* fsync */ +}; + + + +void __init raw_init(void) +{ + register_chrdev(RAW_MAJOR, "raw", &raw_fops); +} + + +/* + * The raw IO open and release code needs to fake appropriate + * open/release calls to the underlying block devices. 
+ */ + +static int bdev_open(kdev_t dev, int mode) +{ + int err = 0; + struct file dummy_file = {}; + struct dentry dummy_dentry = {}; + struct inode * inode = get_empty_inode(); + + if (!inode) + return -ENOMEM; + + dummy_file.f_op = get_blkfops(MAJOR(dev)); + if (!dummy_file.f_op) { + err = -ENODEV; + goto done; + } + + if (dummy_file.f_op->open) { + inode->i_rdev = dev; + dummy_dentry.d_inode = inode; + dummy_file.f_dentry = &dummy_dentry; + dummy_file.f_mode = mode; + err = dummy_file.f_op->open(inode, &dummy_file); + } + + done: + iput(inode); + return err; +} + +static int bdev_close(kdev_t dev) +{ + int err; + struct inode * inode = get_empty_inode(); + + if (!inode) + return -ENOMEM; + + inode->i_rdev = dev; + err = blkdev_release(inode); + iput(inode); + return err; +} + + + +/* + * Open/close code for raw IO. + */ + +int raw_open(struct inode *inode, struct file *filp) +{ + int minor; + kdev_t bdev; + int err; + int sector_size; + int sector_bits; + + minor = MINOR(inode->i_rdev); + + /* + * Is it the control device? + */ + + if (minor == 0) { + filp->f_op = &raw_ctl_fops; + return 0; + } + + /* + * No, it is a normal raw device. All we need to do on open is + * to check that the device is bound, and force the underlying + * block device to a sector-size blocksize. + */ + + bdev = raw_device_bindings[minor]; + if (bdev == NODEV) + return -ENODEV; + + err = bdev_open(bdev, filp->f_mode); + if (err) + return err; + + /* + * Don't change the blocksize if we already have users using + * this device + */ + + if (raw_device_inuse[minor]++) + return 0; + + /* + * Don't interfere with mounted devices: we cannot safely set + * the blocksize on a device which is already mounted. 
+ */ + + sector_size = 512; + if (lookup_vfsmnt(bdev) != NULL) { + if (blksize_size[MAJOR(bdev)]) + sector_size = blksize_size[MAJOR(bdev)][MINOR(bdev)]; + } else { + if (hardsect_size[MAJOR(bdev)]) + sector_size = hardsect_size[MAJOR(bdev)][MINOR(bdev)]; + } + + set_blocksize(bdev, sector_size); + raw_device_sector_size[minor] = sector_size; + + for (sector_bits = 0; !(sector_size & 1); ) + sector_size>>=1, sector_bits++; + raw_device_sector_bits[minor] = sector_bits; + + return 0; +} + +int raw_release(struct inode *inode, struct file *filp) +{ + int minor; + kdev_t bdev; + + minor = MINOR(inode->i_rdev); + bdev = raw_device_bindings[minor]; + bdev_close(bdev); + raw_device_inuse[minor]--; + return 0; +} + + + +/* + * Deal with ioctls against the raw-device control interface, to bind + * and unbind other raw devices. + */ + +int raw_ctl_ioctl(struct inode *inode, + struct file *flip, + unsigned int command, + unsigned long arg) +{ + struct raw_config_request rq; + int err = 0; + int minor; + + switch (command) { + case RAW_SETBIND: + case RAW_GETBIND: + + /* First, find out which raw minor we want */ + + err = copy_from_user(&rq, (void *) arg, sizeof(rq)); + if (err) + break; + + minor = rq.raw_minor; + if (minor == 0 || minor > MINORMASK) { + err = -EINVAL; + break; + } + + if (command == RAW_SETBIND) { + /* + * For now, we don't need to check that the underlying + * block device is present or not: we can do that when + * the raw device is opened. Just check that the + * major/minor numbers make sense. 
+ */ + + if (rq.block_major == NODEV || + rq.block_major > MAX_BLKDEV || + rq.block_minor > MINORMASK) { + err = -EINVAL; + break; + } + + if (raw_device_inuse[minor]) { + err = -EBUSY; + break; + } + raw_device_bindings[minor] = + MKDEV(rq.block_major, rq.block_minor); + } else { + rq.block_major = MAJOR(raw_device_bindings[minor]); + rq.block_minor = MINOR(raw_device_bindings[minor]); + err = copy_to_user((void *) arg, &rq, sizeof(rq)); + } + break; + + default: + err = -EINVAL; + } + + return err; +} + + + +ssize_t raw_read(struct file *filp, char * buf, + size_t size, loff_t *offp) +{ + return rw_raw_dev(READ, filp, buf, size, offp); +} + +ssize_t raw_write(struct file *filp, const char *buf, + size_t size, loff_t *offp) +{ + return rw_raw_dev(WRITE, filp, (char *) buf, size, offp); +} + +#define SECTOR_BITS 9 +#define SECTOR_SIZE (1U << SECTOR_BITS) +#define SECTOR_MASK (SECTOR_SIZE - 1) + +ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp) +{ + struct kiobuf * iobuf; + int err; + unsigned long blocknr, blocks; + unsigned long b[KIO_MAX_SECTORS]; + size_t transferred; + int iosize; + int i; + int minor; + kdev_t dev; + unsigned long limit; + + int sector_size, sector_bits, sector_mask; + int max_sectors; + + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = raw_device_bindings[minor]; + sector_size = raw_device_sector_size[minor]; + sector_bits = raw_device_sector_bits[minor]; + sector_mask = sector_size- 1; + max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + dprintk ("rw_raw_dev: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + if ((*offp & sector_mask) || (size & sector_mask)) + return -EINVAL; + if ((*offp >> sector_bits) > limit) + return 0; + + /* + * We'll just use one kiobuf + */ + + err = 
alloc_kiovec(1, &iobuf); + if (err) + return err; + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. + */ + + transferred = 0; + blocknr = *offp >> sector_bits; + while (size > 0) { + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + if (!blocks) + break; + + iosize = blocks << sector_bits; + + err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (err) + break; + + for (i=0; i < blocks; i++) + b[i] = blocknr++; + + err = brw_kiovec(rw, 1, &iobuf, dev, b, sector_size, 0); + + if (err >= 0) { + transferred += err; + size -= err; + buf += err; + } + + unmap_kiobuf(iobuf); + + if (err != iosize) + break; + } + + free_kiovec(1, &iobuf); + + if (transferred) { + *offp += transferred; + return transferred; + } + + return err; +} diff --git a/drivers/usb/hub.c b/drivers/usb/hub.c index 02443f965b8d..8b5e26fb6cc9 100644 --- a/drivers/usb/hub.c +++ b/drivers/usb/hub.c @@ -432,38 +432,14 @@ int usb_hub_init(void) void usb_hub_cleanup(void) { - struct list_head *next, *tmp, *head = &all_hubs_list; - struct usb_hub *hub; - unsigned long flags, flags2; - - /* Free the resources allocated by each hub */ - spin_lock_irqsave(&hub_list_lock, flags); - spin_lock_irqsave(&hub_event_lock, flags2); - - tmp = head->next; - while (tmp != head) { - hub = list_entry(tmp, struct usb_hub, hub_list); - - next = tmp->next; - - list_del(&hub->event_list); - INIT_LIST_HEAD(&hub->event_list); - list_del(tmp); /* &hub->hub_list */ - INIT_LIST_HEAD(tmp); /* &hub->hub_list */ - - /* XXX we should disconnect each connected port here */ - - usb_release_irq(hub->dev, hub->irq_handle); - hub->irq_handle = NULL; - kfree(hub); - - tmp = next; - } - + /* + * Hub resources are freed for us by usb_deregister. 
It + * usb_driver_purge on every device which in turn calls that + * devices disconnect function if it is using this driver. + * The hub_disconnect function takes care of releasing the + * individual hub resources. -greg + */ usb_deregister(&hub_driver); - - spin_unlock_irqrestore(&hub_event_lock, flags2); - spin_unlock_irqrestore(&hub_list_lock, flags); } /* usb_hub_cleanup() */ #ifdef MODULE diff --git a/fs/Makefile b/fs/Makefile index 069b1a2a41e3..4bc1177d10b3 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ O_TARGET := fs.o O_OBJS = open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o fifo.o locks.o filesystems.o \ - dcache.o inode.o attr.o bad_inode.o $(BINFMTS) + dcache.o inode.o attr.o bad_inode.o file.o iobuf.o $(BINFMTS) MOD_LIST_NAME := FS_MODULES ALL_SUB_DIRS = coda minix ext2 fat msdos vfat proc isofs nfs umsdos ntfs \ diff --git a/fs/buffer.c b/fs/buffer.c index 7e9bd1a02fed..f869c280e30c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -1527,6 +1528,221 @@ out: return err; } + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. + */ + +static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) +{ + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + kiobuf = bh->b_kiobuf; + if (atomic_dec_and_test(&kiobuf->io_count)) + kiobuf->end_io(kiobuf); + if (!uptodate) + kiobuf->errno = -EIO; +} + + +/* + * For brw_kiovec: submit a set of buffer_head temporary IOs and wait + * for them to complete. Clean up the buffer_heads afterwards. + */ + +#define dprintk(x...) 
+ +static int do_kio(struct kiobuf *kiobuf, + int rw, int nr, struct buffer_head *bh[], int size) +{ + int iosize; + int i; + struct buffer_head *tmp; + + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + dprintk ("do_kio start %d\n", rw); + + if (rw == WRITE) + rw = WRITERAW; + atomic_add(nr, &kiobuf->io_count); + kiobuf->errno = 0; + ll_rw_block(rw, nr, bh); + + kiobuf_wait_for_io(kiobuf); + + spin_lock(&unused_list_lock); + + iosize = 0; + for (i = nr; --i >= 0; ) { + iosize += size; + tmp = bh[i]; + if (!buffer_uptodate(tmp)) { + /* We are traversing bh'es in reverse order so + clearing iosize on error calculates the + amount of IO before the first error. */ + iosize = 0; + } + __put_unused_buffer_head(tmp); + } + + spin_unlock(&unused_list_lock); + + dprintk ("do_kio end %d %d\n", iosize, err); + + if (iosize) + return iosize; + if (kiobuf->errno) + return kiobuf->errno; + return -EIO; +} + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. 
+ */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size, int bmap) +{ + int err; + int length; + int transferred; + int i; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + unsigned long page; + struct page * map; + struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; + + if (!nr) + return 0; + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (size-1)) || + (iobuf->length & (size-1))) + return -EINVAL; + if (!iobuf->locked) + panic("brw_kiovec: iobuf not locked for I/O"); + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* DEBUG */ +#if 0 + return iobuf->length; +#endif + dprintk ("brw_kiovec: start\n"); + + /* + * OK to walk down the iovec doing page IO on each page we find. + */ + bufind = bhind = transferred = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + dprintk ("iobuf %d %d %d\n", offset, length, size); + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + page = iobuf->pagelist[pageind]; + map = iobuf->maplist[pageind]; + + while (length > 0) { + blocknr = b[bufind++]; + tmp = get_unused_buffer_head(0); + if (!tmp) { + err = -ENOMEM; + goto error; + } + + tmp->b_dev = B_FREE; + tmp->b_size = size; + tmp->b_data = (char *) (page + offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf, NULL); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = 1 << BH_Mapped; + tmp->b_kiobuf = iobuf; + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + set_bit(BH_Dirty, &tmp->b_state); + } + + dprintk ("buffer %d (%d) at %p\n", + bhind, tmp->b_blocknr, tmp->b_data); + bh[bhind++] = tmp; + length -= size; + offset += size; + + /* + * Start the IO if we have got too much + */ + if (bhind >= KIO_MAX_SECTORS) { + err = do_kio(iobuf, rw, bhind, 
bh, size); + if (err >= 0) + transferred += err; + else + goto finished; + bhind = 0; + } + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* Is there any IO still left to submit? */ + if (bhind) { + err = do_kio(iobuf, rw, bhind, bh, size); + if (err >= 0) + transferred += err; + else + goto finished; + } + + finished: + dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err); + if (transferred) + return transferred; + return err; + + error: + /* We got an error allocation the bh'es. Just free the current + buffer_heads and exit. */ + spin_lock(&unused_list_lock); + for (i = bhind; --i >= 0; ) { + __put_unused_buffer_head(bh[bhind]); + } + spin_unlock(&unused_list_lock); + goto finished; +} + /* * Start I/O on a page. * This function expects the page to be locked and may return diff --git a/fs/exec.c b/fs/exec.c index e3d6fa4b7e28..8d6ee8dc3fd0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -449,9 +449,9 @@ static inline void flush_old_files(struct files_struct * files) unsigned long set, i; i = j * __NFDBITS; - if (i >= files->max_fds) + if (i >= files->max_fds || i >= files->max_fdset) break; - set = xchg(&files->close_on_exec.fds_bits[j], 0); + set = xchg(&files->close_on_exec->fds_bits[j], 0); j++; for ( ; set ; i++,set >>= 1) { if (set & 1) diff --git a/fs/fcntl.c b/fs/fcntl.c index 016604f5d864..95ed3e9f48df 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,36 +12,89 @@ extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg); -static inline int dupfd(struct file *file, unsigned int arg) +/* + * locate_fd finds a free file descriptor in the open_fds fdset, + * expanding the fd arrays if necessary. The files write lock will be + * held on exit to ensure that the fd can be entered atomically. 
+ */ + +static inline int locate_fd(struct files_struct *files, + struct file *file, int start) { - struct files_struct * files = current->files; + unsigned int newfd; int error; - error = -EMFILE; write_lock(&files->file_lock); - arg = find_next_zero_bit(&files->open_fds, NR_OPEN, arg); - if (arg >= current->rlim[RLIMIT_NOFILE].rlim_cur) - goto out_putf; - FD_SET(arg, &files->open_fds); - FD_CLR(arg, &files->close_on_exec); - write_unlock(&files->file_lock); - fd_install(arg, file); - error = arg; + +repeat: + error = -EMFILE; + if (start < files->next_fd) + start = files->next_fd; + if (start >= files->max_fdset) { + expand: + error = expand_files(files, start); + if (error < 0) + goto out; + goto repeat; + } + + newfd = find_next_zero_bit(files->open_fds->fds_bits, + files->max_fdset, start); + + error = -EMFILE; + if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + goto out; + if (newfd >= files->max_fdset) + goto expand; + + error = expand_files(files, newfd); + if (error < 0) + goto out; + if (error) /* If we might have blocked, try again. 
*/ + goto repeat; + + if (start <= files->next_fd) + files->next_fd = newfd + 1; + + error = newfd; + out: return error; +} + +static inline void allocate_fd(struct files_struct *files, + struct file *file, int fd) +{ + FD_SET(fd, files->open_fds); + FD_CLR(fd, files->close_on_exec); + write_unlock(&files->file_lock); + fd_install(fd, file); +} + +static int dupfd(struct file *file, int start) +{ + struct files_struct * files = current->files; + int ret; + + ret = locate_fd(files, file, start); + if (ret < 0) + goto out_putf; + allocate_fd(files, file, ret); + return ret; out_putf: write_unlock(&files->file_lock); fput(file); - goto out; + return ret; } asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd) { int err = -EBADF; struct file * file; + struct files_struct * files = current->files; - read_lock(¤t->files->file_lock); + write_lock(¤t->files->file_lock); if (!(file = fcheck(oldfd))) goto out_unlock; err = newfd; @@ -50,15 +103,33 @@ asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd) err = -EBADF; if (newfd >= NR_OPEN) goto out_unlock; /* following POSIX.1 6.2.1 */ - get_file(file); - read_unlock(¤t->files->file_lock); + get_file(file); /* We are now finished with oldfd */ + + err = expand_files(files, newfd); + if (err < 0) { + write_unlock(&files->file_lock); + fput(file); + goto out; + } + + /* To avoid races with open() and dup(), we will mark the fd as + * in-use in the open-file bitmap throughout the entire dup2() + * process. This is quite safe: do_close() uses the fd array + * entry, not the bitmap, to decide what work needs to be + * done. 
--sct */ + FD_SET(newfd, files->open_fds); + write_unlock(&files->file_lock); + + do_close(newfd, 0); + + write_lock(&files->file_lock); + allocate_fd(files, file, newfd); + err = newfd; - sys_close(newfd); - err = dupfd(file, newfd); out: return err; out_unlock: - read_unlock(¤t->files->file_lock); + write_unlock(¤t->files->file_lock); goto out; } @@ -66,6 +137,7 @@ asmlinkage int sys_dup(unsigned int fildes) { int ret = -EBADF; struct file * file = fget(fildes); + if (file) ret = dupfd(file, 0); return ret; @@ -118,13 +190,13 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) } break; case F_GETFD: - err = FD_ISSET(fd, ¤t->files->close_on_exec); + err = FD_ISSET(fd, current->files->close_on_exec); break; case F_SETFD: if (arg&1) - FD_SET(fd, ¤t->files->close_on_exec); + FD_SET(fd, current->files->close_on_exec); else - FD_CLR(fd, ¤t->files->close_on_exec); + FD_CLR(fd, current->files->close_on_exec); break; case F_GETFL: err = filp->f_flags; @@ -152,7 +224,6 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) err = filp->f_owner.pid; break; case F_SETOWN: - err = 0; filp->f_owner.pid = arg; filp->f_owner.uid = current->uid; filp->f_owner.euid = current->euid; @@ -172,10 +243,9 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) break; default: /* sockets need a few special fcntls. */ + err = -EINVAL; if (S_ISSOCK (filp->f_dentry->d_inode->i_mode)) err = sock_fcntl (filp, cmd, arg); - else - err = -EINVAL; break; } fput(filp); diff --git a/fs/file.c b/fs/file.c new file mode 100644 index 000000000000..fd33dc8b8482 --- /dev/null +++ b/fs/file.c @@ -0,0 +1,240 @@ +/* + * linux/fs/open.c + * + * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes + * + * Manage the dynamic fd arrays in the process files_struct. + */ + +#include +#include +#include +#include +#include + +#include + + +/* + * Allocate an fd array, using get_free_page() if possible. 
+ * Note: the array isn't cleared at allocation time. + */ +struct file ** alloc_fd_array(int num) +{ + struct file **new_fds; + int size = num * sizeof(struct file *); + + if (size < PAGE_SIZE) + new_fds = (struct file **) kmalloc(size, GFP_KERNEL); + else if (size == PAGE_SIZE) + new_fds = (struct file **) __get_free_page(GFP_KERNEL); + else + new_fds = (struct file **) vmalloc(size); + return new_fds; +} + +void free_fd_array(struct file **array, int num) +{ + int size = num * sizeof(struct file *); + + if (!array) { + printk (KERN_ERR __FUNCTION__ "array = 0 (num = %d)\n", num); + return; + } + + if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */ + return; + else if (size < PAGE_SIZE) + kfree(array); + else if (size == PAGE_SIZE) + free_page((unsigned long) array); + else + vfree(array); +} + +/* + * Expand the fd array in the files_struct. Called with the files + * spinlock held for write. + */ + +int expand_fd_array(struct files_struct *files, int nr) +{ + struct file **new_fds; + int error, nfds; + + + error = -EMFILE; + if (files->max_fds >= NR_OPEN || nr > NR_OPEN) + goto out; + + nfds = files->max_fds; + write_unlock(&files->file_lock); + + /* + * Expand to the max in easy steps, and keep expanding it until + * we have enough for the requested fd array size. 
+ */ + + do { +#if NR_OPEN_DEFAULT < 256 + if (nfds < 256) + nfds = 256; + else +#endif + if (nfds < (PAGE_SIZE / sizeof(struct file *))) + nfds = PAGE_SIZE / sizeof(struct file *); + else { + nfds = nfds * 2; + if (nfds > NR_OPEN) + nfds = NR_OPEN; + } + } while (nfds < nr); + + error = -ENOMEM; + new_fds = alloc_fd_array(nfds); + write_lock(&files->file_lock); + if (!new_fds) + goto out; + + /* Copy the existing array and install the new pointer */ + + if (nfds > files->max_fds) { + struct file **old_fds; + int i; + + old_fds = xchg(&files->fd, new_fds); + i = xchg(&files->max_fds, nfds); + + /* Don't copy/clear the array if we are creating a new + fd array for fork() */ + if (i) { + memcpy(new_fds, old_fds, i * sizeof(struct file *)); + /* clear the remainder of the array */ + memset(&new_fds[i], 0, + (nfds-i) * sizeof(struct file *)); + + write_unlock(&files->file_lock); + free_fd_array(old_fds, i); + write_lock(&files->file_lock); + } + } else { + /* Somebody expanded the array while we slept ... */ + write_unlock(&files->file_lock); + free_fd_array(new_fds, nfds); + write_lock(&files->file_lock); + } + error = 0; +out: + return error; +} + +/* + * Allocate an fdset array, using get_free_page() if possible. + * Note: the array isn't cleared at allocation time. 
+ */ +fd_set * alloc_fdset(int num) +{ + fd_set *new_fdset; + int size = num / 8; + + if (size < PAGE_SIZE) + new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); + else if (size == PAGE_SIZE) + new_fdset = (fd_set *) __get_free_page(GFP_KERNEL); + else + new_fdset = (fd_set *) vmalloc(size); + return new_fdset; +} + +void free_fdset(fd_set *array, int num) +{ + int size = num / 8; + + if (!array) { + printk (KERN_ERR __FUNCTION__ "array = 0 (num = %d)\n", num); + return; + } + + if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */ + return; + else if (size < PAGE_SIZE) + kfree(array); + else if (size == PAGE_SIZE) + free_page((unsigned long) array); + else + vfree(array); +} + +/* + * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. + */ +int expand_fdset(struct files_struct *files, int nr) +{ + fd_set *new_openset = 0, *new_execset = 0; + int error, nfds = 0; + + error = -EMFILE; + if (files->max_fdset >= NR_OPEN || nr > NR_OPEN) + goto out; + + nfds = files->max_fdset; + write_unlock(&files->file_lock); + + /* Expand to the max in easy steps */ + do { + if (nfds < (PAGE_SIZE * 8)) + nfds = PAGE_SIZE * 8; + else { + nfds = nfds * 2; + if (nfds > NR_OPEN) + nfds = NR_OPEN; + } + } while (nfds < nr); + + error = -ENOMEM; + new_openset = alloc_fdset(nfds); + new_execset = alloc_fdset(nfds); + write_lock(&files->file_lock); + if (!new_openset || !new_execset) + goto out; + + error = 0; + + /* Copy the existing tables and install the new pointers */ + if (nfds > files->max_fdset) { + int i = files->max_fdset / (sizeof(unsigned long) * 8); + int count = (nfds - files->max_fdset) / 8; + + /* + * Don't copy the entire array if the current fdset is + * not yet initialised. 
+ */ + if (i) { + memcpy (new_openset, files->open_fds, files->max_fdset/8); + memcpy (new_execset, files->close_on_exec, files->max_fdset/8); + memset (&new_openset->fds_bits[i], 0, count); + memset (&new_execset->fds_bits[i], 0, count); + } + + nfds = xchg(&files->max_fdset, nfds); + new_openset = xchg(&files->open_fds, new_openset); + new_execset = xchg(&files->close_on_exec, new_execset); + write_unlock(&files->file_lock); + free_fdset (new_openset, nfds); + free_fdset (new_execset, nfds); + write_lock(&files->file_lock); + return 0; + } + /* Somebody expanded the array while we slept ... */ + +out: + write_unlock(&files->file_lock); + if (new_openset) + free_fdset(new_openset, nfds); + if (new_execset) + free_fdset(new_execset, nfds); + write_lock(&files->file_lock); + return error; +} + diff --git a/fs/iobuf.c b/fs/iobuf.c new file mode 100644 index 000000000000..a227159f1730 --- /dev/null +++ b/fs/iobuf.c @@ -0,0 +1,136 @@ +/* + * iobuf.c + * + * Keep track of the general-purpose IO-buffer structures used to track + * abstract kernel-space io buffers. + * + */ + +#include +#include +#include + +static kmem_cache_t *kiobuf_cachep; + +/* + * The default IO completion routine for kiobufs: just wake up + * the kiobuf, nothing more. 
+ */ + +void simple_wakeup_kiobuf(struct kiobuf *kiobuf) +{ + wake_up(&kiobuf->wait_queue); +} + + +void __init kiobuf_init(void) +{ + kiobuf_cachep = kmem_cache_create("kiobuf", + sizeof(struct kiobuf), + 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if(!kiobuf_cachep) + panic("Cannot create kernel iobuf cache\n"); +} + + +int alloc_kiovec(int nr, struct kiobuf **bufp) +{ + int i; + struct kiobuf *iobuf; + + for (i = 0; i < nr; i++) { + iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL); + if (!iobuf) { + free_kiovec(i, bufp); + return -ENOMEM; + } + + memset(iobuf, 0, sizeof(*iobuf)); + init_waitqueue_head(&iobuf->wait_queue); + iobuf->end_io = simple_wakeup_kiobuf; + iobuf->array_len = KIO_STATIC_PAGES; + iobuf->pagelist = iobuf->page_array; + iobuf->maplist = iobuf->map_array; + *bufp++ = iobuf; + } + + return 0; +} + +void free_kiovec(int nr, struct kiobuf **bufp) +{ + int i; + struct kiobuf *iobuf; + + for (i = 0; i < nr; i++) { + iobuf = bufp[i]; + if (iobuf->array_len > KIO_STATIC_PAGES) { + kfree (iobuf->pagelist); + kfree (iobuf->maplist); + } + kmem_cache_free(kiobuf_cachep, bufp[i]); + } +} + +int expand_kiobuf(struct kiobuf *iobuf, int wanted) +{ + unsigned long * pagelist; + struct page ** maplist; + + if (iobuf->array_len >= wanted) + return 0; + + pagelist = (unsigned long *) + kmalloc(wanted * sizeof(unsigned long), GFP_KERNEL); + if (!pagelist) + return -ENOMEM; + + maplist = (struct page **) + kmalloc(wanted * sizeof(struct page **), GFP_KERNEL); + if (!maplist) { + kfree(pagelist); + return -ENOMEM; + } + + /* Did it grow while we waited? 
*/ + if (iobuf->array_len >= wanted) { + kfree(pagelist); + kfree(maplist); + return 0; + } + + memcpy (pagelist, iobuf->pagelist, wanted * sizeof(unsigned long)); + memcpy (maplist, iobuf->maplist, wanted * sizeof(struct page **)); + + if (iobuf->array_len > KIO_STATIC_PAGES) { + kfree (iobuf->pagelist); + kfree (iobuf->maplist); + } + + iobuf->pagelist = pagelist; + iobuf->maplist = maplist; + iobuf->array_len = wanted; + return 0; +} + + +void kiobuf_wait_for_io(struct kiobuf *kiobuf) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&kiobuf->wait_queue, &wait); +repeat: + tsk->state = TASK_UNINTERRUPTIBLE; + run_task_queue(&tq_disk); + if (atomic_read(&kiobuf->io_count) != 0) { + schedule(); + goto repeat; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&kiobuf->wait_queue, &wait); +} + + + diff --git a/fs/ioctl.c b/fs/ioctl.c index b9f2363e6d87..54e9e94cdc4b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -61,11 +61,11 @@ asmlinkage int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) lock_kernel(); switch (cmd) { case FIOCLEX: - FD_SET(fd, &current->files->close_on_exec); + FD_SET(fd, current->files->close_on_exec); break; case FIONCLEX: - FD_CLR(fd, &current->files->close_on_exec); + FD_CLR(fd, current->files->close_on_exec); break; case FIONBIO: diff --git a/fs/open.c b/fs/open.c index 3155d90462f0..11e0855b1f45 100644 --- a/fs/open.c +++ b/fs/open.c @@ -685,10 +685,14 @@ int get_unused_fd(void) struct files_struct * files = current->files; int fd, error; - error = -EMFILE; - + error = -EMFILE; write_lock(&files->file_lock); - fd = find_first_zero_bit(&files->open_fds, NR_OPEN); + +repeat: + fd = find_next_zero_bit(files->open_fds, + current->files->max_fdset, + files->next_fd); + /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened.
@@ -696,10 +700,31 @@ int get_unused_fd(void) if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) goto out; - /* Check here for fd > files->max_fds to do dynamic expansion */ + /* Do we need to expand the fdset array? */ + if (fd >= current->files->max_fdset) { + error = expand_fdset(files, 0); + if (!error) { + error = -EMFILE; + goto repeat; + } + goto out; + } + + /* + * Check whether we need to expand the fd array. + */ + if (fd >= files->max_fds) { + error = expand_fd_array(files, 0); + if (!error) { + error = -EMFILE; + goto repeat; + } + goto out; + } - FD_SET(fd, &files->open_fds); - FD_CLR(fd, &files->close_on_exec); + FD_SET(fd, files->open_fds); + FD_CLR(fd, files->close_on_exec); + files->next_fd = fd + 1; #if 1 /* Sanity check */ if (files->fd[fd] != NULL) { @@ -717,7 +742,9 @@ out: inline void put_unused_fd(unsigned int fd) { write_lock(&current->files->file_lock); - FD_CLR(fd, &current->files->open_fds); + FD_CLR(fd, current->files->open_fds); + if (fd < current->files->next_fd) + current->files->next_fd = fd; write_unlock(&current->files->file_lock); } @@ -790,8 +817,12 @@ int filp_close(struct file *filp, fl_owner_t id) * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. + * + * The "release" argument tells us whether or not to mark the fd as free + * or not in the open-files bitmap. dup2 uses this to retain the fd + * without races.
*/ -asmlinkage int sys_close(unsigned int fd) +int do_close(unsigned int fd, int release) { int error; struct file * filp; @@ -802,9 +833,10 @@ asmlinkage int sys_close(unsigned int fd) filp = frip(fd); if (!filp) goto out_unlock; - FD_CLR(fd, &files->close_on_exec); + FD_CLR(fd, files->close_on_exec); write_unlock(&files->file_lock); - put_unused_fd(fd); + if (release) + put_unused_fd(fd); lock_kernel(); error = filp_close(filp, files); unlock_kernel(); @@ -815,6 +847,11 @@ out_unlock: goto out; } +asmlinkage int sys_close(unsigned int fd) +{ + return do_close(fd, 1); +} + /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. diff --git a/fs/proc/array.c b/fs/proc/array.c index bc4ca74d6b6c..be556a50bd2a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -725,11 +725,13 @@ static inline char * task_state(struct task_struct *p, char *buffer) "PPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\n" "Groups:\t", get_task_state(p), p->pid, p->p_pptr->pid, p->uid, p->euid, p->suid, p->fsuid, - p->gid, p->egid, p->sgid, p->fsgid); + p->gid, p->egid, p->sgid, p->fsgid, + p->files ? 
p->files->max_fds : 0); for (g = 0; g < p->ngroups; g++) buffer += sprintf(buffer, "%d ", p->groups[g]); diff --git a/fs/select.c b/fs/select.c index 3f278187e3db..9ca5c6d893ec 100644 --- a/fs/select.c +++ b/fs/select.c @@ -106,7 +106,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) /* handle last in-complete long-word first */ set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; - open_fds = current->files->open_fds.fds_bits+n; + open_fds = current->files->open_fds->fds_bits+n; max = 0; if (set) { set &= BITS(fds, n); @@ -268,8 +268,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) if (n < 0) goto out_nofds; - if (n > KFDS_NR) - n = KFDS_NR; + if (n > current->files->max_fdset + 1) + n = current->files->max_fdset + 1; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), @@ -277,7 +277,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) * long-words. */ ret = -ENOMEM; - size = FDS_BYTES(n); + size = (n + 8 * sizeof(long) - 1) / (8 * sizeof(long)) * sizeof(long); bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; @@ -380,7 +380,7 @@ asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout) lock_kernel(); /* Do a sanity check on nfds ... 
*/ err = -EINVAL; - if (nfds > NR_OPEN) + if (nfds > current->files->max_fds) goto out; if (timeout) { diff --git a/include/asm-alpha/resource.h b/include/asm-alpha/resource.h index 96b338fc48be..c6b6314ee705 100644 --- a/include/asm-alpha/resource.h +++ b/include/asm-alpha/resource.h @@ -28,7 +28,7 @@ {_STK_LIM, _STK_LIM}, /* RLIMIT_STACK */ \ { 0, LONG_MAX}, /* RLIMIT_CORE */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_RSS */ \ - { NR_OPEN, NR_OPEN}, /* RLIMIT_NOFILE */ \ + {INR_OPEN, INR_OPEN}, /* RLIMIT_NOFILE */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_AS */ \ {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, /* RLIMIT_NPROC */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_MEMLOCK */ \ diff --git a/include/asm-arm/resource.h b/include/asm-arm/resource.h index 85d28115753d..c8b5e88f0aff 100644 --- a/include/asm-arm/resource.h +++ b/include/asm-arm/resource.h @@ -29,7 +29,7 @@ { 0, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ { MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \ - { NR_OPEN, NR_OPEN }, \ + { INR_OPEN, INR_OPEN }, \ { LONG_MAX, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ } diff --git a/include/asm-i386/resource.h b/include/asm-i386/resource.h index 1bc94c0e499a..0f4089694017 100644 --- a/include/asm-i386/resource.h +++ b/include/asm-i386/resource.h @@ -29,7 +29,7 @@ { 0, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ { 0, 0 }, \ - { NR_OPEN, NR_OPEN }, \ + { INR_OPEN, INR_OPEN }, \ { LONG_MAX, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ } diff --git a/include/asm-m68k/resource.h b/include/asm-m68k/resource.h index 09ae4cfd25fb..3f981c7b68ed 100644 --- a/include/asm-m68k/resource.h +++ b/include/asm-m68k/resource.h @@ -29,7 +29,7 @@ { 0, LONG_MAX}, \ {LONG_MAX, LONG_MAX}, \ {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \ - {NR_OPEN, NR_OPEN}, \ + {INR_OPEN, INR_OPEN}, \ {LONG_MAX, LONG_MAX}, \ {LONG_MAX, LONG_MAX} \ } diff --git a/include/asm-mips/resource.h b/include/asm-mips/resource.h index c9722357b064..b0596d63abcd 100644 --- a/include/asm-mips/resource.h +++ b/include/asm-mips/resource.h @@ -34,7 +34,7 @@ { 
LONG_MAX, LONG_MAX }, \ { _STK_LIM, LONG_MAX }, \ { 0, LONG_MAX }, \ - { NR_OPEN, NR_OPEN }, \ + { INR_OPEN, INR_OPEN }, \ { LONG_MAX, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ { MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \ diff --git a/include/asm-ppc/resource.h b/include/asm-ppc/resource.h index 674e6d6f1eb6..7fe812abb3f8 100644 --- a/include/asm-ppc/resource.h +++ b/include/asm-ppc/resource.h @@ -25,7 +25,7 @@ { 0, LONG_MAX}, /* RLIMIT_CORE */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_RSS */ \ {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, /* RLIMIT_NPROC */ \ - { NR_OPEN, NR_OPEN}, /* RLIMIT_NOFILE */ \ + {INR_OPEN, INR_OPEN}, /* RLIMIT_NOFILE */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_MEMLOCK */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_AS */ \ } diff --git a/include/asm-sparc/resource.h b/include/asm-sparc/resource.h index 7e4f49093f4c..ff73c8e50376 100644 --- a/include/asm-sparc/resource.h +++ b/include/asm-sparc/resource.h @@ -31,7 +31,7 @@ {LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX}, \ {LONG_MAX, LONG_MAX}, {_STK_LIM, LONG_MAX}, \ { 0, LONG_MAX}, {LONG_MAX, LONG_MAX}, \ - {NR_OPEN, NR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \ + {INR_OPEN, INR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \ {LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX} \ } diff --git a/include/asm-sparc64/resource.h b/include/asm-sparc64/resource.h index 02ba40894d49..7e490c729c85 100644 --- a/include/asm-sparc64/resource.h +++ b/include/asm-sparc64/resource.h @@ -30,7 +30,7 @@ {LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX}, \ {LONG_MAX, LONG_MAX}, {_STK_LIM, LONG_MAX}, \ { 0, LONG_MAX}, {LONG_MAX, LONG_MAX}, \ - {NR_OPEN, NR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \ + {INR_OPEN, INR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \ {LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX} \ } diff --git a/include/linux/file.h b/include/linux/file.h index 914fca925890..1a11904b64e0 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -55,18 +55,6 @@ extern inline struct file * fget(unsigned int fd) return 
file; } -/* - * Install a file pointer in the fd array. - */ -extern inline void fd_install(unsigned int fd, struct file * file) -{ - struct files_struct *files = current->files; - - write_lock(&files->file_lock); - files->fd[fd] = file; - write_unlock(&files->file_lock); -} - /* * 23/12/1998 Marcin Dalecki : * @@ -90,4 +78,26 @@ extern inline void fput(struct file * file) } extern void put_filp(struct file *); +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array. At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us. We need to detect this and + * fput() the struct file we are about to overwrite in this case. + */ + +extern inline void fd_install(unsigned int fd, struct file * file) +{ + struct files_struct *files = current->files; + struct file * result; + + write_lock(&files->file_lock); + result = xchg(&files->fd[fd], file); + write_unlock(&files->file_lock); + if (result) + fput(result); +} + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index e95879146ed7..c6fbe66a932a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -27,17 +27,19 @@ struct poll_table_struct; /* - * It's silly to have NR_OPEN bigger than NR_FILE, but I'll fix - * that later. Anyway, now the file code is no longer dependent - * on bitmaps in unsigned longs, but uses the new fd_set structure.. + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. * * Some programs (notably those using select()) may have to be - * recompiled to take full advantage of the new limits.. + * recompiled to take full advantage of the new limits.. 
*/ /* Fixed constants first: */ #undef NR_OPEN -#define NR_OPEN 1024 +#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ +#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< +#include +#include +#include + +/* + * The kiobuf structure describes a physical set of pages reserved + * locked for IO. The reference counts on each page will have been + * incremented, and the flags field will indicate whether or not we have + * pre-locked all of the pages for IO. + * + * kiobufs may be passed in arrays to form a kiovec, but we must + * preserve the property that no page is present more than once over the + * entire iovec. + */ + +#define KIO_MAX_ATOMIC_IO 64 /* in kb */ +#define KIO_MAX_ATOMIC_BYTES (64 * 1024) +#define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1) +#define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2) + +struct kiobuf +{ + int nr_pages; /* Pages actually referenced */ + int array_len; /* Space in the allocated lists */ + int offset; /* Offset to start of valid data */ + int length; /* Number of valid bytes of data */ + + /* Keep separate track of the physical addresses and page + * structs involved. If we do IO to a memory-mapped device + * region, there won't necessarily be page structs defined for + * every address. 
*/ + + unsigned long * pagelist; + struct page ** maplist; + + unsigned int locked : 1; /* If set, pages has been locked */ + + /* Always embed enough struct pages for 64k of IO */ + unsigned long page_array[KIO_STATIC_PAGES]; + struct page * map_array[KIO_STATIC_PAGES]; + + /* Dynamic state for IO completion: */ + atomic_t io_count; /* IOs still in progress */ + int errno; /* Status of completed IO */ + void (*end_io) (struct kiobuf *); /* Completion callback */ + wait_queue_head_t wait_queue; +}; + + +/* mm/memory.c */ + +int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len); +void unmap_kiobuf(struct kiobuf *iobuf); + +/* fs/iobuf.c */ + +void __init kiobuf_init(void); +void simple_wakeup_kiobuf(struct kiobuf *); +int alloc_kiovec(int nr, struct kiobuf **); +void free_kiovec(int nr, struct kiobuf **); +int expand_kiobuf(struct kiobuf *, int); +void kiobuf_wait_for_io(struct kiobuf *); + +/* fs/buffer.c */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size, int bmap); + +#endif /* __LINUX_IOBUF_H */ diff --git a/include/linux/limits.h b/include/linux/limits.h index 6ca0ae68b5a8..d2cc01a01429 100644 --- a/include/linux/limits.h +++ b/include/linux/limits.h @@ -1,7 +1,7 @@ #ifndef _LINUX_LIMITS_H #define _LINUX_LIMITS_H -#define NR_OPEN 1024 +#define NR_OPEN 1024 #define NGROUPS_MAX 32 /* supplemental group IDs are available */ #define ARG_MAX 131072 /* # bytes of args + environ for exec() */ diff --git a/include/linux/major.h b/include/linux/major.h index 20b267b9fcc3..e8f3548150f9 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -115,6 +115,8 @@ #define AURORA_MAJOR 79 +#define RAW_MAJOR 162 + #define UNIX98_PTY_MASTER_MAJOR 128 #define UNIX98_PTY_MAJOR_COUNT 8 #define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) diff --git a/include/linux/raw.h b/include/linux/raw.h new file mode 100644 index 000000000000..a2d9b14cd302 --- /dev/null +++ 
b/include/linux/raw.h @@ -0,0 +1,23 @@ +#ifndef __LINUX_RAW_H +#define __LINUX_RAW_H + +#include + +#define RAW_SETBIND _IO( 0xac, 0 ) +#define RAW_GETBIND _IO( 0xac, 1 ) + +struct raw_config_request +{ + int raw_minor; + __u64 block_major; + __u64 block_minor; +}; + +#ifdef __KERNEL__ + +/* drivers/char/raw.c */ +extern void raw_init(void); + +#endif /* __KERNEL__ */ + +#endif /* __LINUX_RAW_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3a68b7168ad2..3a9af859b208 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -127,6 +127,12 @@ extern void trap_init(void); extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +/* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ +#define NR_OPEN_DEFAULT BITS_PER_LONG + /* * Open file table structure */ @@ -134,18 +140,28 @@ struct files_struct { atomic_t count; rwlock_t file_lock; int max_fds; + int max_fdset; + int next_fd; struct file ** fd; /* current fd array */ - fd_set close_on_exec; - fd_set open_fds; + fd_set *close_on_exec; + fd_set *open_fds; + fd_set close_on_exec_init; + fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; }; #define INIT_FILES { \ ATOMIC_INIT(1), \ RW_LOCK_UNLOCKED, \ - NR_OPEN, \ - &init_fd_array[0], \ + NR_OPEN_DEFAULT, \ + __FD_SETSIZE, \ + 0, \ + &init_files.fd_array[0], \ + &init_files.close_on_exec_init, \ + &init_files.open_fds_init, \ { { 0, } }, \ - { { 0, } } \ + { { 0, } }, \ + { NULL, } \ } struct fs_struct { @@ -633,6 +649,48 @@ extern void mmput(struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(void); +/* + * Routines for handling the fd arrays + */ +extern struct file ** alloc_fd_array(int); +extern int expand_fd_array(struct files_struct *, int nr); +extern void free_fd_array(struct file **, int); + +extern fd_set *alloc_fdset(int); +extern int 
expand_fdset(struct files_struct *, int nr); +extern void free_fdset(fd_set *, int); + +/* Expand files. Return <0 on error; 0 nothing done; 1 files expanded, + * we may have blocked. + * + * Should be called with the files->file_lock spinlock held for write. + */ +static inline int expand_files(struct files_struct *files, int nr) +{ + int err, expand = 0; +#ifdef FDSET_DEBUG + printk (KERN_ERR __FUNCTION__ " %d: nr = %d\n", current->pid, nr); +#endif + + if (nr >= files->max_fdset) { + expand = 1; + if ((err = expand_fdset(files, nr))) + goto out; + } + if (nr >= files->max_fds) { + expand = 1; + if ((err = expand_fd_array(files, nr))) + goto out; + } + err = expand; + out: +#ifdef FDSET_DEBUG + if (err) + printk (KERN_ERR __FUNCTION__ " %d: return %d\n", current->pid, err); +#endif + return err; +} + extern int copy_thread(int, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/init/main.c b/init/main.c index 431407c8b350..0506e345efb0 100644 --- a/init/main.c +++ b/init/main.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1193,6 +1194,7 @@ asmlinkage void __init start_kernel(void) vma_init(); buffer_init(memory_end-memory_start); page_cache_init(memory_end-memory_start); + kiobuf_init(); signals_init(); inode_init(); file_table_init(); diff --git a/kernel/exit.c b/kernel/exit.c index dbdcff3b2e74..7288395b1ebb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,11 +149,11 @@ static inline void close_files(struct files_struct * files) j = 0; for (;;) { - unsigned long set = files->open_fds.fds_bits[j]; + unsigned long set; i = j * __NFDBITS; - j++; - if (i >= files->max_fds) + if (i >= files->max_fdset || i >= files->max_fds) break; + set = files->open_fds->fds_bits[j++]; while (set) { if (set & 1) { struct file * file = xchg(&files->fd[i], NULL); @@ -176,12 +176,14 @@ static inline void __exit_files(struct task_struct *tsk) if 
(atomic_dec_and_test(&files->count)) { close_files(files); /* - * Free the fd array as appropriate ... + * Free the fd and fdset arrays if we expanded them. */ - if (NR_OPEN * sizeof(struct file *) == PAGE_SIZE) - free_page((unsigned long) files->fd); - else - kfree(files->fd); + if (files->fd != &files->fd_array[0]) + free_fd_array(files->fd, files->max_fds); + if (files->max_fdset > __FD_SETSIZE) { + free_fdset(files->open_fds, files->max_fdset); + free_fdset(files->close_on_exec, files->max_fdset); + } kmem_cache_free(files_cachep, files); } } diff --git a/kernel/fork.c b/kernel/fork.c index e6f1417dcf6b..b173ddffba64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -433,32 +433,24 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -/* - * Copy a fd_set and compute the maximum fd it contains. - */ -static inline int __copy_fdset(unsigned long *d, unsigned long *src) +static int count_open_files(struct files_struct *files, int size) { - int i; - unsigned long *p = src; - unsigned long *max = src; - - for (i = __FDSET_LONGS; i; --i) { - if ((*d++ = *p++) != 0) - max = p; + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (files->open_fds->fds_bits[--i]) + break; } - return (max - src)*sizeof(long)*8; -} - -static inline int copy_fdset(fd_set *dst, fd_set *src) -{ - return __copy_fdset(dst->fds_bits, src->fds_bits); + i = (i+1) * 8 * sizeof(long); + return i; } static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; - int size, i, error = 0; + int open_files, nfds, size, i, error = 0; /* * A background process may not have any files ... @@ -478,43 +470,85 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) if (!newf) goto out; - /* - * Allocate the fd array, using get_free_page() if possible. - * Eventually we want to make the array size variable ... 
- */ - size = NR_OPEN * sizeof(struct file *); - if (size == PAGE_SIZE) - new_fds = (struct file **) __get_free_page(GFP_KERNEL); - else - new_fds = (struct file **) kmalloc(size, GFP_KERNEL); - if (!new_fds) - goto out_release; - - newf->file_lock = RW_LOCK_UNLOCKED; atomic_set(&newf->count, 1); - newf->max_fds = NR_OPEN; - newf->fd = new_fds; + + newf->file_lock = RW_LOCK_UNLOCKED; + newf->next_fd = 0; + newf->max_fds = NR_OPEN_DEFAULT; + newf->max_fdset = __FD_SETSIZE; + newf->close_on_exec = &newf->close_on_exec_init; + newf->open_fds = &newf->open_fds_init; + newf->fd = &newf->fd_array[0]; + + /* We don't yet have the oldf readlock, but even if the old + fdset gets grown now, we'll only copy up to "size" fds */ + size = oldf->max_fdset; + if (size > __FD_SETSIZE) { + newf->max_fdset = 0; + write_lock(&newf->file_lock); + error = expand_fdset(newf, size); + write_unlock(&newf->file_lock); + if (error) + goto out_release; + } read_lock(&oldf->file_lock); - newf->close_on_exec = oldf->close_on_exec; - i = copy_fdset(&newf->open_fds, &oldf->open_fds); + + open_files = count_open_files(oldf, size); + + /* + * Check whether we need to allocate a larger fd array. + * Note: we're not a clone task, so the open count won't + * change. 
+ */ + nfds = NR_OPEN_DEFAULT; + if (open_files > nfds) { + read_unlock(&oldf->file_lock); + newf->max_fds = 0; + write_lock(&newf->file_lock); + error = expand_fd_array(newf, open_files); + write_unlock(&newf->file_lock); + if (error) + goto out_release; + nfds = newf->max_fds; + read_lock(&oldf->file_lock); + } old_fds = oldf->fd; - for (; i != 0; i--) { + new_fds = newf->fd; + + memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); + memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) get_file(f); *new_fds++ = f; } read_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (newf->max_fds - open_files) * sizeof(struct file *); + /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, (char *)newf->fd + size - (char *)new_fds); - + memset(new_fds, 0, size); + + if (newf->max_fdset > open_files) { + int left = (newf->max_fdset-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&newf->open_fds->fds_bits[start], 0, left); + memset(&newf->close_on_exec->fds_bits[start], 0, left); + } + tsk->files = newf; error = 0; out: return error; out_release: + free_fdset (newf->close_on_exec, newf->max_fdset); + free_fdset (newf->open_fds, newf->max_fdset); kmem_cache_free(files_cachep, newf); goto out; } diff --git a/mm/memory.c b/mm/memory.c index 916918632702..13b31027af9e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -406,6 +407,181 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s } } + +/* + * Do a quick page-table lookup for a single page. 
+ */ +static unsigned long follow_page(unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(current->mm, address); + pmd = pmd_offset(pgd, address); + if (pmd) { + pte_t * pte = pte_offset(pmd, address); + if (pte && pte_present(*pte)) { + return pte_page(*pte); + } + } + + printk(KERN_ERR "Missing page in follow_page\n"); + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to it? + */ + +static struct page * get_page_map(unsigned long page) +{ + struct page *map; + + if (MAP_NR(page) >= max_mapnr) + return 0; + if (page == ZERO_PAGE(page)) + return 0; + map = mem_map + MAP_NR(page); + if (PageReserved(map)) + return 0; + return map; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin and lock the pages for IO. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + unsigned long page; + struct page * map; + int doublepage = 0; + int repeat = 0; + int i; + + /* Make sure the iobuf is not already mapped somewhere. 
*/ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + repeat: + down(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 1; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + } + if (!handle_mm_fault(current, vma, ptr, (rw==READ))) + goto out_unlock; + page = follow_page(ptr); + if (!page) { + printk (KERN_ERR "Missing page in map_user_kiobuf\n"); + goto out_unlock; + } + map = get_page_map(page); + if (map) { + if (TryLockPage(map)) + goto retry; + atomic_inc(&map->count); + } + dprintk ("Installing page %p %p: %d\n", (void *)page, map, i); + iobuf->pagelist[i] = page; + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; + + retry: + + /* + * Undo the locking so far, wait on the page we got to, and try again. + */ + unmap_kiobuf(iobuf); + up(&mm->mmap_sem); + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(map)) { + /* If so, we may well have the page mapped twice in the + * IO address range. Bad news. Of course, it _might_ + * just be a coincidence, but if it happens more than + * once, chances are we have a double-mapped page. */ + if (++doublepage >= 3) { + return -EINVAL; + } + } + + /* + * Try again... + */ + wait_on_page(map); + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + + +/* + * Unmap all of the pages referenced by a kiobuf. 
We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + + if (map && iobuf->locked) { + __free_page(map); + UnlockPage(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pgprot_t prot) { -- 2.39.5