From 991b3ae8019276269816512425f102c4687f2291 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 4 Feb 2002 20:11:45 -0800 Subject: [PATCH] v2.4.9.3 -> v2.4.9.4 - Hugh Dickins: swapoff cleanups and speedups - Matthew Dharm: USB storage update - Keith Owens: Makefile fixes - Tom Rini: MPC8xx build fix - Nikita Danilov: reiserfs update - Jakub Jelinek: ELF loader fix for ET_DYN - Andrew Morton: reparent_to_init() for kernel threads - Christoph Hellwig: VxFS and SysV updates, vfs_permission fix --- Documentation/Configure.help | 14 + Makefile | 2 +- arch/i386/defconfig | 2 +- arch/ppc/8xx_io/enet.c | 3 +- arch/ppc/8xx_io/fec.c | 3 +- drivers/acpi/ospm/busmgr/Makefile | 2 + drivers/char/Makefile | 18 +- drivers/net/8139too.c | 1 + drivers/net/Makefile | 8 +- drivers/usb/Config.in | 7 +- drivers/usb/storage/scsiglue.c | 2 +- drivers/usb/storage/sddr09.c | 4 +- drivers/usb/storage/transport.c | 45 +--- drivers/usb/storage/unusual_devs.h | 30 ++- fs/binfmt_elf.c | 2 +- fs/freevxfs/vxfs_bmap.c | 2 +- fs/freevxfs/vxfs_extern.h | 5 +- fs/freevxfs/vxfs_fshead.c | 8 +- fs/freevxfs/vxfs_inode.c | 22 +- fs/freevxfs/vxfs_olt.c | 11 +- fs/freevxfs/vxfs_super.c | 52 ++-- fs/namei.c | 44 +++- fs/reiserfs/bitmap.c | 11 +- fs/reiserfs/inode.c | 39 +-- fs/reiserfs/journal.c | 29 ++- fs/reiserfs/stree.c | 1 + fs/reiserfs/super.c | 130 +++------- fs/sysv/Makefile | 3 +- fs/sysv/balloc.c | 16 ++ fs/sysv/inode.c | 8 +- fs/sysv/super.c | 65 ++++- fs/sysv/symlink.c | 25 ++ include/linux/pagemap.h | 5 - include/linux/sched.h | 1 + include/linux/sysv_fs.h | 2 + kernel/fork.c | 47 ++-- kernel/ksyms.c | 1 + kernel/sched.c | 54 ++++ mm/filemap.c | 28 -- mm/memory.c | 29 +-- mm/swap_state.c | 97 ++++--- mm/swapfile.c | 395 +++++++++++++++++------------ mm/vmscan.c | 56 ++-- scripts/ver_linux | 6 +- 44 files changed, 763 insertions(+), 572 deletions(-) create mode 100644 fs/sysv/symlink.c diff --git a/Documentation/Configure.help b/Documentation/Configure.help index 9a457df6bfdc..d3ae15e6924e 
100644 --- a/Documentation/Configure.help +++ b/Documentation/Configure.help @@ -11601,6 +11601,20 @@ CONFIG_USB_STORAGE_DEBUG Say Y here in order to have the USB Mass Storage code generate verbose debugging messages. +ISD-200 USB/ATA driver +CONFIG_USB_STORAGE_ISD200 + Say Y here if you want to use USB Mass Store devices based + on the In-Systems Design ISD-200 USB/ATA bridge. + + Some of the products that use this chip are: + + - Archos Jukebox 6000 + - ISD SmartCable for Storage + - Taiwan Skymaster CD530U/DEL-0241 IDE bridge + - Sony CRX10U CD-R/RW drive + - CyQ've CQ8060A CDRW drive + - Planex eXtreme Drive RX-25HU USB-IDE cable (not model RX-25U) + USS720 parport driver CONFIG_USB_USS720 This driver is for USB parallel port adapters that use the Lucent diff --git a/Makefile b/Makefile index 16857718ac8b..1f94e2650962 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 10 -EXTRAVERSION =-pre3 +EXTRAVERSION =-pre4 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 6e5153049c94..7ad21812667d 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -705,6 +705,7 @@ CONFIG_USB_UHCI_ALT=y CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set # CONFIG_USB_STORAGE_FREECOM is not set +# CONFIG_USB_STORAGE_ISD200 is not set # CONFIG_USB_STORAGE_DPCM is not set # CONFIG_USB_ACM is not set # CONFIG_USB_PRINTER is not set @@ -738,7 +739,6 @@ CONFIG_USB_STORAGE=y # # USB Network adaptors # -# CONFIG_USB_PLUSB is not set # CONFIG_USB_PEGASUS is not set # CONFIG_USB_CATC is not set # CONFIG_USB_KAWETH is not set diff --git a/arch/ppc/8xx_io/enet.c b/arch/ppc/8xx_io/enet.c index 9ef663a053bd..52f7a52e53ea 100644 --- a/arch/ppc/8xx_io/enet.c +++ b/arch/ppc/8xx_io/enet.c @@ -651,11 +651,10 @@ int __init scc_enet_init(void) volatile scc_t *sccp; volatile scc_enet_t *ep; volatile immap_t *immap; - extern unsigned long _get_IMMR(void); cp = cpmp; /* Get 
pointer to Communication Processor */ - immap = (immap_t *)(_get_IMMR() & 0xFFFF0000); /* and to internal registers */ + immap = (immap_t *)(mfspr(IMMR) & 0xFFFF0000); /* and to internal registers */ bd = (bd_t *)__res; diff --git a/arch/ppc/8xx_io/fec.c b/arch/ppc/8xx_io/fec.c index 815533c9d60c..dea3952c70cc 100644 --- a/arch/ppc/8xx_io/fec.c +++ b/arch/ppc/8xx_io/fec.c @@ -1509,7 +1509,6 @@ int __init fec_enet_init(void) volatile immap_t *immap; volatile fec_t *fecp; bd_t *bd; - extern uint _get_IMMR(void); #ifdef CONFIG_SCC_ENET unsigned char tmpaddr[6]; #endif @@ -1680,7 +1679,7 @@ int __init fec_enet_init(void) /* Bits moved from Rev. D onward. */ - if ((_get_IMMR() & 0xffff) < 0x0501) + if ((mfspr(IMMR) & 0xffff) < 0x0501) immap->im_ioport.iop_pddir = 0x1c58; /* Pre rev. D */ else immap->im_ioport.iop_pddir = 0x1fff; /* Rev. D and later */ diff --git a/drivers/acpi/ospm/busmgr/Makefile b/drivers/acpi/ospm/busmgr/Makefile index 61da73279c4a..03ac94768c80 100644 --- a/drivers/acpi/ospm/busmgr/Makefile +++ b/drivers/acpi/ospm/busmgr/Makefile @@ -1,3 +1,5 @@ +export-objs := bm_osl.o + O_TARGET := ospm_$(notdir $(CURDIR)).o obj-m := $(O_TARGET) EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 1fa4054ed9c9..4bce5b67caa8 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -48,20 +48,6 @@ ifeq ($(ARCH),s390x) SERIAL = endif -ifeq ($(ARCH),s390) - KEYMAP = - KEYBD = - CONSOLE = - SERIAL = -endif - -ifeq ($(ARCH),s390x) - KEYMAP = - KEYBD = - CONSOLE = - SERIAL = -endif - ifeq ($(ARCH),m68k) ifdef CONFIG_AMIGA KEYBD = amikeyb.o @@ -241,5 +227,7 @@ consolemap_deftbl.c: $(FONTMAPFILE) conmakehash consolemap_deftbl.o: consolemap_deftbl.c $(TOPDIR)/include/linux/types.h +.DELETE_ON_ERROR: + defkeymap.c: defkeymap.map - loadkeys --mktable defkeymap.map > defkeymap.c + set -e ; loadkeys --mktable $< | sed -e 's/^static *//' > $@ diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c index 
51a3f7b89c0c..815eed82e0da 100644 --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -1610,6 +1610,7 @@ static int rtl8139_thread (void *data) unsigned long timeout; daemonize (); + reparent_to_init(); spin_lock_irq(&current->sigmask_lock); sigemptyset(&current->blocked); recalc_sigpending(current); diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 23074af321f0..83a3dc4dd39a 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -137,12 +137,8 @@ obj-$(CONFIG_PPP_BSDCOMP) += bsd_comp.o obj-$(CONFIG_PPPOE) += pppox.o pppoe.o obj-$(CONFIG_SLIP) += slip.o -ifeq ($(CONFIG_SLIP),y) - obj-$(CONFIG_SLIP_COMPRESSED) += slhc.o -else - ifeq ($(CONFIG_SLIP),m) - obj-$(CONFIG_SLIP_COMPRESSED) += slhc.o - endif +ifeq ($(CONFIG_SLIP_COMPRESSED),y) + obj-$(CONFIG_SLIP) += slhc.o endif obj-$(CONFIG_STRIP) += strip.o diff --git a/drivers/usb/Config.in b/drivers/usb/Config.in index d2c623a50e9d..2ff7a9d288fe 100644 --- a/drivers/usb/Config.in +++ b/drivers/usb/Config.in @@ -32,7 +32,12 @@ comment 'USB Controllers' if [ "$CONFIG_USB_STORAGE" != "n" ]; then bool ' USB Mass Storage verbose debug' CONFIG_USB_STORAGE_DEBUG bool ' Freecom USB/ATAPI Bridge support' CONFIG_USB_STORAGE_FREECOM - bool ' Microtech CompactFlash/SmartMedia reader' CONFIG_USB_STORAGE_DPCM + bool ' ISD-200 USB/ATA Bridge support' CONFIG_USB_STORAGE_ISD200 + bool ' Microtech CompactFlash/SmartMedia support' CONFIG_USB_STORAGE_DPCM + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool ' HP CD-Writer 82xx support' CONFIG_USB_STORAGE_HP8200e + bool ' SanDisk SDDR-09 (and other SmartMedia) support' CONFIG_USB_STORAGE_SDDR09 + fi fi dep_tristate ' USB Modem (CDC ACM) support' CONFIG_USB_ACM $CONFIG_USB dep_tristate ' USB Printer support' CONFIG_USB_PRINTER $CONFIG_USB diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 0a75c8a68bb5..2f26d8dc48a8 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -1,7 +1,7 @@ /* Driver for USB Mass Storage 
compliant devices * SCSI layer glue code * - * $Id: scsiglue.c,v 1.21 2001/07/29 23:41:52 mdharm Exp $ + * $Id: scsiglue.c,v 1.22 2001/09/02 04:29:27 mdharm Exp $ * * Current development and maintenance by: * (c) 1999, 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net) diff --git a/drivers/usb/storage/sddr09.c b/drivers/usb/storage/sddr09.c index aaa889d1288e..227f61922feb 100644 --- a/drivers/usb/storage/sddr09.c +++ b/drivers/usb/storage/sddr09.c @@ -1,6 +1,6 @@ /* Driver for SanDisk SDDR-09 SmartMedia reader * - * $Id: sddr09.c,v 1.18 2001/06/11 02:54:25 mdharm Exp $ + * $Id: sddr09.c,v 1.19 2001/09/02 06:07:20 mdharm Exp $ * * SDDR09 driver v0.1: * @@ -693,7 +693,7 @@ int sddr09_read_map(struct us_data *us) { // scatterlist block i*64/128k = i*(2^6)*(2^-17) = i*(2^-11) for (i=0; i>11].address+(i<<6); + ptr = sg[i>>11].address+((i&0x7ff)<<6); if (ptr[0]!=0xFF || ptr[1]!=0xFF || ptr[2]!=0xFF || ptr[3]!=0xFF || ptr[4]!=0xFF || ptr[5]!=0xFF) { US_DEBUGP("PBA %04X has no logical mapping: reserved area = " diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 6c7a8c2baa34..92a7ca9a14a0 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -1,6 +1,6 @@ /* Driver for USB Mass Storage compliant devices * - * $Id: transport.c,v 1.39 2001/03/10 16:46:28 zagor Exp $ + * $Id: transport.c,v 1.40 2001/08/18 08:37:46 mdharm Exp $ * * Current development and maintenance by: * (c) 1999, 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net) @@ -371,10 +371,9 @@ int usb_stor_clear_halt(struct usb_device *dev, int pipe) */ static void usb_stor_blocking_completion(urb_t *urb) { - wait_queue_head_t *wqh_ptr = (wait_queue_head_t *)urb->context; + struct completion *urb_done_ptr = (struct completion *)urb->context; - if (waitqueue_active(wqh_ptr)) - wake_up(wqh_ptr); + complete(urb_done_ptr); } /* This is our function to emulate usb_control_msg() but give us enough @@ -384,8 +383,7 @@ int usb_stor_control_msg(struct us_data 
*us, unsigned int pipe, u8 request, u8 requesttype, u16 value, u16 index, void *data, u16 size) { - wait_queue_head_t wqh; - wait_queue_t wait; + struct completion urb_done; int status; devrequest *dr; @@ -402,9 +400,7 @@ int usb_stor_control_msg(struct us_data *us, unsigned int pipe, dr->length = cpu_to_le16(size); /* set up data structures for the wakeup system */ - init_waitqueue_head(&wqh); - init_waitqueue_entry(&wait, current); - add_wait_queue(&wqh, &wait); + init_completion(&urb_done); /* lock the URB */ down(&(us->current_urb_sem)); @@ -412,33 +408,25 @@ int usb_stor_control_msg(struct us_data *us, unsigned int pipe, /* fill the URB */ FILL_CONTROL_URB(us->current_urb, us->pusb_dev, pipe, (unsigned char*) dr, data, size, - usb_stor_blocking_completion, &wqh); + usb_stor_blocking_completion, &urb_done); us->current_urb->actual_length = 0; us->current_urb->error_count = 0; us->current_urb->transfer_flags = USB_ASYNC_UNLINK; /* submit the URB */ - set_current_state(TASK_UNINTERRUPTIBLE); status = usb_submit_urb(us->current_urb); if (status) { /* something went wrong */ up(&(us->current_urb_sem)); - set_current_state(TASK_RUNNING); - remove_wait_queue(&wqh, &wait); kfree(dr); return status; } /* wait for the completion of the URB */ up(&(us->current_urb_sem)); - while (us->current_urb->status == -EINPROGRESS) - schedule(); + wait_for_completion(&urb_done); down(&(us->current_urb_sem)); - /* we either timed out or got woken up -- clean up either way */ - set_current_state(TASK_RUNNING); - remove_wait_queue(&wqh, &wait); - /* return the actual length of the data transferred if no error*/ status = us->current_urb->status; if (status >= 0) @@ -456,46 +444,35 @@ int usb_stor_control_msg(struct us_data *us, unsigned int pipe, int usb_stor_bulk_msg(struct us_data *us, void *data, int pipe, unsigned int len, unsigned int *act_len) { - wait_queue_head_t wqh; - wait_queue_t wait; + struct completion urb_done; int status; /* set up data structures for the wakeup system 
*/ - init_waitqueue_head(&wqh); - init_waitqueue_entry(&wait, current); - add_wait_queue(&wqh, &wait); + init_completion(&urb_done); /* lock the URB */ down(&(us->current_urb_sem)); /* fill the URB */ FILL_BULK_URB(us->current_urb, us->pusb_dev, pipe, data, len, - usb_stor_blocking_completion, &wqh); + usb_stor_blocking_completion, &urb_done); us->current_urb->actual_length = 0; us->current_urb->error_count = 0; us->current_urb->transfer_flags = USB_ASYNC_UNLINK; /* submit the URB */ - set_current_state(TASK_UNINTERRUPTIBLE); status = usb_submit_urb(us->current_urb); if (status) { /* something went wrong */ up(&(us->current_urb_sem)); - set_current_state(TASK_RUNNING); - remove_wait_queue(&wqh, &wait); return status; } /* wait for the completion of the URB */ up(&(us->current_urb_sem)); - while (us->current_urb->status == -EINPROGRESS) - schedule(); + wait_for_completion(&urb_done); down(&(us->current_urb_sem)); - /* we either timed out or got woken up -- clean up either way */ - set_current_state(TASK_RUNNING); - remove_wait_queue(&wqh, &wait); - /* return the actual length of the data transferred */ *act_len = us->current_urb->actual_length; diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 8a6a731cfab2..dcdc14411711 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1,7 +1,7 @@ /* Driver for USB Mass Storage compliant devices * Ununsual Devices File * - * $Id: unusual_devs.h,v 1.16 2001/07/30 00:27:59 mdharm Exp $ + * $Id: unusual_devs.h,v 1.20 2001/09/02 05:12:57 mdharm Exp $ * * Current development and maintenance by: * (c) 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net) @@ -68,6 +68,19 @@ UNUSUAL_DEV( 0x0436, 0x0005, 0x0100, 0x0100, US_FL_START_STOP ), #endif +/* Made with the help of Edd Dumbill */ +UNUSUAL_DEV( 0x0451, 0x5409, 0x0001, 0x0001, + "Frontier Labs", + "Nex II Digital", + US_SC_SCSI, US_PR_BULK, NULL, US_FL_START_STOP), + +/* Reported by Paul Stewart + * This 
entry is needed because the device reports Sub=ff */ +UNUSUAL_DEV( 0x04a4, 0x0004, 0x0001, 0x0001, + "Hitachi", + "DVD-CAM DZ-MV100A Camcorder", + US_SC_SCSI, US_PR_CB, NULL, US_FL_SINGLE_LUN), + UNUSUAL_DEV( 0x04cb, 0x0100, 0x0000, 0x2210, "Fujifilm", "FinePix 1400Zoom", @@ -155,13 +168,20 @@ UNUSUAL_DEV( 0x054c, 0x0010, 0x0106, 0x0322, US_SC_SCSI, US_PR_CB, NULL, US_FL_SINGLE_LUN | US_FL_START_STOP | US_FL_MODE_XLATE ), +/* Reported by win@geeks.nl */ +UNUSUAL_DEV( 0x054c, 0x0025, 0x0100, 0x0100, + "Sony", + "Memorystick NW-MS7", + US_SC_UFI, US_PR_CB, NULL, + US_FL_SINGLE_LUN | US_FL_START_STOP ), + UNUSUAL_DEV( 0x054c, 0x002d, 0x0100, 0x0100, "Sony", "Memorystick MSAC-US1", US_SC_UFI, US_PR_CB, NULL, US_FL_SINGLE_LUN | US_FL_START_STOP ), -/* Submitted by Klaus Mueller */ +/* Submitted by Klaus Mueller */ UNUSUAL_DEV( 0x054c, 0x002e, 0x0106, 0x0310, "Sony", "Handycam", @@ -198,12 +218,6 @@ UNUSUAL_DEV( 0x05ab, 0x0031, 0x0100, 0x0110, US_SC_ISD200, US_PR_BULK, isd200_Initialization, 0 ), -UNUSUAL_DEV( 0x05ab, 0x0060, 0x0100, 0x0110, - "In-System", - "USB 2.0/IDE Bridge (ATA/ATAPI)", - US_SC_ISD200, US_PR_BULK, isd200_Initialization, - 0 ), - UNUSUAL_DEV( 0x05ab, 0x0301, 0x0100, 0x0110, "In-System", "Portable USB Harddrive V2", diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fe02a92f8856..f543281622a4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -633,7 +633,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (elf_ex.e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += error; + load_addr += load_bias; } } k = elf_ppnt->p_vaddr; diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index 5eebc1382541..979bb3718b49 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. 
*/ -#ident "$Id: vxfs_bmap.c,v 1.22 2001/05/26 22:41:23 hch Exp hch $" +#ident "$Id: vxfs_bmap.c,v 1.23 2001/07/05 19:48:03 hch Exp hch $" /* * Veritas filesystem driver - filesystem to disk block mapping. diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index ff9e35f1a7d2..32f37e612f48 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -30,7 +30,7 @@ #ifndef _VXFS_EXTERN_H_ #define _VXFS_EXTERN_H_ -#ident "$Id: vxfs_extern.h,v 1.20 2001/04/26 22:48:44 hch Exp hch $" +#ident "$Id: vxfs_extern.h,v 1.21 2001/08/07 16:13:30 hch Exp hch $" /* * Veritas filesystem driver - external prototypes. @@ -55,8 +55,9 @@ extern int vxfs_read_fshead(struct super_block *); /* vxfs_inode.c */ extern struct kmem_cache_s *vxfs_inode_cachep; extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); -extern struct inode * vxfs_fake_inode(struct super_block *, +extern struct inode * vxfs_get_fake_inode(struct super_block *, struct vxfs_inode_info *); +extern void vxfs_put_fake_inode(struct inode *); extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); extern void vxfs_read_inode(struct inode *); diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index a2249898af6a..942bfa087884 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#ident "$Id: vxfs_fshead.c,v 1.18 2001/04/25 18:11:23 hch Exp $" +#ident "$Id: vxfs_fshead.c,v 1.19 2001/08/07 16:14:10 hch Exp hch $" /* * Veritas filesystem driver - fileset header routines. 
@@ -124,7 +124,7 @@ vxfs_read_fshead(struct super_block *sbp) vxfs_dumpi(vip, infp->vsi_fshino); #endif - if (!(infp->vsi_fship = vxfs_fake_inode(sbp, vip))) { + if (!(infp->vsi_fship = vxfs_get_fake_inode(sbp, vip))) { printk(KERN_ERR "vxfs: unabled to get fsh inode\n"); return -EINVAL; } @@ -148,7 +148,7 @@ vxfs_read_fshead(struct super_block *sbp) #endif tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]); - if (!tip || ((infp->vsi_stilist = vxfs_fake_inode(sbp, tip)) == NULL)) { + if (!tip || ((infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip)) == NULL)) { printk(KERN_ERR "vxfs: unabled to get structual list inode\n"); return -EINVAL; } else if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { @@ -158,7 +158,7 @@ vxfs_read_fshead(struct super_block *sbp) } tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]); - if (!tip || ((infp->vsi_ilist = vxfs_fake_inode(sbp, tip)) == NULL)) { + if (!tip || ((infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip)) == NULL)) { printk(KERN_ERR "vxfs: unabled to get inode list inode\n"); return -EINVAL; } else if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) { diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1eb34b28a5d5..a06f13f3b4a1 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#ident "$Id: vxfs_inode.c,v 1.36 2001/05/26 22:28:02 hch Exp hch $" +#ident "$Id: vxfs_inode.c,v 1.37 2001/08/07 16:13:30 hch Exp hch $" /* * Veritas filesystem driver - inode routines. @@ -47,6 +47,7 @@ extern struct address_space_operations vxfs_immed_aops; extern struct inode_operations vxfs_immed_symlink_iops; static struct file_operations vxfs_file_operations = { + .llseek = generic_file_llseek, .read = generic_file_read, .mmap = generic_file_mmap, }; @@ -93,7 +94,7 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) * NOTE: * While __vxfs_iget uses the pagecache vxfs_blkiget uses the * buffercache. 
This function should not be used outside the - * read_super() method, othwerwise the data may be incoherent. + * read_super() method, otherwise the data may be incoherent. */ struct vxfs_inode_info * vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) @@ -251,7 +252,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) } /** - * vxfs_fake_inode - get fake inode structure + * vxfs_get_fake_inode - get fake inode structure * @sbp: filesystem superblock * @vip: fspriv inode * @@ -261,7 +262,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) * Returns the filled VFS inode. */ struct inode * -vxfs_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) +vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) { struct inode *ip = NULL; @@ -272,6 +273,19 @@ vxfs_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) return (ip); } +/** + * vxfs_put_fake_inode - free faked inode + * @ip: VFS inode + * + * Description: + * vxfs_put_fake_inode frees all data associated with @ip. + */ +void +vxfs_put_fake_inode(struct inode *ip) +{ + iput(ip); +} + /** * vxfs_read_inode - fill in inode information * @ip: inode pointer to fill diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index b12934494e6c..44bfbd06671c 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#ident "$Id: vxfs_olt.c,v 1.8 2001/04/25 18:11:23 hch Exp hch $" +#ident "$Id: vxfs_olt.c,v 1.9 2001/08/07 16:14:45 hch Exp hch $" /* * Veritas filesystem driver - object location table support. 
@@ -56,11 +56,11 @@ vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) } static __inline__ u_long -vxfs_oblock(daddr_t oblock, u_long bsize) +vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) { - if ((oblock * BLOCK_SIZE) % bsize) + if (sbp->s_blocksize % bsize) BUG(); - return ((oblock * BLOCK_SIZE) / bsize); + return (block * (sbp->s_blocksize / bsize)); } @@ -85,7 +85,8 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize) char *oaddr, *eaddr; - bp = bread(sbp->s_dev, vxfs_oblock(infp->vsi_oltext, bsize), bsize); + bp = bread(sbp->s_dev, + vxfs_oblock(sbp, infp->vsi_oltext, bsize), bsize); if (!bp || !bp->b_data) goto fail; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index f76c472fb71a..9a06acb8d50d 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#ident "$Id: vxfs_super.c,v 1.25 2001/05/25 18:25:55 hch Exp hch $" +#ident "$Id: vxfs_super.c,v 1.26 2001/08/07 16:13:30 hch Exp hch $" /* * Veritas filesystem driver - superblock related routines. @@ -54,7 +54,6 @@ MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); static void vxfs_put_super(struct super_block *); static int vxfs_statfs(struct super_block *, struct statfs *); - static struct super_operations vxfs_super_ops = { .read_inode = vxfs_read_inode, .put_inode = vxfs_put_inode, @@ -83,14 +82,15 @@ vxfs_validate_bsize(kdev_t dev) * vxfs_put_super frees all resources allocated for @sbp * after the last instance of the filesystem is unmounted. 
*/ + static void vxfs_put_super(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); - vxfs_put_inode(infp->vsi_fship); - vxfs_put_inode(infp->vsi_ilist); - vxfs_put_inode(infp->vsi_stilist); + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); brelse(infp->vsi_bp); kfree(infp); @@ -135,7 +135,7 @@ vxfs_statfs(struct super_block *sbp, struct statfs *bufp) * vxfs_read_super - read superblock into memory and initalize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data - * @silent: ??? + * @silent: do not complain loudly when sth is wrong * * Description: * We are called on the first mount of a filesystem to read the @@ -167,18 +167,23 @@ vxfs_read_super(struct super_block *sbp, void *dp, int silent) bp = bread(dev, 1, bsize); if (!bp) { - printk(KERN_WARNING "vxfs: unable to read disk superblock\n"); + if (!silent) { + printk(KERN_WARNING + "vxfs: unable to read disk superblock\n"); + } goto out; } rsbp = (struct vxfs_sb *)bp->b_data; if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { - printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); + if (!silent) + printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); goto out; } - if (rsbp->vs_version < 2 || rsbp->vs_version > 4) { - printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", rsbp->vs_version); + if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { + printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", + rsbp->vs_version); goto out; } @@ -188,6 +193,7 @@ vxfs_read_super(struct super_block *sbp, void *dp, int silent) #endif sbp->s_magic = rsbp->vs_magic; + sbp->s_blocksize = rsbp->vs_bsize; sbp->u.generic_sbp = (void *)infp; infp->vsi_raw = rsbp; @@ -195,7 +201,6 @@ vxfs_read_super(struct super_block *sbp, void *dp, int silent) infp->vsi_oltext = rsbp->vs_oltext[0]; infp->vsi_oltsize = rsbp->vs_oltsize; - sbp->s_blocksize = rsbp->vs_bsize; switch (rsbp->vs_bsize) { case 1024: @@ -208,8 +213,11 
@@ vxfs_read_super(struct super_block *sbp, void *dp, int silent) sbp->s_blocksize_bits = 12; break; default: - printk(KERN_WARNING "vxfs: unsupported blocksise: %d\n", + if (!silent) { + printk(KERN_WARNING + "vxfs: unsupported blocksise: %d\n", rsbp->vs_bsize); + } goto out; } @@ -220,20 +228,28 @@ vxfs_read_super(struct super_block *sbp, void *dp, int silent) if (vxfs_read_fshead(sbp)) { printk(KERN_WARNING "vxfs: unable to read fshead\n"); - return NULL; + goto out; } sbp->s_op = &vxfs_super_ops; - if ((sbp->s_root = d_alloc_root(iget(sbp, VXFS_ROOT_INO)))) - return (sbp); + sbp->s_root = d_alloc_root(iget(sbp, VXFS_ROOT_INO)); + if (!sbp->s_root) { + printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); + goto out_free_ilist; + } + + return (sbp); - printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); +out_free_ilist: + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); out: + brelse(bp); kfree(infp); return NULL; } - /* * The usual module blurb. */ @@ -246,7 +262,7 @@ vxfs_init(void) sizeof(struct vxfs_inode_info), 0, 0, NULL, NULL); if (vxfs_inode_cachep) return (register_filesystem(&vxfs_fs_type)); - return 0; + return -ENOMEM; } static void __exit diff --git a/fs/namei.c b/fs/namei.c index e2a0efa0e1bf..16be452df7ca 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -139,35 +139,55 @@ char * getname(const char * filename) } /* - * permission() + * vfs_permission() * * is used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things.. 
*/ -int vfs_permission(struct inode * inode,int mask) +int vfs_permission(struct inode * inode, int mask) { - int mode = inode->i_mode; + umode_t mode = inode->i_mode; - if ((mask & S_IWOTH) && IS_RDONLY(inode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; /* Nobody gets write access to a read-only fs */ + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; - if ((mask & S_IWOTH) && IS_IMMUTABLE(inode)) - return -EACCES; /* Nobody gets write access to an immutable file */ + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } if (current->fsuid == inode->i_uid) mode >>= 6; else if (in_group_p(inode->i_gid)) mode >>= 3; - if (((mode & mask & S_IRWXO) == mask) || capable(CAP_DAC_OVERRIDE)) + /* + * If the DACs are ok we don't need any capability check. + */ + if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) return 0; - /* read and search access */ - if ((mask == S_IROTH) || - (S_ISDIR(inode->i_mode) && !(mask & ~(S_IROTH | S_IXOTH)))) + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if ((mask & (MAY_READ|MAY_WRITE)) || (inode->i_mode & S_IXUGO)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. 
+ */ + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) if (capable(CAP_DAC_READ_SEARCH)) return 0; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index eb9eb64b994f..105708764535 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -499,6 +499,7 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, unsigned long border = 0; unsigned long bstart = 0; unsigned long hash_in, hash_out; + unsigned long saved_search_start=search_start; int allocated[PREALLOCATION_SIZE]; int blks; @@ -604,7 +605,15 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, ** and should probably be removed */ if ( search_start < border ) search_start=border; - + + /* If the disk free space is already below 10% we should + ** start looking for the free blocks from the beginning + ** of the partition, before the border line. + */ + if ( SB_FREE_BLOCKS(th->t_super) <= (SB_BLOCK_COUNT(th->t_super) / 10) ) { + search_start=saved_search_start; + } + *free_blocknrs = 0; blks = PREALLOCATION_SIZE-1; for (blks_gotten=0; blks_gotteni_blocks = 0; unlock_kernel() ; } @@ -525,16 +526,26 @@ int reiserfs_get_block (struct inode * inode, long block, int fs_gen; int windex ; struct reiserfs_transaction_handle th ; - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 ; + /* space reserved in transaction batch: + . 3 balancings in direct->indirect conversion + . 1 block involved into reiserfs_update_sd() + XXX in practically impossible worst case direct2indirect() + can incur (much) more than 3 balancings. */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1; int version; int transaction_started = 0 ; - loff_t new_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; + loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; /* bad.... 
*/ lock_kernel() ; th.t_trans_id = 0 ; version = inode_items_version (inode); + if (block < 0) { + unlock_kernel(); + return -EIO; + } + if (!file_capable (inode, block)) { unlock_kernel() ; return -EFBIG; @@ -552,20 +563,14 @@ int reiserfs_get_block (struct inode * inode, long block, return ret; } - if (block < 0) { - unlock_kernel(); - return -EIO; - } - inode->u.reiserfs_i.i_pack_on_close = 1 ; windex = push_journal_writer("reiserfs_get_block") ; /* set the key of the first byte in the 'block'-th block of file */ - make_cpu_key (&key, inode, - (loff_t)block * inode->i_sb->s_blocksize + 1, // k_offset + make_cpu_key (&key, inode, new_offset, TYPE_ANY, 3/*key length*/); - if ((new_offset + inode->i_sb->s_blocksize) >= inode->i_size) { + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { journal_begin(&th, inode->i_sb, jbegin_count) ; transaction_started = 1 ; } @@ -618,10 +623,13 @@ int reiserfs_get_block (struct inode * inode, long block, } if (indirect_item_found (retval, ih)) { + b_blocknr_t unfm_ptr; + /* 'block'-th block is in the file already (there is corresponding cell in some indirect item). 
But it may be zero unformatted node pointer (hole) */ - if (!item[pos_in_item]) { + unfm_ptr = le32_to_cpu (item[pos_in_item]); + if (unfm_ptr == 0) { /* use allocated block to plug the hole */ reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { @@ -630,15 +638,14 @@ int reiserfs_get_block (struct inode * inode, long block, } bh_result->b_state |= (1UL << BH_New); item[pos_in_item] = cpu_to_le32 (allocated_block_nr); + unfm_ptr = allocated_block_nr; journal_mark_dirty (&th, inode->i_sb, bh); inode->i_blocks += (inode->i_sb->s_blocksize / 512) ; reiserfs_update_sd(&th, inode) ; } - set_block_dev_mapped(bh_result, le32_to_cpu (item[pos_in_item]), inode); + set_block_dev_mapped(bh_result, unfm_ptr, inode); pathrelse (&path); -#ifdef REISERFS_CHECK pop_journal_writer(windex) ; -#endif /* REISERFS_CHECK */ if (transaction_started) journal_end(&th, inode->i_sb, jbegin_count) ; @@ -815,8 +822,8 @@ int reiserfs_get_block (struct inode * inode, long block, goto failure; } if (retval == POSITION_FOUND) { - reiserfs_warning ("vs-: reiserfs_get_block: " - "%k should not be found", &key); + reiserfs_warning ("vs-825: reiserfs_get_block: " + "%k should not be found\n", &key); retval = -EEXIST; if (allocated_block_nr) reiserfs_free_block (&th, allocated_block_nr); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 67757b387cbc..97e301c6a31d 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -815,7 +815,7 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reis ** called by flush_journal_list, before it calls remove_all_from_journal_list ** */ -static int update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { +static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { struct reiserfs_journal_header *jh ; if (trans_id >= 
SB_JOURNAL(p_s_sb)->j_last_flush_trans_id) { if (buffer_locked((SB_JOURNAL(p_s_sb)->j_header_bh))) { @@ -834,12 +834,21 @@ static int update_journal_header_block(struct super_block *p_s_sb, unsigned long ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { - reiserfs_panic(p_s_sb, "journal-712: buffer write failed\n") ; + printk( "reiserfs: journal-837: IO error during journal replay\n" ); + return -EIO ; } } return 0 ; } +static int update_journal_header_block(struct super_block *p_s_sb, + unsigned long offset, + unsigned long trans_id) { + if (_update_journal_header_block(p_s_sb, offset, trans_id)) { + reiserfs_panic(p_s_sb, "journal-712: buffer write failed\n") ; + } + return 0 ; +} /* ** flush any and all journal lists older than you are ** can only be called from flush_journal_list @@ -1374,6 +1383,9 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffe struct buffer_head *c_bh ; unsigned long offset ; + if (!d_bh) + return 0 ; + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; if (le32_to_cpu(desc->j_len) > 0 && !memcmp(desc->j_magic, JOURNAL_DESC_MAGIC, 8)) { if (oldest_invalid_trans_id && *oldest_invalid_trans_id && le32_to_cpu(desc->j_trans_id) > *oldest_invalid_trans_id) { @@ -1641,8 +1653,6 @@ static int journal_read(struct super_block *p_s_sb) { if (continue_replay && is_read_only(p_s_sb->s_dev)) { printk("clm-2076: device is readonly, unable to replay log\n") ; - brelse(SB_JOURNAL(p_s_sb)->j_header_bh) ; - SB_JOURNAL(p_s_sb)->j_header_bh = NULL ; return -1 ; } if (continue_replay && (p_s_sb->s_flags & MS_RDONLY)) { @@ -1734,9 +1744,14 @@ static int journal_read(struct super_block *p_s_sb) { printk("reiserfs: replayed %d transactions in %lu seconds\n", replay_count, CURRENT_TIME - start) ; } - if (!is_read_only(p_s_sb->s_dev)) { - update_journal_header_block(p_s_sb, SB_JOURNAL(p_s_sb)->j_start, - 
SB_JOURNAL(p_s_sb)->j_last_flush_trans_id) ; + if (!is_read_only(p_s_sb->s_dev) && + _update_journal_header_block(p_s_sb, SB_JOURNAL(p_s_sb)->j_start, + SB_JOURNAL(p_s_sb)->j_last_flush_trans_id)) + { + /* replay failed, caller must call free_journal_ram and abort + ** the mount + */ + return -1 ; } return 0 ; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 74ebaaa9a994..a31e73b269fe 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1857,6 +1857,7 @@ void reiserfs_do_truncate (struct reiserfs_transaction_handle *th, return; } if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + pathrelse (&s_search_path); reiserfs_warning ("PAP-5660: reiserfs_do_truncate: " "wrong result %d of search for %K\n", retval, &s_item_key); return; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ca96e53eeaf1..943629d1ecf0 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -29,19 +29,11 @@ #endif -#define SUPPORT_OLD_FORMAT #define REISERFS_OLD_BLOCKSIZE 4096 #define REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ 20 -#if 0 -// this one is not used currently -inline void reiserfs_mark_buffer_dirty (struct buffer_head * bh, int flag) -{ - mark_buffer_dirty (bh, flag); -} -#endif // // a portion of this function, particularly the VFS interface portion, @@ -367,98 +359,34 @@ void check_bitmap (struct super_block * s) free, SB_FREE_BLOCKS (s)); } -#ifdef SUPPORT_OLD_FORMAT - -/* support old disk layout */ -static int read_old_super_block (struct super_block * s, int size) -{ - struct buffer_head * bh; - struct reiserfs_super_block * rs; - - printk("read_old_super_block: try to find super block in old location\n"); - /* there are only 4k-sized blocks in v3.5.10 */ - if (size != REISERFS_OLD_BLOCKSIZE) - set_blocksize(s->s_dev, REISERFS_OLD_BLOCKSIZE); - bh = bread (s->s_dev, - REISERFS_OLD_DISK_OFFSET_IN_BYTES / REISERFS_OLD_BLOCKSIZE, - REISERFS_OLD_BLOCKSIZE); - if (!bh) { - printk("read_old_super_block: unable to read superblock on dev %s\n", 
kdevname(s->s_dev)); - return 1; - } - - rs = (struct reiserfs_super_block *)bh->b_data; - if (strncmp (rs->s_magic, REISERFS_SUPER_MAGIC_STRING, strlen ( REISERFS_SUPER_MAGIC_STRING))) { - /* pre-journaling version check */ - if(!strncmp((char*)rs + REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ, - REISERFS_SUPER_MAGIC_STRING, strlen(REISERFS_SUPER_MAGIC_STRING))) { - printk("read_old_super_blockr: a pre-journaling reiserfs filesystem isn't suitable there.\n"); - brelse(bh); - return 1; - } - - brelse (bh); - printk ("read_old_super_block: can't find a reiserfs filesystem on dev %s.\n", kdevname(s->s_dev)); - return 1; - } - if(REISERFS_OLD_BLOCKSIZE != le16_to_cpu (rs->s_blocksize)) { - printk("read_old_super_block: blocksize mismatch, super block corrupted\n"); - brelse(bh); - return 1; - } - - s->s_blocksize = REISERFS_OLD_BLOCKSIZE; - s->s_blocksize_bits = 0; - while ((1 << s->s_blocksize_bits) != s->s_blocksize) - s->s_blocksize_bits ++; - SB_BUFFER_WITH_SB (s) = bh; - SB_DISK_SUPER_BLOCK (s) = rs; - s->s_op = &reiserfs_sops; - return 0; -} -#endif - -// -// FIXME: mounting old filesystems we _must_ change magic string to -// make then unmountable by reiserfs of 3.5.x -// -static int read_super_block (struct super_block * s, int size) +static int read_super_block (struct super_block * s, int size, int offset) { struct buffer_head * bh; struct reiserfs_super_block * rs; - bh = bread (s->s_dev, REISERFS_DISK_OFFSET_IN_BYTES / size, size); + + bh = bread (s->s_dev, offset / size, size); if (!bh) { - printk("read_super_block: unable to read superblock on dev %s\n", kdevname(s->s_dev)); + printk ("read_super_block: " + "bread failed (dev %s, block %d, size %d)\n", + kdevname (s->s_dev), offset / size, size); return 1; } rs = (struct reiserfs_super_block *)bh->b_data; if (!is_reiserfs_magic_string (rs)) { - printk ("read_super_block: can't find a reiserfs filesystem on dev %s\n", - kdevname(s->s_dev)); + printk ("read_super_block: " + "can't find a reiserfs filesystem on 
(dev %s, block %lu, size %d)\n", + kdevname(s->s_dev), bh->b_blocknr, size); brelse (bh); return 1; } // - // ok, reiserfs signature (old or new) found in 64-th 1k block of - // the device + // ok, reiserfs signature (old or new) found at the given offset // - -#ifndef SUPPORT_OLD_FORMAT - // with SUPPORT_OLD_FORMAT undefined - detect old format by - // checking super block version - if (le16_to_cpu (rs->s_version) != REISERFS_VERSION_2) { - brelse (bh); - printk ("read_super_block: unsupported version (%d) of reiserfs found on dev %s\n", - le16_to_cpu (rs->s_version), kdevname(s->s_dev)); - return 1; - } -#endif - s->s_blocksize = le16_to_cpu (rs->s_blocksize); s->s_blocksize_bits = 0; while ((1 << s->s_blocksize_bits) != s->s_blocksize) @@ -468,17 +396,22 @@ static int read_super_block (struct super_block * s, int size) if (s->s_blocksize != size) set_blocksize (s->s_dev, s->s_blocksize); - bh = reiserfs_bread (s->s_dev, REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize, s->s_blocksize); + + bh = bread (s->s_dev, offset / s->s_blocksize, s->s_blocksize); if (!bh) { - printk("read_super_block: unable to read superblock on dev %s\n", kdevname(s->s_dev)); + printk ("read_super_block: " + "bread failed (dev %s, block %d, size %d)\n", + kdevname (s->s_dev), offset / size, size); return 1; } rs = (struct reiserfs_super_block *)bh->b_data; if (!is_reiserfs_magic_string (rs) || le16_to_cpu (rs->s_blocksize) != s->s_blocksize) { + printk ("read_super_block: " + "can't find a reiserfs filesystem on (dev %s, block %lu, size %d)\n", + kdevname(s->s_dev), bh->b_blocknr, size); brelse (bh); - printk ("read_super_block: can't find a reiserfs filesystem on dev %s.\n", kdevname(s->s_dev)); return 1; } /* must check to be sure we haven't pulled an old format super out @@ -489,7 +422,8 @@ static int read_super_block (struct super_block * s, int size) if (bh->b_blocknr >= le32_to_cpu(rs->s_journal_block) && bh->b_blocknr < (le32_to_cpu(rs->s_journal_block) + JOURNAL_BLOCK_COUNT)) { 
brelse(bh) ; - printk("super-459: read_super_block: super found at block %lu is within its own log. " + printk("super-459: read_super_block: " + "super found at block %lu is within its own log. " "It must not be of this format type.\n", bh->b_blocknr) ; return 1 ; } @@ -504,6 +438,8 @@ static int read_super_block (struct super_block * s, int size) return 0; } + + /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { int i ; @@ -712,15 +648,12 @@ struct super_block * reiserfs_read_super (struct super_block * s, void * data, i } /* read block (64-th 1k block), which can contain reiserfs super block */ - if (read_super_block (s, size)) { -#ifdef SUPPORT_OLD_FORMAT + if (read_super_block (s, size, REISERFS_DISK_OFFSET_IN_BYTES)) { // try old format (undistributed bitmap, super block in 8-th 1k block of a device) - if(read_old_super_block(s,size)) + if (read_super_block (s, size, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) goto error; else old_format = 1; -#endif - goto error ; } s->u.reiserfs_sb.s_mount_state = le16_to_cpu (SB_DISK_SUPER_BLOCK (s)->s_state); /* journal victim */ @@ -779,16 +712,23 @@ struct super_block * reiserfs_read_super (struct super_block * s, void * data, i if (!(s->s_flags & MS_RDONLY)) { struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + int old_magic; + + old_magic = strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, + strlen ( REISER2FS_SUPER_MAGIC_STRING)); + if( old_magic && le16_to_cpu(rs->s_version) != 0 ) { + dput(s->s_root) ; + s->s_root = NULL ; + reiserfs_warning("reiserfs: wrong version/magic combination in the super-block\n") ; + goto error ; + } journal_begin(&th, s, 1) ; reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; rs->s_state = cpu_to_le16 (REISERFS_ERROR_FS); - if (strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, - strlen ( REISER2FS_SUPER_MAGIC_STRING))) { - if (le16_to_cpu(rs->s_version) != 0) - BUG (); + if ( old_magic ) { // filesystem 
created under 3.5.x found if (!old_format_only (s)) { reiserfs_warning("reiserfs: converting 3.5.x filesystem to the new format\n") ; diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile index 997a9d3226bf..4c574eda0242 100644 --- a/fs/sysv/Makefile +++ b/fs/sysv/Makefile @@ -9,7 +9,8 @@ O_TARGET := sysv.o -obj-y := ialloc.o balloc.o inode.o itree.o file.o dir.o namei.o super.o +obj-y := ialloc.o balloc.o inode.o itree.o file.o dir.o \ + namei.o super.o symlink.o obj-m := $(O_TARGET) include $(TOPDIR)/Rules.make diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c index c9b2f90462ad..1d76bb94148f 100644 --- a/fs/sysv/balloc.c +++ b/fs/sysv/balloc.c @@ -46,6 +46,14 @@ void sysv_free_block(struct super_block * sb, u32 nr) unsigned count; unsigned block = fs32_to_cpu(sb, nr); + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only no one + * should call this for an AFS filesystem anyway... + */ + if (sb->sv_type == FSTYPE_AFS) + return; + if (block < sb->sv_firstdatazone || block >= sb->sv_nzones) { printk("sysv_free_block: trying to free block not in datazone\n"); return; @@ -154,6 +162,14 @@ unsigned long sysv_count_free_blocks(struct super_block * sb) unsigned block; int n; + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only we just + * lie and say it has no free block at all. 
+ */ + if (sb->sv_type == FSTYPE_AFS) + return 0; + lock_super(sb); sb_count = fs32_to_cpu(sb, *sb->sv_free_blocks); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index fd34fad5b350..c24f41e2607e 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -131,8 +131,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_fop = &sysv_dir_operations; inode->i_mapping->a_ops = &sysv_aops; } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &sysv_symlink_inode_operations; - inode->i_mapping->a_ops = &sysv_aops; + if (inode->i_blocks) { + inode->i_op = &sysv_symlink_inode_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else + inode->i_op = &sysv_fast_symlink_inode_operations; } else init_special_inode(inode, inode->i_mode, rdev); } @@ -196,7 +199,6 @@ int sysv_notify_change(struct dentry *dentry, struct iattr *attr) attr->ia_mode = COH_KLUDGE_NOT_SYMLINK; inode_setattr(inode, attr); - return 0; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 4c3b14e523c1..66e46552d48b 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -26,11 +26,16 @@ #include #include -/* The following functions try to recognize specific filesystems. +/* + * The following functions try to recognize specific filesystems. + * * We recognize: * - Xenix FS by its magic number. * - SystemV FS by its magic number. * - Coherent FS by its funny fname/fpack field. + * - SCO AFS by s_nfree == 0xffff + * - V7 FS has no distinguishing features. + * * We discriminate among SystemV4 and SystemV2 FS by the assumption that * the time stamp is not < 01-01-1980. 
*/ @@ -197,7 +202,19 @@ static int detect_sysv (struct super_block *sb, struct buffer_head *bh) sb->sv_bytesex = BYTESEX_BE; else return 0; - if (sbd->s_time < JAN_1_1980) { + + if (fs16_to_cpu(sb, sbd->s_nfree) == 0xffff) { + sb->sv_type = FSTYPE_AFS; + if (!(sb->s_flags & MS_RDONLY)) { + printk("SysV FS: SCO EAFS on %s detected, " + "forcing read-only mode.\n", + bdevname(sb->s_dev)); + sb->s_flags |= MS_RDONLY; + } + return sbd->s_type; + } + + if (fs32_to_cpu(sb, sbd->s_time) < JAN_1_1980) { /* this is likely to happen on SystemV2 FS */ if (sbd->s_type > 3 || sbd->s_type < 1) return 0; @@ -261,6 +278,7 @@ static char *flavour_names[] = { [FSTYPE_SYSV2] "SystemV Release 2", [FSTYPE_COH] "Coherent", [FSTYPE_V7] "V7", + [FSTYPE_AFS] "AFS", }; static void (*flavour_setup[])(struct super_block *) = { @@ -269,6 +287,7 @@ static void (*flavour_setup[])(struct super_block *) = { [FSTYPE_SYSV2] detected_sysv2, [FSTYPE_COH] detected_coherent, [FSTYPE_V7] detected_v7, + [FSTYPE_AFS] detected_sysv4, }; static int complete_read_super(struct super_block *sb, int silent, int size) @@ -294,7 +313,8 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sb->sv_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); sb->sv_ind_per_block_bits = n_bits-2; - sb->sv_ninodes = (sb->sv_firstdatazone - sb->sv_firstinodezone) << sb->sv_inodes_per_block_bits; + sb->sv_ninodes = (sb->sv_firstdatazone - sb->sv_firstinodezone) + << sb->sv_inodes_per_block_bits; sb->s_blocksize = bsize; sb->s_blocksize_bits = n_bits; @@ -346,13 +366,10 @@ static struct super_block *sysv_read_super(struct super_block *sb, sb->sv_block_base = 0; for (i = 0; i < sizeof(flavours)/sizeof(flavours[0]) && !size; i++) { - struct buffer_head *next_bh; - next_bh = bread(dev, flavours[i].block, BLOCK_SIZE); - if (!next_bh) - continue; brelse(bh); - bh = next_bh; - + bh = bread(dev, flavours[i].block, BLOCK_SIZE); + if (!bh) + continue; size = flavours[i].test(sb, bh); } @@ -411,8 +428,10 
@@ Ebadsize: static struct super_block *v7_read_super(struct super_block *sb,void *data, int silent) { - struct buffer_head *bh; + struct buffer_head *bh, *bh2 = NULL; kdev_t dev = sb->s_dev; + struct v7_super_block *v7sb; + struct sysv_inode *v7i; if (440 != sizeof (struct v7_super_block)) panic("V7 FS: bad super-block size"); @@ -422,23 +441,41 @@ static struct super_block *v7_read_super(struct super_block *sb,void *data, sb->sv_type = FSTYPE_V7; sb->sv_bytesex = BYTESEX_PDP; - set_blocksize(dev,512); + set_blocksize(dev, 512); if ((bh = bread(dev, 1, 512)) == NULL) { if (!silent) - printk("VFS: unable to read V7 FS superblock on device " - "%s.\n", bdevname(dev)); + printk("VFS: unable to read V7 FS superblock on " + "device %s.\n", bdevname(dev)); goto failed; } + /* plausibility check on superblock */ + v7sb = (struct v7_super_block *) bh->b_data; + if (fs16_to_cpu(sb,v7sb->s_nfree) > V7_NICFREE || + fs16_to_cpu(sb,v7sb->s_ninode) > V7_NICINOD || + fs32_to_cpu(sb,v7sb->s_time) == 0) + goto failed; + + /* plausibility check on root inode: it is a directory, + with a nonzero size that is a multiple of 16 */ + if ((bh2 = bread(dev, 2, 512)) == NULL) + goto failed; + v7i = (struct sysv_inode *)(bh2->b_data + 64); + if ((fs16_to_cpu(sb,v7i->i_mode) & ~0777) != S_IFDIR || + (fs32_to_cpu(sb,v7i->i_size) == 0) || + (fs32_to_cpu(sb,v7i->i_size) & 017) != 0) + goto failed; + brelse(bh2); sb->sv_bh1 = bh; sb->sv_bh2 = bh; if (complete_read_super(sb, silent, 1)) return sb; - brelse(bh); failed: + brelse(bh2); + brelse(bh); return NULL; } diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c new file mode 100644 index 000000000000..d6840317762c --- /dev/null +++ b/fs/sysv/symlink.c @@ -0,0 +1,25 @@ +/* + * linux/fs/sysv/symlink.c + * + * Handling of System V filesystem fast symlinks extensions. 
+ * Aug 2001, Christoph Hellwig (hch@caldera.de) + */ + +#include + +static int sysv_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + char *s = (char *)dentry->d_inode->u.sysv_i.i_data; + return vfs_readlink(dentry, buffer, buflen, s); +} + +static int sysv_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = (char *)dentry->d_inode->u.sysv_i.i_data; + return vfs_follow_link(nd, s); +} + +struct inode_operations sysv_fast_symlink_inode_operations = { + readlink: sysv_readlink, + follow_link: sysv_follow_link, +}; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index aaa89db79be4..fa422a86f02a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -81,11 +81,6 @@ extern void lock_page(struct page *page); #define find_lock_page(mapping, index) \ __find_lock_page(mapping, index, page_hash(mapping, index)) -extern struct page * __find_get_swapcache_page (struct address_space * mapping, - unsigned long index, struct page **hash); -#define find_get_swapcache_page(mapping, index) \ - __find_get_swapcache_page(mapping, index, page_hash(mapping, index)) - extern void __add_page_to_hash_queue(struct page * page, struct page **p); extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); diff --git a/include/linux/sched.h b/include/linux/sched.h index 72e79c9ff113..94957c39b4c3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -749,6 +749,7 @@ extern void exit_mm(struct task_struct *); extern void exit_files(struct task_struct *); extern void exit_sighand(struct task_struct *); +extern void reparent_to_init(void); extern void daemonize(void); extern int do_execve(char *, char **, char **, struct pt_regs *); diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h index b88f13aff2a6..60a38abf536c 100644 --- a/include/linux/sysv_fs.h +++ b/include/linux/sysv_fs.h @@ -325,6 +325,7 @@ enum { FSTYPE_SYSV2, FSTYPE_COH, FSTYPE_V7, + FSTYPE_AFS, 
FSTYPE_END, }; @@ -373,6 +374,7 @@ extern ino_t sysv_inode_by_name(struct dentry*); extern struct inode_operations sysv_file_inode_operations; extern struct inode_operations sysv_dir_inode_operations; +extern struct inode_operations sysv_fast_symlink_inode_operations; extern struct file_operations sysv_file_operations; extern struct file_operations sysv_dir_operations; extern struct address_space_operations sysv_aops; diff --git a/kernel/fork.c b/kernel/fork.c index 55cb9e33ae6b..b692f28c2bf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -8,7 +8,7 @@ * 'fork.c' contains the help-routines for the 'fork' system call * (see also entry.S and others). * Fork is rather simple, once you get the hang of it, but the memory - * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()' + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ #include @@ -134,9 +134,22 @@ static inline int dup_mmap(struct mm_struct * mm) mm->mmap_avl = NULL; mm->mmap_cache = NULL; mm->map_count = 0; + mm->rss = 0; mm->cpu_vm_mask = 0; mm->swap_address = 0; pprev = &mm->mmap; + + /* + * Add it to the mmlist after the parent. + * Doing it this way means that we can order the list, + * and fork() won't mess up the ordering significantly. + * Add it first so that swapoff can see any swap entries. 
+ */ + spin_lock(&mmlist_lock); + list_add(&mm->mmlist, &current->mm->mmlist); + mmlist_nr++; + spin_unlock(&mmlist_lock); + for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { struct file *file; @@ -149,7 +162,6 @@ static inline int dup_mmap(struct mm_struct * mm) *tmp = *mpnt; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; - mm->map_count++; tmp->vm_next = NULL; file = tmp->vm_file; if (file) { @@ -168,17 +180,19 @@ static inline int dup_mmap(struct mm_struct * mm) spin_unlock(&inode->i_mapping->i_shared_lock); } - /* Copy the pages, but defer checking for errors */ - retval = copy_page_range(mm, current->mm, tmp); - if (!retval && tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - /* - * Link in the new vma even if an error occurred, - * so that exit_mmap() can clean up the mess. + * Link in the new vma and copy the page table entries: + * link in first so that swapoff can see swap entries. */ + spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; + mm->map_count++; + retval = copy_page_range(mm, current->mm, tmp); + spin_unlock(&mm->page_table_lock); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); if (retval) goto fail_nomem; @@ -246,6 +260,9 @@ inline void __mmdrop(struct mm_struct *mm) void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { + extern struct mm_struct *swap_mm; + if (swap_mm == mm) + swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -320,18 +337,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) retval = dup_mmap(mm); up_write(&oldmm->mmap_sem); - /* - * Add it to the mmlist after the parent. - * - * Doing it this way means that we can order - * the list, and fork() won't mess up the - * ordering significantly. 
- */ - spin_lock(&mmlist_lock); - list_add(&mm->mmlist, &oldmm->mmlist); - mmlist_nr++; - spin_unlock(&mmlist_lock); - if (retval) goto free_pt; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index de0370e2157a..a717792c5f96 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -478,6 +478,7 @@ EXPORT_SYMBOL(secure_tcp_sequence_number); EXPORT_SYMBOL(get_random_bytes); EXPORT_SYMBOL(securebits); EXPORT_SYMBOL(cap_bset); +EXPORT_SYMBOL(reparent_to_init); EXPORT_SYMBOL(daemonize); EXPORT_SYMBOL(csum_partial); /* for networking and md */ diff --git a/kernel/sched.c b/kernel/sched.c index 2de4e334cb48..deb5854e89c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -107,6 +107,7 @@ static union { #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule struct kernel_stat kstat; +extern struct task_struct *child_reaper; #ifdef CONFIG_SMP @@ -1215,6 +1216,59 @@ void show_state(void) read_unlock(&tasklist_lock); } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. 
+ */ +void reparent_to_init(void) +{ + struct task_struct *this_task = current; + + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(this_task); + this_task->p_pptr = child_reaper; + this_task->p_opptr = child_reaper; + SET_LINKS(this_task); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + if (this_task->exit_signal != 0) { + printk(KERN_ERR "task `%s' exit_signal %d in " + __FUNCTION__ "\n", + this_task->comm, this_task->exit_signal); + } + this_task->exit_signal = SIGCHLD; + + /* We also take the runqueue_lock while altering task fields + * which affect scheduling decisions */ + spin_lock(&runqueue_lock); + + this_task->ptrace = 0; + this_task->nice = DEF_NICE; + this_task->policy = SCHED_OTHER; + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + this_task->cap_effective = CAP_INIT_EFF_SET; + this_task->cap_inheritable = CAP_INIT_INH_SET; + this_task->cap_permitted = CAP_FULL_SET; + this_task->keep_capabilities = 0; + memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); + this_task->user = INIT_USER; + + spin_unlock(&runqueue_lock); + write_unlock_irq(&tasklist_lock); +} + /* * Put all the gunge required to become a kernel thread without * attached user resources in one place where it belongs. diff --git a/mm/filemap.c b/mm/filemap.c index b45cffa39685..a3cd1c4eaf47 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -681,34 +681,6 @@ struct page * __find_get_page(struct address_space *mapping, return page; } -/* - * Find a swapcache page (and get a reference) or return NULL. - * The SwapCache check is protected by the pagecache lock. - */ -struct page * __find_get_swapcache_page(struct address_space *mapping, - unsigned long offset, struct page **hash) -{ - struct page *page; - - /* - * We need the LRU lock to protect against page_launder(). 
- */ - - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); - if (page) { - spin_lock(&pagemap_lru_lock); - if (PageSwapCache(page)) - page_cache_get(page); - else - page = NULL; - spin_unlock(&pagemap_lru_lock); - } - spin_unlock(&pagecache_lock); - - return page; -} - /* * Same as the above, but lock the page too, verifying that * it's still valid once we own it. diff --git a/mm/memory.c b/mm/memory.c index 00c74f7ff31e..8dc197c05e4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -148,6 +148,9 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) * * 08Jan98 Merged into one routine from several inline routines to reduce * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -159,8 +162,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; - - spin_lock(&dst->page_table_lock); + for (;;) { pmd_t * src_pmd, * dst_pmd; @@ -234,6 +236,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(ptepage); + dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); cont_copy_pte_range_noset: address += PAGE_SIZE; @@ -251,11 +254,8 @@ cont_copy_pmd_range: src_pmd++; out_unlock: spin_unlock(&src->page_table_lock); out: - spin_unlock(&dst->page_table_lock); return 0; - nomem: - spin_unlock(&dst->page_table_lock); return -ENOMEM; } @@ -999,7 +999,6 @@ static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) flush_tlb_range(mm, start, end); } while ((mpnt = mpnt->vm_next_share) != NULL); } - /* * Handle all mappings that got truncated by a "truncate()" @@ -1057,8 +1056,6 @@ out: return; } - - /* * Primitive swap readahead code. 
We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen @@ -1072,23 +1069,19 @@ void swapin_readahead(swp_entry_t entry) unsigned long offset; /* - * Get the number of handles we should do readahead io to. Also, - * grab temporary references on them, releasing them as io completes. + * Get the number of handles we should do readahead io to. */ num = valid_swaphandles(entry, &offset); for (i = 0; i < num; offset++, i++) { /* Don't block on I/O for read-ahead */ - if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster - * (1 << page_cluster)) { - while (i++ < num) - swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++)); + if (atomic_read(&nr_async_pages) >= + pager_daemon.swap_cluster << page_cluster) break; - } /* Ok, do the async read-ahead now */ new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); - if (new_page != NULL) - page_cache_release(new_page); - swap_free(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); } return; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 2263a3189a5d..bbfb11189571 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -29,7 +29,7 @@ static int swap_writepage(struct page *page) if (swap_count(page) > 1) goto in_use; - /* We could remove it here, but page_launder will do it anyway */ + delete_from_swap_cache_nolock(page); UnlockPage(page); return 0; @@ -79,40 +79,35 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry) BUG(); if (page->mapping) BUG(); - flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1)); + + /* clear PG_dirty so a subsequent set_page_dirty takes effect */ + flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_uptodate); page->age = PAGE_AGE_START; add_to_page_cache_locked(page, &swapper_space, entry.val); } -static inline void remove_from_swap_cache(struct page *page) -{ - struct address_space *mapping = page->mapping; - - if 
(mapping != &swapper_space) - BUG(); - if (!PageSwapCache(page) || !PageLocked(page)) - PAGE_BUG(page); - - PageClearSwapCache(page); - ClearPageDirty(page); - __remove_inode_page(page); -} - /* * This must be called only on pages that have * been verified to be in the swap cache. */ void __delete_from_swap_cache(struct page *page) { + struct address_space *mapping = page->mapping; swp_entry_t entry; - entry.val = page->index; - #ifdef SWAP_CACHE_INFO swap_cache_del_total++; #endif - remove_from_swap_cache(page); + if (mapping != &swapper_space) + BUG(); + if (!PageSwapCache(page) || !PageLocked(page)) + BUG(); + + entry.val = page->index; + PageClearSwapCache(page); + ClearPageDirty(page); + __remove_inode_page(page); swap_free(entry); } @@ -129,7 +124,6 @@ void delete_from_swap_cache_nolock(struct page *page) lru_cache_del(page); spin_lock(&pagecache_lock); - ClearPageDirty(page); __delete_from_swap_cache(page); spin_unlock(&pagecache_lock); page_cache_release(page); @@ -169,14 +163,12 @@ void free_page_and_swap_cache(struct page *page) page_cache_release(page); } - /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ - struct page * lookup_swap_cache(swp_entry_t entry) { struct page *found; @@ -184,59 +176,62 @@ struct page * lookup_swap_cache(swp_entry_t entry) #ifdef SWAP_CACHE_INFO swap_cache_find_total++; #endif - while (1) { - /* - * Right now the pagecache is 32-bit only. But it's a 32 bit index. 
=) - */ - found = find_get_swapcache_page(&swapper_space, entry.val); - if (!found) - return 0; - if (!PageSwapCache(found)) - BUG(); - if (found->mapping != &swapper_space) - BUG(); + found = find_get_page(&swapper_space, entry.val); + /* + * Unsafe to assert PageSwapCache and mapping on page found: + * if SMP nothing prevents swapoff from deleting this page from + * the swap cache at this moment. find_lock_page would prevent + * that, but no need to change: we _have_ got the right page. + */ #ifdef SWAP_CACHE_INFO + if (found) swap_cache_find_success++; #endif - return found; - } + return found; } /* * Locate a page of swap in physical memory, reserving swap cache space - * and reading the disk if it is not already cached. If wait==0, we are - * only doing readahead, so don't worry if the page is already locked. - * + * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ - struct page * read_swap_cache_async(swp_entry_t entry) { - struct page *found_page = 0, *new_page; + struct page *found_page, *new_page; + struct page **hash; /* - * Make sure the swap entry is still in use. + * Look for the page in the swap cache. Since we normally call + * this only after lookup_swap_cache() failed, re-calling that + * would confuse the statistics: use __find_get_page() directly. */ - if (!swap_duplicate(entry)) /* Account for the swap cache */ - goto out; - /* - * Look for the page in the swap cache. - */ - found_page = lookup_swap_cache(entry); + hash = page_hash(&swapper_space, entry.val); + found_page = __find_get_page(&swapper_space, entry.val, hash); if (found_page) - goto out_free_swap; + goto out; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) - goto out_free_swap; /* Out of memory */ + goto out; /* Out of memory */ /* * Check the swap cache again, in case we stalled above. 
+ * The BKL is guarding against races between this check + * and where the new page is added to the swap cache below. */ - found_page = lookup_swap_cache(entry); + found_page = __find_get_page(&swapper_space, entry.val, hash); if (found_page) goto out_free_page; + + /* + * Make sure the swap entry is still in use. It could have gone + * while caller waited for BKL, or while allocating page above, + * or while allocating page in prior call via swapin_readahead. + */ + if (!swap_duplicate(entry)) /* Account for the swap cache */ + goto out_free_page; + /* * Add it to the swap cache and read its contents. */ @@ -248,8 +243,6 @@ struct page * read_swap_cache_async(swp_entry_t entry) out_free_page: page_cache_release(new_page); -out_free_swap: - swap_free(entry); out: return found_page; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 32ecbd66746d..4798c544a019 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -20,6 +20,12 @@ spinlock_t swaplock = SPIN_LOCK_UNLOCKED; unsigned int nr_swapfiles; int total_swap_pages; +static int swap_overflow; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; @@ -202,21 +208,21 @@ out: return; bad_nofile: - printk("swap_free: Trying to free nonexistent swap-page\n"); + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); goto out; bad_device: - printk("swap_free: Trying to free swap from unused swap-device\n"); + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_offset: - printk("swap_free: offset exceeds max\n"); + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_free: - printk("VM: Bad swap entry %08lx\n", entry.val); + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_count: swap_device_unlock(p); 
swap_list_unlock(); - printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]); + printk(KERN_ERR "swap_free: Bad count %hd current count %hd\n", count, p->swap_map[offset]); goto out; } @@ -229,33 +235,23 @@ bad_count: * share this swap entry, so be cautious and let do_wp_page work out * what to do if a write is requested later. */ -/* tasklist_lock and vma->vm_mm->page_table_lock are held */ +/* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page* page) { pte_t pte = *dir; - if (pte_none(pte)) - return; - if (pte_present(pte)) { - /* If this entry is swap-cached, then page must already - hold the right address for any copies in physical - memory */ - if (pte_page(pte) != page) - return; - /* We will be removing the swap cache in a moment, so... */ - ptep_mkdirty(dir); - return; - } if (pte_to_swp_entry(pte).val != entry.val) return; - set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - swap_free(entry); + if (pte_none(pte) || pte_present(pte)) + return; get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); ++vma->vm_mm->rss; } -/* tasklist_lock and vma->vm_mm->page_table_lock are held */ +/* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */ static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, swp_entry_t entry, struct page* page) @@ -283,7 +279,7 @@ static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, } while (address && (address < end)); } -/* tasklist_lock and vma->vm_mm->page_table_lock are held */ +/* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */ static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, swp_entry_t entry, struct page* page) @@ -314,7 +310,7 @@ static inline 
void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, } while (address && (address < end)); } -/* tasklist_lock and vma->vm_mm->page_table_lock are held */ +/* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */ static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, swp_entry_t entry, struct page* page) { @@ -337,8 +333,6 @@ static void unuse_process(struct mm_struct * mm, /* * Go through process' page directory. */ - if (!mm) - return; spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); @@ -349,53 +343,42 @@ static void unuse_process(struct mm_struct * mm, } /* - * this is called when we find a page in the swap list - * all the locks have been dropped at this point which - * isn't a problem because we rescan the swap map - * and we _don't_ clear the refrence count if for - * some reason it isn't 0 + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. */ - -static inline int free_found_swap_entry(unsigned int type, int i) +static int find_next_to_unuse(struct swap_info_struct *si, int prev) { - struct task_struct *p; - struct page *page; - swp_entry_t entry; - - entry = SWP_ENTRY(type, i); + int max = si->max; + int i = prev; + int count; - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - page = read_swap_cache_async(entry); - if (!page) { - swap_free(entry); - return -ENOMEM; - } - lock_page(page); - if (PageSwapCache(page)) - delete_from_swap_cache_nolock(page); - UnlockPage(page); - read_lock(&tasklist_lock); - for_each_task(p) - unuse_process(p->mm, entry, page); - read_unlock(&tasklist_lock); - shmem_unuse(entry, page); - /* - * Now get rid of the extra reference to the temporary - * page we've been using. 
- */ - page_cache_release(page); /* - * Check for and clear any overflowed swap map counts. + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). */ - swap_free(entry); - return 0; + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. + */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; } - /* * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary @@ -404,80 +387,175 @@ static inline int free_found_swap_entry(unsigned int type, int i) static int try_to_unuse(unsigned int type) { struct swap_info_struct * si = &swap_info[type]; - int ret, foundpage; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; - do { - int i; + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. 
+ */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); - /* - * The algorithm is inefficient but seldomly used - * - * Find a swap page in use and read it in. + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. */ - foundpage = 0; - swap_device_lock(si); - for (i = 1; i < si->max ; i++) { - int count = si->swap_map[i]; - if (!count || count == SWAP_MAP_BAD) - continue; - + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { /* - * Prevent swaphandle from being completely - * unused by swap_free while we are trying - * to read in the page - this prevents warning - * messages from rw_swap_page_base. + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. */ - foundpage = 1; - if (count != SWAP_MAP_MAX) - si->swap_map[i] = count + 1; + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } - swap_device_unlock(si); - ret = free_found_swap_entry(type,i); - if (ret) - return ret; + /* + * Don't hold on to start_mm if it looks like exiting. 
+ * Can mmput ever block? if so, then we cannot risk + * it between deleting the page from the swap cache, + * and completing the search through mms (and cannot + * use it to avoid the long hold on mmlist_lock there). + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } - /* - * we pick up the swap_list_lock() to guard the nr_swap_pages, - * si->swap_map[] should only be changed if it is SWAP_MAP_MAX - * otherwise ugly stuff can happen with other people who are in - * the middle of a swap operation to this device. This kind of - * operation can sometimes be detected with the undead swap - * check. Don't worry about these 'undead' entries for now - * they will be caught the next time though the top loop. - * Do worry, about the weak locking that allows this to happen - * because if it happens to a page that is SWAP_MAP_MAX - * then bad stuff can happen. - */ - swap_list_lock(); - swap_device_lock(si); - if (si->swap_map[i] > 0) { - /* normally this would just kill the swap page if - * it still existed, it appears though that the locks - * are a little fuzzy - */ - if (si->swap_map[i] != SWAP_MAP_MAX) { - printk("VM: Undead swap entry %08lx\n", - SWP_ENTRY(type, i).val); - } else { - nr_swap_pages++; - si->swap_map[i] = 0; + /* + * Wait for and lock page. Remove it from swap cache + * so try_to_swap_out won't bump swap count. Mark dirty + * so try_to_swap_out will preserve it without us having + * to mark any present ptes as dirty: so we can skip + * searching processes once swap count has all gone. + */ + lock_page(page); + if (PageSwapCache(page)) + delete_from_swap_cache_nolock(page); + SetPageDirty(page); + UnlockPage(page); + flush_page_to_ram(page); + + /* + * Remove all references to entry, without blocking. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. 
+ */ + swcount = *swap_map; + if (swcount) { + if (start_mm == &init_mm) + shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map && (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + shmem_unuse(entry, page); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; } } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + page_cache_release(page); + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 0; swap_device_unlock(si); swap_list_unlock(); + reset_overflow = 1; + } - /* - * This lock stuff is ulgy! - * Make sure that we aren't completely killing - * interactive performance. - */ - if (current->need_resched) - schedule(); - swap_device_lock(si); + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? 
+ */ + if (current->need_resched) + schedule(); + else { + unlock_kernel(); + lock_kernel(); } - swap_device_unlock(si); - } while (foundpage); - return 0; + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; } asmlinkage long sys_swapoff(const char * specialfile) @@ -528,8 +606,8 @@ asmlinkage long sys_swapoff(const char * specialfile) } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; - swap_list_unlock(); p->flags = SWP_USED; + swap_list_unlock(); err = try_to_unuse(type); if (err) { /* re-insert swap space back into swap_list */ @@ -544,8 +622,8 @@ asmlinkage long sys_swapoff(const char * specialfile) swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; total_swap_pages += p->pages; - swap_list_unlock(); p->flags = SWP_WRITEOK; + swap_list_unlock(); goto out_dput; } if (p->swap_device) @@ -557,6 +635,7 @@ asmlinkage long sys_swapoff(const char * specialfile) nd.mnt = p->swap_vfsmnt; p->swap_vfsmnt = NULL; p->swap_device = 0; + p->max = 0; vfree(p->swap_map); p->swap_map = NULL; p->flags = 0; @@ -637,7 +716,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) union swap_header *swap_header = 0; int swap_header_version; int nr_good_pages = 0; - unsigned long maxpages; + unsigned long maxpages = 1; int swapfilesize; struct block_device *bdev = NULL; @@ -662,7 +741,6 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) p->highest_bit = 0; p->cluster_nr = 0; p->sdev_lock = SPIN_LOCK_UNLOCKED; - p->max = 1; p->next = -1; if (swap_flags & SWAP_FLAG_PREFER) { p->prio = @@ -752,17 +830,17 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) if (!p->lowest_bit) p->lowest_bit = i; p->highest_bit = i; - p->max = i+1; + maxpages = i+1; j++; } } nr_good_pages = j; - p->swap_map = vmalloc(p->max * sizeof(short)); + p->swap_map = vmalloc(maxpages * sizeof(short)); if (!p->swap_map) { error = 
-ENOMEM; goto bad_swap; } - for (i = 1 ; i < p->max ; i++) { + for (i = 1 ; i < maxpages ; i++) { if (test_bit(i,(char *) swap_header)) p->swap_map[i] = 0; else @@ -783,24 +861,22 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) p->lowest_bit = 1; p->highest_bit = swap_header->info.last_page - 1; - p->max = swap_header->info.last_page; - - maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)); - if (p->max >= maxpages) - p->max = maxpages-1; + maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; error = -EINVAL; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) { + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, p->max * sizeof(short)); + memset(p->swap_map, 0, maxpages * sizeof(short)); for (i=0; i<swap_header->info.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) @@ -815,7 +891,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } - if (swapfilesize && p->max > swapfilesize) { + if (swapfilesize && maxpages > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; @@ -827,6 +903,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } p->swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; p->flags = SWP_WRITEOK; p->pages = nr_good_pages; swap_list_lock(); @@ -903,7 +980,6 @@ void si_swapinfo(struct sysinfo *val) /* * Verify that a swap entry is valid and increment its swap map count. - * Kernel_lock is held, which guarantees existance of swap device. * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff.
@@ -933,9 +1009,8 @@ int swap_duplicate(swp_entry_t entry) if (p->swap_map[offset] < SWAP_MAP_MAX) p->swap_map[offset]++; else { - static int overflow = 0; - if (overflow++ < 5) - printk("VM: swap entry overflow\n"); + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); p->swap_map[offset] = SWAP_MAP_MAX; } swap_device_unlock(p); @@ -944,13 +1019,13 @@ out: return result; bad_file: - printk("Bad swap file entry %08lx\n", entry.val); + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; bad_offset: - printk("Bad swap offset entry %08lx\n", entry.val); + /* Don't report: can happen in read_swap_cache_async after swapoff */ goto out; bad_unused: - printk("Unused swap offset entry in swap_dup %08lx\n", entry.val); + /* Don't report: can happen in read_swap_cache_async after blocking */ goto out; } @@ -985,13 +1060,13 @@ bad_entry: printk(KERN_ERR "swap_count: null entry!\n"); goto out; bad_file: - printk("Bad swap file entry %08lx\n", entry.val); + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); goto out; bad_offset: - printk("Bad swap offset entry %08lx\n", entry.val); + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); goto out; bad_unused: - printk("Unused swap offset entry in swap_count %08lx\n", entry.val); + printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); goto out; } @@ -1006,23 +1081,22 @@ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, type = SWP_TYPE(entry); if (type >= nr_swapfiles) { - printk("Internal error: bad swap-device\n"); + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); return; } p = &swap_info[type]; *offset = SWP_OFFSET(entry); - if (*offset >= p->max) { - printk("rw_swap_page: weirdness\n"); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); return; } if (p->swap_map && !p->swap_map[*offset]) { - printk("VM: Bad swap entry %08lx\n", entry.val); + 
printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); return; } if (!(p->flags & SWP_USED)) { - printk(KERN_ERR "rw_swap_page: " - "Trying to swap to unused swap-device\n"); + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); return; } @@ -1037,8 +1111,8 @@ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, } /* - * Kernel_lock protects against swap device deletion. Grab an extra - * reference on the swaphandle so that it dos not become unused. + * Kernel_lock protects against swap device deletion. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { @@ -1049,7 +1123,6 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) *offset = SWP_OFFSET(entry); toff = *offset = (*offset >> page_cluster) << page_cluster; - swap_device_lock(swapdev); do { /* Don't read-ahead past the end of the swap area */ if (toff >= swapdev->max) @@ -1059,10 +1132,8 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) break; if (swapdev->swap_map[toff] == SWAP_MAP_BAD) break; - swapdev->swap_map[toff]++; toff++; ret++; } while (--i); - swap_device_unlock(swapdev); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e6fed11b1e07..0c68f9ee8f3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,8 +32,6 @@ */ #define DEF_PRIORITY (6) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) - static inline void age_page_up(struct page *page) { unsigned age = page->age + PAGE_AGE_ADV; @@ -111,6 +109,7 @@ static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, u * is needed on CPUs which update the accessed and dirty * bits in hardware. */ + flush_cache_page(vma, address); pte = ptep_get_and_clear(page_table); flush_tlb_page(vma, address); @@ -138,7 +137,8 @@ drop_pte: /* * Is it a clean page? Then it must be recoverable * by just paging it in again, and we can just drop - * it.. + * it.. 
or if it's dirty but has backing store, + * just mark the page dirty and drop it. * * However, this won't actually free any real * memory, as the page will just be in the page cache @@ -148,20 +148,17 @@ drop_pte: * Basically, this just makes it possible for us to do * some real work in the future in "refill_inactive()". */ - flush_cache_page(vma, address); - if (!pte_dirty(pte)) + if (page->mapping) { + if (pte_dirty(pte)) + set_page_dirty(page); goto drop_pte; - + } /* - * Ok, it's really dirty. That means that - * we should either create a new swap cache - * entry for it, or we should write it back - * to its own backing store. + * Check PageDirty as well as pte_dirty: page may + * have been brought back from swap by swapoff. */ - if (page->mapping) { - set_page_dirty(page); + if (!pte_dirty(pte) && !PageDirty(page)) goto drop_pte; - } /* * This is a dirty, swappable page. First of all, @@ -334,6 +331,9 @@ static inline int swap_amount(struct mm_struct *mm) return nr; } +/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ +struct mm_struct *swap_mm = &init_mm; + static void swap_out(unsigned int priority, int gfp_mask) { int counter; @@ -347,17 +347,15 @@ static void swap_out(unsigned int priority, int gfp_mask) /* Then, look at the other mm's */ counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority; do { - struct list_head *p; - spin_lock(&mmlist_lock); - p = init_mm.mmlist.next; - if (p == &init_mm.mmlist) - goto empty; - - /* Move it to the back of the queue.. */ - list_del(p); - list_add_tail(p, &init_mm.mmlist); - mm = list_entry(p, struct mm_struct, mmlist); + mm = swap_mm; + if (mm == &init_mm) { + mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); + if (mm == &init_mm) + goto empty; + } + /* Set pointer for next call to next in the list */ + swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); /* Make sure the mm doesn't disappear when we drop the lock.. 
*/ atomic_inc(&mm->mm_users); @@ -539,8 +537,12 @@ int page_launder(int gfp_mask, int sync) * last copy.. */ if (PageDirty(page)) { - int (*writepage)(struct page *) = page->mapping->a_ops->writepage; + int (*writepage)(struct page *); + /* Can a page get here without page->mapping? */ + if (!page->mapping) + goto page_active; + writepage = page->mapping->a_ops->writepage; if (!writepage) goto page_active; @@ -779,7 +781,7 @@ int inactive_shortage(void) { pg_data_t *pgdat; unsigned int global_target = freepages.high + inactive_target; - unsigned int global_incative = 0; + unsigned int global_inactive = 0; pgdat = pgdat_list; do { @@ -799,13 +801,13 @@ int inactive_shortage(void) if (inactive < zone->pages_high) return 1; - global_incative += inactive; + global_inactive += inactive; } pgdat = pgdat->node_next; } while (pgdat); /* Global shortage? */ - return global_incative < global_target; + return global_inactive < global_target; } /* diff --git a/scripts/ver_linux b/scripts/ver_linux index a5a459f0b1d1..fa151baf940a 100644 --- a/scripts/ver_linux +++ b/scripts/ver_linux @@ -66,5 +66,7 @@ loadkeys -V 2>&1 | awk \ expr --v 2>&1 | awk 'NR==1{print "Sh-utils ", $NF}' -X=`cat /proc/modules | sed -e "s/ .*$//"` -echo "Modules Loaded "$X +if [ -e /proc/modules ]; then + X=`cat /proc/modules | sed -e "s/ .*$//"` + echo "Modules Loaded "$X +fi -- 2.39.5