From 9d11a5176cc5b9609542b1bd5a827b8618efe681 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:14:53 -0500 Subject: [PATCH] I just put a pre-90 on ftp.kernel.org, and I'm happy to report that Davem seems to have found and fixed the TCP performance problem, which means that the code-freeze for 2.2 is going to go into effect shortly. pre-90 does a few other minor things, like for example getting rid of kerneld because the new kmod thing is a lot simpler in many ways. Let's see what the reaction to that is, but I'm fairly certain that this was a major good thing: I've personally never liked kerneld, but kmod seems to be a much nicer and more controlled way of handling the same issues that kerneld tried to do. I'd actually almost be willing to use the thing myself, something that was never true of kerneld. This also moves the WD7000 SCSI driver to a working status again, thanks to Miroslav Zagorac. But the interesting and important parts of the patches are the networking fixes from David and Bill Hawes. 
Linus --- CREDITS | 11 +- Documentation/Configure.help | 21 +- Documentation/kmod.txt | 47 + Documentation/modules.txt | 6 + MAINTAINERS | 6 + Makefile | 5 +- arch/alpha/config.in | 2 +- arch/alpha/defconfig | 2 +- arch/alpha/kernel/process.c | 1 + arch/arm/kernel/init_task.c | 1 + arch/i386/config.in | 2 +- arch/i386/defconfig | 4 +- arch/i386/kernel/init_task.c | 1 + arch/m68k/config.in | 2 +- arch/m68k/defconfig | 2 +- arch/m68k/kernel/process.c | 1 + arch/mips/config.in | 2 +- arch/mips/kernel/init_task.c | 1 + arch/ppc/config.in | 2 +- arch/ppc/defconfig | 2 +- arch/ppc/kernel/process.c | 1 + arch/sparc/config.in | 2 +- arch/sparc/defconfig | 2 +- arch/sparc/kernel/init_task.c | 1 + arch/sparc64/config.in | 2 +- arch/sparc64/kernel/init_task.c | 1 + drivers/block/ide-disk.c | 16 +- drivers/block/ide.c | 14 +- drivers/block/md.c | 9 +- drivers/block/rd.c | 41 +- drivers/char/ftape/zftape/zftape-ctl.c | 3 - drivers/char/ftape/zftape/zftape-init.c | 6 +- drivers/char/ftape/zftape/zftape-read.c | 3 - drivers/char/ftape/zftape/zftape-write.c | 3 - drivers/char/lp_m68k.c | 6 +- drivers/char/misc.c | 9 +- drivers/char/tty_io.c | 7 +- drivers/misc/parport_init.c | 1 - drivers/misc/parport_share.c | 8 +- drivers/net/ppp.c | 9 +- drivers/net/slip.c | 6 +- drivers/scsi/ppa.c | 6 +- drivers/scsi/scsi.c | 7 +- drivers/scsi/scsi_error.c | 4 - drivers/scsi/scsi_obsolete.c | 8 - drivers/scsi/scsi_queue.c | 4 - drivers/scsi/wd7000.c | 1552 +++++++++++++--------- drivers/scsi/wd7000.h | 50 +- drivers/sound/dev_table.c | 3 - drivers/sound/dmabuf.c | 19 +- drivers/sound/sound_calls.h | 1 - drivers/video/fbcon.c | 14 +- fs/devices.c | 13 +- fs/exec.c | 7 +- fs/fcntl.c | 39 +- fs/filesystems.c | 8 +- fs/ncpfs/dir.c | 2 +- fs/ncpfs/inode.c | 3 - fs/nfs/inode.c | 21 + fs/nfs/nfs2xdr.c | 65 +- fs/nls/nls_base.c | 8 +- fs/ntfs/fs.c | 10 +- fs/open.c | 35 +- fs/proc/root.c | 6 +- fs/super.c | 10 +- include/linux/file.h | 44 +- include/linux/kerneld.h | 135 -- include/linux/kmod.h | 4 + 
include/linux/module.h | 2 +- include/linux/mroute.h | 2 +- include/linux/netdevice.h | 6 +- include/linux/nfs_fs.h | 3 + include/linux/rtnetlink.h | 2 +- include/linux/sched.h | 32 +- include/linux/socket.h | 2 + include/linux/sunrpc/clnt.h | 5 +- include/linux/sysctl.h | 5 +- include/net/dst.h | 2 + include/net/ip6_route.h | 2 +- include/net/ipv6.h | 2 +- include/net/route.h | 1 + include/net/sock.h | 194 ++- include/net/tcp.h | 257 ++-- init/main.c | 11 +- ipc/Makefile | 4 - ipc/msg.c | 307 +---- ipc/util.c | 5 +- kernel/Makefile | 4 + kernel/exit.c | 11 +- kernel/fork.c | 42 +- kernel/kmod.c | 149 +++ kernel/ksyms.c | 11 +- kernel/module.c | 2 +- kernel/sched.c | 19 +- kernel/sysctl.c | 10 + mm/slab.c | 6 +- mm/swap_state.c | 10 +- mm/vmscan.c | 10 +- net/802/sysctl_net_802.c | 2 + net/802/tr.c | 18 +- net/appletalk/ddp.c | 2 +- net/ax25/af_ax25.c | 4 +- net/core/dev.c | 83 +- net/core/iovec.c | 33 +- net/core/neighbour.c | 22 +- net/core/sock.c | 178 ++- net/core/sysctl_net_core.c | 4 + net/ipv4/af_inet.c | 15 +- net/ipv4/arp.c | 8 +- net/ipv4/devinet.c | 38 +- net/ipv4/fib_frontend.c | 24 +- net/ipv4/fib_hash.c | 2 +- net/ipv4/fib_rules.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/icmp.c | 8 +- net/ipv4/igmp.c | 2 +- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_fw.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ip_masq_mod.c | 10 +- net/ipv4/ip_masq_raudio.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipconfig.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 16 +- net/ipv4/proc.c | 6 +- net/ipv4/rarp.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 348 +++-- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp.c | 128 +- net/ipv4/tcp_input.c | 384 ++---- net/ipv4/tcp_ipv4.c | 670 +++++----- net/ipv4/tcp_output.c | 169 +-- net/ipv4/tcp_timer.c | 55 +- net/ipv4/timer.c | 5 +- net/ipv4/udp.c | 14 +- net/ipv6/addrconf.c | 4 +- net/ipv6/af_inet6.c | 7 +- 
net/ipv6/exthdrs.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_fib.c | 2 +- net/ipv6/ip6_fw.c | 2 +- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/ndisc.c | 2 +- net/ipv6/proc.c | 6 +- net/ipv6/raw.c | 2 +- net/ipv6/reassembly.c | 2 +- net/ipv6/route.c | 36 +- net/ipv6/sit.c | 2 +- net/ipv6/tcp_ipv6.c | 292 ++-- net/ipv6/udp.c | 39 +- net/ipx/af_ipx.c | 2 +- net/netbeui/af_netbeui.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/netsyms.c | 3 + net/packet/af_packet.c | 5 +- net/rose/af_rose.c | 2 +- net/socket.c | 10 +- net/unix/af_unix.c | 2 +- net/x25/af_x25.c | 2 +- 168 files changed, 3274 insertions(+), 2911 deletions(-) create mode 100644 Documentation/kmod.txt delete mode 100644 include/linux/kerneld.h create mode 100644 include/linux/kmod.h create mode 100644 kernel/kmod.c diff --git a/CREDITS b/CREDITS index 4014526e4434..8ed3dc55278e 100644 --- a/CREDITS +++ b/CREDITS @@ -62,9 +62,10 @@ S: USA N: Andrea Arcangeli E: arcangeli@mbox.queen.it -W: http://www-linux.deis.unibo.it/~mirror/ +W: http://www.cs.unibo.it/~arcangel/ P: 1024/CB4660B9 CC A0 71 81 F4 A0 63 AC C0 4B 81 1D 8C 15 C8 E5 -D: parport sharing fix. Various other kernel hacks. +D: Parport sharing hacker. +D: Various other kernel hacks. 
S: Via Ciaclini 26 S: Imola 40026 S: Italy @@ -395,6 +396,11 @@ S: Virginia Tech S: Blacksburg, Virginia 24061 S: USA +N: Cyrus Durgin +E: cider@speakeasy.org +W: http://www.speakeasy.org/~cider/ +D: implemented kmod + N: Torsten Duwe E: Torsten.Duwe@informatik.uni-erlangen.de D: Part-time kernel hacker @@ -1337,6 +1343,7 @@ S: Russia N: Kirk Petersen E: kirk@speakeasy.org W: http://www.speakeasy.org/~kirk/ +D: implemented kmod D: modularized BSD Unix domain sockets N: Kai Petzke diff --git a/Documentation/Configure.help b/Documentation/Configure.help index b9855d535c31..7a5098c7e32c 100644 --- a/Documentation/Configure.help +++ b/Documentation/Configure.help @@ -1326,21 +1326,12 @@ CONFIG_MODVERSIONS non-kernel sources, you would benefit from this option. Otherwise it's not that important. So, N ought to be a safe bet. -Kernel daemon support -CONFIG_KERNELD - Normally when you have selected some drivers and/or filesystems to - be created as loadable modules, you also have the responsibility to - load the corresponding module (via insmod/modprobe) before you can - use it. If you select Y here, the kernel will take care of this all - by itself, together with the user level daemon "kerneld". Note that - "kerneld" will also automatically unload all unused modules, so you - don't have to use "rmmod" either. kerneld will also provide support - for different user-level beeper and screen blanker programs later - on. The "kerneld" daemon is included in the modutils package (check - Documentation/Changes for latest version and location). You will - probably want to read the kerneld mini-HOWTO, available via ftp - (user: anonymous) from - sunsite.unc.edu:/pub/Linux/docs/HOWTO/mini. If unsure, say Y. +Kernel module loader support +CONFIG_KMOD + This feature allows the kernel to load modules for itself. When + a part of the kernel needs a module, it runs modprobe with the + appropriate arguments. Say Y here and read about configuring it + in Documentation/kmod.txt. 
(this is a replacement of kerneld) ARP daemon support (EXPERIMENTAL) CONFIG_ARPD diff --git a/Documentation/kmod.txt b/Documentation/kmod.txt new file mode 100644 index 000000000000..3deeeaed878c --- /dev/null +++ b/Documentation/kmod.txt @@ -0,0 +1,47 @@ +Kmod: The Kernel Module Loader +Kirk Petersen + +Kmod is a simple replacement for kerneld. It consists of a +request_module() replacement and a kernel thread called kmod. When the +kernel requests a module, the kmod wakes up and execve()s modprobe, +passing it the name that was requested. After a configurable period of +time, kmod will have delete_module() remove any unused modules. + +Kmod is configurable through two entries in /proc/sys/kernel. You can +set the path of modprobe (where the kernel looks for it) by doing: + + echo "/sbin/modprobe" > /proc/sys/kernel/modprobe + +To tell kmod when to unload unused modules, do something like: + + echo "120" > /proc/sys/kernel/kmod_unload_delay + +Kmod only loads and unloads modules. Kerneld could do more (although +nothing in the standard kernel used its other features). If you +require features such as request_route, we suggest that you take +a similar approach. A simple request_route function could be called, +and a kroute kernel thread could be sent off to do the work. But +we should probably keep this to a minimum. + +Kerneld also had a mechanism for storing device driver settings. This +can easily be done with modprobe. When a module is unloaded, modprobe +could look at a per-driver-configurable location (/proc/sys/drivers/blah) +for device driver settings and save them to a file. When a module +is loaded, simply cat that file back to that location in the proc +filesystem. Or perhaps a script could be a setting in /etc/modules.conf. +There are many user-land methods that will work (I prefer using /proc, +myself). + +If kerneld worked, why replace it? + +- kerneld used sysv ipc, which can now be made into a module. 
Besides, + sysv ipc is ugly and should therefore be avoided (well, certainly for + kernel level stuff) + +- both kmod and kerneld end up doing the same thing (calling modprobe), + so why not skip the middle man? + +- removing kerneld related stuff from ipc/msg.c made it 40% smaller + +- kmod reports errors through the normal kernel mechanisms, which avoids + the chicken and egg problem of kerneld and modular unix domain sockets diff --git a/Documentation/modules.txt b/Documentation/modules.txt index b637da0a40a7..ca9c434f716e 100644 --- a/Documentation/modules.txt +++ b/Documentation/modules.txt @@ -10,6 +10,12 @@ Some older modules packages aren't aware of some of the newer modular features that the kernel now supports. The current required version is listed in the file linux/Documentation/Changes. +* * * NOTE * * * +The kernel has been changed to remove kerneld support and use +the new kmod support. Keep this in mind when reading this file. Kmod +does the exact same thing as kerneld, but doesn't require an external +program (see Documentation/kmod.txt) + In the beginning... ------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 81bf2aeb42bc..90f0299866ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -604,6 +604,12 @@ P: Jean Tourrilhes M: jt@hplb.hpl.hp.com S: Maintained +WD7000 SCSI DRIVER +P: Miroslav Zagorac +M: zaga@fly.cc.fer.hr +L: linux-scsi@vger.rutgers.edu +S: Maintained + Z8530 DRIVER FOR AX.25 P: Joerg Reuter M: jreuter@poboxes.com diff --git a/Makefile b/Makefile index e0cf1b205929..aed7fbef0b99 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 2 PATCHLEVEL = 1 -SUBLEVEL = 89 +SUBLEVEL = 90 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/) @@ -9,7 +9,7 @@ ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/) # because it makes re-config very ugly and too many fundamental files depend # on "CONFIG_SMP" # -# NOTE! SMP is experimental. 
See the file Documentation/SMP.txt +# For UP operations COMMENT THIS OUT, simply setting SMP = 0 won't work # SMP = 1 @@ -262,6 +262,7 @@ include/linux/compile.h: $(CONFIGURATION) include/linux/version.h newversion include/linux/version.h: ./Makefile @echo \#define UTS_RELEASE \"$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)\" > .ver @echo \#define LINUX_VERSION_CODE `expr $(VERSION) \\* 65536 + $(PATCHLEVEL) \\* 256 + $(SUBLEVEL)` >> .ver + @echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))' >>.ver @mv -f .ver $@ init/version.o: init/version.c include/linux/compile.h diff --git a/arch/alpha/config.in b/arch/alpha/config.in index bda9ca13072a..1fefe0c6280f 100644 --- a/arch/alpha/config.in +++ b/arch/alpha/config.in @@ -23,7 +23,7 @@ bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then MODULES=y bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi endmenu diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 8c1a4f696fe6..1bdecf4449bd 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -12,7 +12,7 @@ # CONFIG_MODULES=y # CONFIG_MODVERSIONS is not set -# CONFIG_KERNELD is not set +# CONFIG_KMOD is not set # # General setup diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 6a90ecb02e10..5970d443aed9 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -50,6 +50,7 @@ unsigned long init_user_stack[1024] = { STACK_MAGIC, }; static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index 
acc206942ad1..90ae6952f7eb 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -6,6 +6,7 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/i386/config.in b/arch/i386/config.in index 3e52c2218bc6..0589fc9fe59e 100644 --- a/arch/i386/config.in +++ b/arch/i386/config.in @@ -24,7 +24,7 @@ comment 'Loadable module support' bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi endmenu diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 2ca6da6bc4fc..bd1bdf5eb172 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -21,7 +21,7 @@ CONFIG_M586=y # CONFIG_MODULES=y # CONFIG_MODVERSIONS is not set -# CONFIG_KERNELD is not set +# CONFIG_KMOD is not set # # General setup @@ -30,7 +30,6 @@ CONFIG_NET=y CONFIG_PCI=y CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y -# CONFIG_PCI_OPTIMIZE is not set CONFIG_PCI_OLD_PROC=y # CONFIG_MCA is not set CONFIG_SYSVIPC=y @@ -94,7 +93,6 @@ CONFIG_INET=y # CONFIG_IP_ADVANCED_ROUTER is not set # CONFIG_IP_PNP is not set # CONFIG_IP_ACCT is not set -# CONFIG_IP_MASQUERADE is not set # CONFIG_IP_ROUTER is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index acc206942ad1..90ae6952f7eb 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -6,6 +6,7 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; 
static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/m68k/config.in b/arch/m68k/config.in index 0b287bb9d40b..4b90b6b02a48 100644 --- a/arch/m68k/config.in +++ b/arch/m68k/config.in @@ -14,7 +14,7 @@ comment 'Loadable module support' bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi endmenu diff --git a/arch/m68k/defconfig b/arch/m68k/defconfig index e9fe2175b57f..6ce3991d4e3d 100644 --- a/arch/m68k/defconfig +++ b/arch/m68k/defconfig @@ -12,7 +12,7 @@ CONFIG_EXPERIMENTAL=y # # CONFIG_MODULES is not set # CONFIG_MODVERSIONS is not set -# CONFIG_KERNELD is not set +# CONFIG_KMOD is not set # # Platform-dependent setup diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 2318caf810d0..1f82ba8251bc 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -40,6 +40,7 @@ */ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/mips/config.in b/arch/mips/config.in index c66f3eb25f40..d0bf64f478dd 100644 --- a/arch/mips/config.in +++ b/arch/mips/config.in @@ -109,7 +109,7 @@ comment 'Loadable module support' bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. 
autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi source drivers/block/Config.in diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index cc0a19231f19..46b9a528764b 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -5,6 +5,7 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/ppc/config.in b/arch/ppc/config.in index 6a244a72ae52..72bab8928c58 100644 --- a/arch/ppc/config.in +++ b/arch/ppc/config.in @@ -32,7 +32,7 @@ bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. 
autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi define_bool CONFIG_PCI y diff --git a/arch/ppc/defconfig b/arch/ppc/defconfig index ae97abde660a..7e5caa71dfa0 100644 --- a/arch/ppc/defconfig +++ b/arch/ppc/defconfig @@ -19,7 +19,7 @@ CONFIG_COMMON=y CONFIG_EXPERIMENTAL=y CONFIG_MODULES=y CONFIG_MODVERSIONS=y -CONFIG_KERNELD=y +CONFIG_KMOD=y CONFIG_PCI=y # CONFIG_PCI_OPTIMIZE is not set CONFIG_PCI_OLD_PROC=y diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index b2a1478cb892..f42e6162ed65 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -64,6 +64,7 @@ task_top(struct task_struct *tsk) static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; diff --git a/arch/sparc/config.in b/arch/sparc/config.in index d7902e88694c..4a087fca2aeb 100644 --- a/arch/sparc/config.in +++ b/arch/sparc/config.in @@ -14,7 +14,7 @@ comment 'Loadable module support' bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. 
autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi endmenu diff --git a/arch/sparc/defconfig b/arch/sparc/defconfig index 5641663e534d..38b6096ccd11 100644 --- a/arch/sparc/defconfig +++ b/arch/sparc/defconfig @@ -12,7 +12,7 @@ CONFIG_EXPERIMENTAL=y # CONFIG_MODULES=y CONFIG_MODVERSIONS=y -CONFIG_KERNELD=y +CONFIG_KMOD=y # # General setup diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index d0fc09346eca..1829daeea8fe 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -6,6 +6,7 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in index 0bb079f4ebf4..4461cdea00ba 100644 --- a/arch/sparc64/config.in +++ b/arch/sparc64/config.in @@ -14,7 +14,7 @@ comment 'Loadable module support' bool 'Enable loadable module support' CONFIG_MODULES if [ "$CONFIG_MODULES" = "y" ]; then bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS - bool 'Kernel daemon support (e.g. 
autoload of modules)' CONFIG_KERNELD + bool 'Kernel module loader' CONFIG_KMOD fi endmenu diff --git a/arch/sparc64/kernel/init_task.c b/arch/sparc64/kernel/init_task.c index d0fc09346eca..1829daeea8fe 100644 --- a/arch/sparc64/kernel/init_task.c +++ b/arch/sparc64/kernel/init_task.c @@ -6,6 +6,7 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; +static struct files * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM; diff --git a/drivers/block/ide-disk.c b/drivers/block/ide-disk.c index 4ad8bf606fd2..8c72c791d41e 100644 --- a/drivers/block/ide-disk.c +++ b/drivers/block/ide-disk.c @@ -16,9 +16,11 @@ * Version 1.02 remove ", LBA" from drive identification msgs * Version 1.03 fix display of id->buf_size for big-endian * Version 1.04 add /proc configurable settings and S.M.A.R.T support + * Version 1.05 add capacity support for ATA3 >= 8GB + * Version 1.06 get boot-up messages to show full cyl count */ -#define IDEDISK_VERSION "1.04" +#define IDEDISK_VERSION "1.06" #undef REALLY_SLOW_IO /* most systems can safely undef this */ @@ -83,6 +85,11 @@ static int lba_capacity_is_ok (struct hd_driveid *id) unsigned long chs_sects = id->cyls * id->heads * id->sectors; unsigned long _10_percent = chs_sects / 10; + /* very large drives (8GB+) may lie about the number of cylinders */ + if (id->cyls == 16383 && id->heads == 16 && id->sectors == 63 && lba_sects > chs_sects) { + id->cyls = lba_sects / (16 * 63); /* correct cyls */ + return 1; /* lba_capacity is our only option */ + } /* perform a rough sanity check on lba_sects: within 10% is "okay" */ if ((lba_sects - chs_sects) < _10_percent) return 1; /* lba_capacity is good */ @@ -439,6 +446,7 @@ static unsigned long idedisk_capacity (ide_drive_t *drive) /* Determine capacity, and use LBA if the drive properly supports it */ if (id != NULL && 
(id->capability & 2) && lba_capacity_is_ok(id)) { if (id->lba_capacity >= capacity) { + drive->cyl = id->lba_capacity / (drive->head * drive->sect); capacity = id->lba_capacity; drive->select.b.lba = 1; } @@ -698,6 +706,10 @@ static void idedisk_setup (ide_drive_t *drive) drive->head = id->heads; drive->sect = id->sectors; } + + /* calculate drive capacity, and select LBA if possible */ + (void) idedisk_capacity (drive); + /* Correct the number of cyls if the bios value is too small */ if (drive->sect == drive->bios_sect && drive->head == drive->bios_head) { if (drive->cyl > drive->bios_cyl) @@ -706,8 +718,6 @@ static void idedisk_setup (ide_drive_t *drive) /* fix byte-ordering of buffer size field */ id->buf_size = le16_to_cpu(id->buf_size); - (void) idedisk_capacity (drive); /* initialize LBA selection */ - printk (KERN_INFO "%s: %.40s, %ldMB w/%dkB Cache, CHS=%d/%d/%d", drive->name, id->model, idedisk_capacity(drive)/2048L, id->buf_size/2, drive->bios_cyl, drive->bios_head, drive->bios_sect); diff --git a/drivers/block/ide.c b/drivers/block/ide.c index 7ec27d45d3ae..e10578508fda 100644 --- a/drivers/block/ide.c +++ b/drivers/block/ide.c @@ -134,9 +134,9 @@ #include "ide.h" #include "ide_modes.h" -#ifdef CONFIG_KERNELD -#include -#endif /* CONFIG_KERNELD */ +#ifdef CONFIG_KMOD +#include +#endif /* CONFIG_KMOD */ static const byte ide_hwif_to_major[] = {IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR}; @@ -1505,10 +1505,10 @@ static void ide_init_module (int type) module = module->next; } revalidate_drives(); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!found && type == IDE_PROBE_MODULE) (void) request_module("ide-probe"); -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ } static int ide_open(struct inode * inode, struct file * filp) @@ -1521,7 +1521,7 @@ static int ide_open(struct inode * inode, struct file * filp) MOD_INC_USE_COUNT; if (drive->driver == NULL) ide_init_module(IDE_DRIVER_MODULE); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (drive->driver 
== NULL) { if (drive->media == ide_disk) (void) request_module("ide-disk"); @@ -1532,7 +1532,7 @@ static int ide_open(struct inode * inode, struct file * filp) if (drive->media == ide_floppy) (void) request_module("ide-floppy"); } -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ while (drive->busy) sleep_on(&drive->wqueue); drive->usage++; diff --git a/drivers/block/md.c b/drivers/block/md.c index 4feeb0ce920f..4d0ccf401c67 100644 --- a/drivers/block/md.c +++ b/drivers/block/md.c @@ -12,6 +12,9 @@ RAID-1/RAID-5 extensions by: Ingo Molnar, Miguel de Icaza, Gadi Oxman + + Changes for kmod by: + Cyrus Durgin This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -43,8 +46,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include #include @@ -431,7 +434,7 @@ static int do_md_run (int minor, int repart) } if (!pers[pnum]) { -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD char module_name[80]; sprintf (module_name, "md-personality-%d", pnum); request_module (module_name); diff --git a/drivers/block/rd.c b/drivers/block/rd.c index de17158a7a39..59df50545a45 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -35,6 +35,9 @@ * * 4/25/96 : Made ramdisk size a parameter (default is now 4MB) * - Chad Page + * + * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98 + * */ #include @@ -344,7 +347,6 @@ identify_ramdisk_image(kdev_t device, struct file *fp, int start_block)) struct ext2_super_block *ext2sb; struct romfs_super_block *romfsb; int nblocks = -1; - int max_blocks; unsigned char *buf; buf = kmalloc(size, GFP_KERNEL); @@ -423,17 +425,6 @@ done: fp->f_op->llseek(fp, start_block * BLOCK_SIZE, 0); fp->f_pos = start_block * BLOCK_SIZE; - if ((nblocks > 0) && blk_size[MAJOR(device)]) { - max_blocks = blk_size[MAJOR(device)][MINOR(device)]; - max_blocks -= start_block; - if (nblocks > max_blocks) { - 
printk(KERN_NOTICE - "RAMDISK: Restricting filesystem size " - "from %d to %d blocks.\n", - nblocks, max_blocks); - nblocks = max_blocks; - } - } kfree(buf); return nblocks; } @@ -451,6 +442,7 @@ __initfunc(static void rd_load_image(kdev_t device,int offset)) int nblocks, i; char *buf; unsigned short rotate = 0; + unsigned short devblocks = 0; char rotator[4] = { '|' , '/' , '-' , '\\' }; ram_device = MKDEV(MAJOR_NR, 0); @@ -508,8 +500,31 @@ __initfunc(static void rd_load_image(kdev_t device,int offset)) goto done; } - printk(KERN_NOTICE "RAMDISK: Loading %d blocks into ram disk... ", nblocks); + if (blk_size[MAJOR(device)]) + devblocks = blk_size[MAJOR(device)][MINOR(device)]; + + if (devblocks == 0) { + printk(KERN_ERR "RAMDISK: could not determine device size\n"); + goto done; + } + + printk(KERN_NOTICE "RAMDISK: Loading %d blocks [%d disk(s)] into ram disk... ", nblocks, nblocks/devblocks+1); for (i=0; i < nblocks; i++) { + if (i && (i % devblocks == 0)) { + printk("done.\n"); + rotate = 0; + invalidate_buffers(device); + if (infile.f_op->release) + infile.f_op->release(&inode, &infile); + printk("Please insert disk #%d and press ENTER\n", i/devblocks+1); + wait_for_keypress(); + if (blkdev_open(&inode, &infile) != 0) { + printk("Error opening disk.\n"); + return; + } + infile.f_pos = 0; + printk("Loading disk #%d... 
", i/devblocks+1); + } infile.f_op->read(&infile, buf, BLOCK_SIZE, &infile.f_pos); outfile.f_op->write(&outfile, buf, BLOCK_SIZE, &outfile.f_pos); if (!(i % 16)) { diff --git a/drivers/char/ftape/zftape/zftape-ctl.c b/drivers/char/ftape/zftape/zftape-ctl.c index 19d10c95b66f..3b80c207f54e 100644 --- a/drivers/char/ftape/zftape/zftape-ctl.c +++ b/drivers/char/ftape/zftape/zftape-ctl.c @@ -29,9 +29,6 @@ #include #define __NO_VERSION__ #include -#ifdef CONFIG_KERNELD -#include -#endif #include #include diff --git a/drivers/char/ftape/zftape/zftape-init.c b/drivers/char/ftape/zftape/zftape-init.c index 0c8a6f9eb641..78a857ace737 100644 --- a/drivers/char/ftape/zftape/zftape-init.c +++ b/drivers/char/ftape/zftape/zftape-init.c @@ -30,8 +30,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include #include @@ -391,7 +391,7 @@ struct zft_cmpr_ops *zft_cmpr_unregister(void) int zft_cmpr_lock(int try_to_load) { if (zft_cmpr_ops == NULL) { -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (try_to_load) { request_module("zft-compressor"); if (zft_cmpr_ops == NULL) { diff --git a/drivers/char/ftape/zftape/zftape-read.c b/drivers/char/ftape/zftape/zftape-read.c index c7d319fac221..d0e756e54b8a 100644 --- a/drivers/char/ftape/zftape/zftape-read.c +++ b/drivers/char/ftape/zftape/zftape-read.c @@ -27,9 +27,6 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include -#endif #include diff --git a/drivers/char/ftape/zftape/zftape-write.c b/drivers/char/ftape/zftape/zftape-write.c index 46f1ecc09531..05e5239ec266 100644 --- a/drivers/char/ftape/zftape/zftape-write.c +++ b/drivers/char/ftape/zftape/zftape-write.c @@ -27,9 +27,6 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include -#endif #include diff --git a/drivers/char/lp_m68k.c b/drivers/char/lp_m68k.c index a36903c7a45f..4a000e9aa079 100644 --- a/drivers/char/lp_m68k.c +++ b/drivers/char/lp_m68k.c @@ -43,8 +43,8 @@ #include #include #include -#ifdef CONFIG_KERNELD 
-#include +#ifdef CONFIG_KMOD +#include #endif #ifdef CONFIG_AMIGA @@ -365,7 +365,7 @@ static int lp_open(struct inode *inode, struct file *file) if (dev >= MAX_LP) return -ENODEV; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!lp_table[dev]) { char modname[30]; diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 98b74551c363..751a84d7bdb4 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -26,6 +26,9 @@ * Idea by Jacques Gelinas , * adapted by Bjorn Ekwall * corrected by Alan Cox + * + * Changes for kmod (from kerneld): + Cyrus Durgin */ #include @@ -46,8 +49,8 @@ #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif /* @@ -107,7 +110,7 @@ static int misc_open(struct inode * inode, struct file * file) while ((c != &misc_list) && (c->minor != minor)) c = c->next; if (c == &misc_list) { -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD char modname[20]; sprintf(modname, "char-major-%d-%d", MISC_MAJOR, minor); request_module(modname); diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index dcc38830fdd7..075447a7dbda 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -87,8 +87,8 @@ #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #define CONSOLE_DEV MKDEV(TTY_MAJOR,0) @@ -216,8 +216,9 @@ static int tty_set_ldisc(struct tty_struct *tty, int ldisc) if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) return -EINVAL; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD /* Eduardo Blanco */ + /* Cyrus Durgin */ if (!(ldiscs[ldisc].flags & LDISC_FLAG_DEFINED)) { char modname [20]; sprintf(modname, "tty-ldisc-%d", ldisc); diff --git a/drivers/misc/parport_init.c b/drivers/misc/parport_init.c index 3fc222ff9c90..31753dc1a31a 100644 --- a/drivers/misc/parport_init.c +++ b/drivers/misc/parport_init.c @@ -17,7 +17,6 @@ #include #include #include -#include #ifndef MODULE static int io[PARPORT_MAX+1] __initdata = { [0 ... 
PARPORT_MAX] = 0 }; diff --git a/drivers/misc/parport_share.c b/drivers/misc/parport_share.c index ac0f15ed8a09..3446cf3944c3 100644 --- a/drivers/misc/parport_share.c +++ b/drivers/misc/parport_share.c @@ -28,8 +28,8 @@ #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #undef PARPORT_PARANOID @@ -44,14 +44,14 @@ void (*parport_probe_hook)(struct parport *port) = NULL; /* Return a list of all the ports we know about. */ struct parport *parport_enumerate(void) { -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (portlist == NULL) { request_module("parport_lowlevel"); #ifdef CONFIG_PNP_PARPORT_MODULE request_module("parport_probe"); #endif /* CONFIG_PNP_PARPORT_MODULE */ } -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ return portlist; } diff --git a/drivers/net/ppp.c b/drivers/net/ppp.c index 9380f32f8b25..829f79df6855 100644 --- a/drivers/net/ppp.c +++ b/drivers/net/ppp.c @@ -3,6 +3,7 @@ * Michael Callahan * Al Longyear * Paul Mackerras + * Cyrus Durgin (changes for kmod) * * Dynamic PPP devices by Jim Freeman . 
* ppp_tty_receive ``noisy-raise-bug'' fixed by Ove Ewerlid @@ -94,8 +95,8 @@ typedef struct sk_buff sk_buff; #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #ifndef PPP_IPX @@ -2190,14 +2191,14 @@ ppp_set_compression (struct ppp *ppp, struct ppp_option_data *odp) restore_flags(flags); cp = find_compressor (ccp_option[0]); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (cp == NULL) { char modname[32]; sprintf(modname, "ppp-compress-%d", ccp_option[0]); request_module(modname); cp = find_compressor(ccp_option[0]); } -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ if (cp == NULL) goto out_no_comp; diff --git a/drivers/net/slip.c b/drivers/net/slip.c index fe464965edd0..489f0a39e36c 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -181,11 +181,13 @@ sl_alloc_bufs(struct slip *sl, int mtu) sl->xleft = 0; rbuff = xchg(&sl->rbuff, rbuff); xbuff = xchg(&sl->xbuff, xbuff); -#ifdef CONFIG_SLIP_MODE_SLIP6 +#ifdef SL_INCLUDE_CSLIP cbuff = xchg(&sl->cbuff, cbuff); slcomp = xchg(&sl->slcomp, slcomp); +#ifdef CONFIG_SLIP_MODE_SLIP6 sl->xdata = 0; sl->xbits = 0; +#endif #endif end_bh_atomic(); err = 0; @@ -1134,7 +1136,7 @@ slip_ioctl(struct tty_struct *tty, void *file, int cmd, void *arg) it breaks my old poor gcc on alpha --ANK */ tmp = strlen(sl->dev->name) + 1; - if (copy_to_user(arg, sl->dev->name, tmp) < 0) + if (copy_to_user(arg, sl->dev->name, tmp)) return -EFAULT; return 0; diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c index 3c2f083bd564..ebdcb560c637 100644 --- a/drivers/scsi/ppa.c +++ b/drivers/scsi/ppa.c @@ -50,8 +50,8 @@ NULL, /* cur_cmd */ \ #include "ppa.h" #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #ifndef PARPORT_MODULES #define PARPORT_MODULES "parport_pc" #endif @@ -130,7 +130,7 @@ int ppa_detect(Scsi_Host_Template * host) nhosts = 0; try_again = 0; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!pb) { request_module(PARPORT_MODULES); pb = parport_enumerate(); diff --git 
a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 648d5037243d..44736b5fc8d0 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -23,6 +23,7 @@ * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/conf.modules) * Bjorn Ekwall + * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by @@ -57,8 +58,8 @@ #include "hosts.h" #include "constants.h" -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #undef USE_STATIC_SCSI_MEMORY @@ -3114,7 +3115,7 @@ int scsi_register_module(int module_type, void * ptr) /* Load upper level device handler of some kind */ case MODULE_SCSI_DEV: -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (scsi_hosts == NULL) request_module("scsi_hostadapter"); #endif diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f5e19c547244..cc259fd67c2e 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -38,10 +38,6 @@ #define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM)) -#ifdef CONFIG_KERNELD -#include -#endif - #ifdef DEBUG #define SENSE_TIMEOUT SCSI_TIMEOUT #define ABORT_TIMEOUT SCSI_TIMEOUT diff --git a/drivers/scsi/scsi_obsolete.c b/drivers/scsi/scsi_obsolete.c index 077b73063cd2..a49bf3816e10 100644 --- a/drivers/scsi/scsi_obsolete.c +++ b/drivers/scsi/scsi_obsolete.c @@ -20,10 +20,6 @@ * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer * - * Added request_module("scsi_hostadapter") for kerneld: - * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/conf.modules) - * Bjorn Ekwall - * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. 
Zubkoff @@ -70,10 +66,6 @@ #include "hosts.h" #include "constants.h" -#ifdef CONFIG_KERNELD -#include -#endif - #undef USE_STATIC_SCSI_MEMORY /* diff --git a/drivers/scsi/scsi_queue.c b/drivers/scsi/scsi_queue.c index b9e2a4febee3..46d3ea73abd8 100644 --- a/drivers/scsi/scsi_queue.c +++ b/drivers/scsi/scsi_queue.c @@ -38,10 +38,6 @@ #include "hosts.h" #include "constants.h" -#ifdef CONFIG_KERNELD -#include -#endif - /* * TODO: * 1) Prevent multiple traversals of list to look for commands to diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c index f8c89953526e..1b915e3ed89d 100644 --- a/drivers/scsi/wd7000.c +++ b/drivers/scsi/wd7000.c @@ -127,6 +127,19 @@ * * Thanks to Roger Scott for driver debugging. * + * 06/07/1997 + * + * Added support for /proc file system (/proc/scsi/wd7000/[0...] files). + * Now, driver can handle hard disks with capacity >1GB. + * + * 01/15/1998 + * + * Added support for BUS_ON and BUS_OFF parameters in config line. + * Miscellaneous cleanup. + * + * 03/01/1998 + * + * WD7000 driver now work on kernels >= 2.1.x */ #ifdef MODULE @@ -143,24 +156,29 @@ #include #include #include -#include #include #include #include +#include #include "scsi.h" #include "hosts.h" #include "sd.h" +#include #define ANY2SCSI_INLINE /* undef this to use old macros */ -#undef DEBUG +#undef WD7000_DEBUG /* general debug */ #include "wd7000.h" +#include -#include -struct proc_dir_entry proc_scsi_wd7000 = { - PROC_SCSI_7000FASST, 6, "wd7000", - S_IFDIR | S_IRUGO | S_IXUGO, 2 +struct proc_dir_entry proc_scsi_wd7000 = +{ + PROC_SCSI_7000FASST, + 6, + "wd7000", + S_IFDIR | S_IRUGO | S_IXUGO, + 2 }; @@ -186,52 +204,48 @@ struct proc_dir_entry proc_scsi_wd7000 = { * WD7000-specific mailbox structure * */ -typedef volatile struct mailbox{ - unchar status; - unchar scbptr[3]; /* SCSI-style - MSB first (big endian) */ +typedef volatile struct mailbox { + unchar status; + unchar scbptr[3]; /* SCSI-style - MSB first (big endian) */ } Mailbox; /* * This structure should 
contain all per-adapter global data. I.e., any * new global per-adapter data should put in here. - * */ typedef struct adapter { - struct Scsi_Host *sh; /* Pointer to Scsi_Host structure */ - int iobase; /* This adapter's I/O base address */ - int irq; /* This adapter's IRQ level */ - int dma; /* This adapter's DMA channel */ - struct { /* This adapter's mailboxes */ - Mailbox ogmb[OGMB_CNT]; /* Outgoing mailboxes */ - Mailbox icmb[ICMB_CNT]; /* Incoming mailboxes */ - } mb; - int next_ogmb; /* to reduce contention at mailboxes */ - unchar control; /* shadows CONTROL port value */ - unchar rev1, rev2; /* filled in by wd7000_revision */ + struct Scsi_Host *sh; /* Pointer to Scsi_Host structure */ + int iobase; /* This adapter's I/O base address */ + int irq; /* This adapter's IRQ level */ + int dma; /* This adapter's DMA channel */ + int int_counter; /* This adapter's interrupt counter */ + int bus_on; /* This adapter's BUS_ON time */ + int bus_off; /* This adapter's BUS_OFF time */ + struct { /* This adapter's mailboxes */ + Mailbox ogmb[OGMB_CNT]; /* Outgoing mailboxes */ + Mailbox icmb[ICMB_CNT]; /* Incoming mailboxes */ + } mb; + int next_ogmb; /* to reduce contention at mailboxes */ + unchar control; /* shadows CONTROL port value */ + unchar rev1, rev2; /* filled in by wd7000_revision */ } Adapter; -/* - * The following is set up by wd7000_detect, and used thereafter by - * wd7000_intr_handle to map the irq level to the corresponding Adapter. - * Note that if SA_INTERRUPT is not used, wd7000_intr_handle must be - * changed to pick up the IRQ level correctly. 
- */ -static Adapter *irq2host[NR_IRQS] = {NULL}; - /* * (linear) base address for ROM BIOS */ -static const long wd7000_biosaddr[] = { - 0xc0000, 0xc2000, 0xc4000, 0xc6000, 0xc8000, 0xca000, 0xcc000, 0xce000, - 0xd0000, 0xd2000, 0xd4000, 0xd6000, 0xd8000, 0xda000, 0xdc000, 0xde000 +static const long wd7000_biosaddr[] = +{ + 0xc0000, 0xc2000, 0xc4000, 0xc6000, 0xc8000, 0xca000, 0xcc000, 0xce000, + 0xd0000, 0xd2000, 0xd4000, 0xd6000, 0xd8000, 0xda000, 0xdc000, 0xde000 }; #define NUM_ADDRS (sizeof(wd7000_biosaddr)/sizeof(long)) -static const unsigned short wd7000_iobase[] = { - 0x0300, 0x0308, 0x0310, 0x0318, 0x0320, 0x0328, 0x0330, 0x0338, - 0x0340, 0x0348, 0x0350, 0x0358, 0x0360, 0x0368, 0x0370, 0x0378, - 0x0380, 0x0388, 0x0390, 0x0398, 0x03a0, 0x03a8, 0x03b0, 0x03b8, - 0x03c0, 0x03c8, 0x03d0, 0x03d8, 0x03e0, 0x03e8, 0x03f0, 0x03f8 +static const unsigned short wd7000_iobase[] = +{ + 0x0300, 0x0308, 0x0310, 0x0318, 0x0320, 0x0328, 0x0330, 0x0338, + 0x0340, 0x0348, 0x0350, 0x0358, 0x0360, 0x0368, 0x0370, 0x0378, + 0x0380, 0x0388, 0x0390, 0x0398, 0x03a0, 0x03a8, 0x03b0, 0x03b8, + 0x03c0, 0x03c8, 0x03d0, 0x03d8, 0x03e0, 0x03e8, 0x03f0, 0x03f8 }; #define NUM_IOPORTS (sizeof(wd7000_iobase)/sizeof(unsigned short)) @@ -240,24 +254,48 @@ static const short wd7000_irq[] = { 3, 4, 5, 7, 9, 10, 11, 12, 14, 15 }; static const short wd7000_dma[] = { 5, 6, 7 }; #define NUM_DMAS (sizeof(wd7000_dma)/sizeof(short)) - + +/* + * possible irq range + */ +#define IRQ_MIN 3 +#define IRQ_MAX 15 +#define IRQS (IRQ_MAX - IRQ_MIN + 1) + +/* + * The following is set up by wd7000_detect, and used thereafter by + * wd7000_intr_handle to map the irq level to the corresponding Adapter. + * Note that if SA_INTERRUPT is not used, wd7000_intr_handle must be + * changed to pick up the IRQ level correctly. 
+ */ +static struct Scsi_Host *wd7000_host[IRQS]; + +#define BUS_ON 64 /* x 125ns = 8000ns (BIOS default) */ +#define BUS_OFF 15 /* x 125ns = 1875ns (BIOS default) */ + /* * Standard Adapter Configurations - used by wd7000_detect */ typedef struct { - int irq; /* IRQ level */ - int dma; /* DMA channel */ - unsigned iobase; /* I/O base address */ + short irq; /* IRQ level */ + short dma; /* DMA channel */ + unsigned iobase; /* I/O base address */ + short bus_on; /* Time that WD7000 spends on the AT-bus when */ + /* transferring data. BIOS default is 8000ns. */ + short bus_off; /* Time that WD7000 spends OFF THE BUS after */ + /* while it is transferring data. */ + /* BIOS default is 1875ns */ } Config; /* * Add here your configuration... */ -static const Config configs[] = { - { 15, 6, 0x350 }, /* defaults for single adapter */ - { 11, 5, 0x320 }, /* defaults for second adapter */ - { 7, 6, 0x350 }, /* My configuration (Zaga) */ - { -1, -1, 0x0 } /* Empty slot */ +static Config configs[] = +{ + { 15, 6, 0x350, BUS_ON, BUS_OFF }, /* defaults for single adapter */ + { 11, 5, 0x320, BUS_ON, BUS_OFF }, /* defaults for second adapter */ + { 7, 6, 0x350, BUS_ON, BUS_OFF }, /* My configuration (Zaga) */ + { -1, -1, 0x0, BUS_ON, BUS_OFF } /* Empty slot */ }; #define NUM_CONFIGS (sizeof(configs)/sizeof(Config)) @@ -267,13 +305,14 @@ static const Config configs[] = { * added for the Future Domain version. 
*/ typedef struct signature { - const void *sig; /* String to look for */ - unsigned ofs; /* offset from BIOS base address */ - unsigned len; /* length of string */ + const char *sig; /* String to look for */ + unsigned long ofs; /* offset from BIOS base address */ + unsigned len; /* length of string */ } Signature; -static const Signature signatures[] = { - { "SSTBIOS", 0x0000d, 7 } /* "SSTBIOS" @ offset 0x0000d */ +static const Signature signatures[] = +{ + {"SSTBIOS", 0x0000d, 7} /* "SSTBIOS" @ offset 0x0000d */ }; #define NUM_SIGNATURES (sizeof(signatures)/sizeof(Signature)) @@ -282,22 +321,23 @@ static const Signature signatures[] = { * I/O Port Offsets and Bit Definitions * 4 addresses are used. Those not defined here are reserved. */ -#define ASC_STAT 0 /* Status, Read */ -#define ASC_COMMAND 0 /* Command, Write */ +#define ASC_STAT 0 /* Status, Read */ +#define ASC_COMMAND 0 /* Command, Write */ #define ASC_INTR_STAT 1 /* Interrupt Status, Read */ -#define ASC_INTR_ACK 1 /* Acknowledge, Write */ -#define ASC_CONTROL 2 /* Control, Write */ +#define ASC_INTR_ACK 1 /* Acknowledge, Write */ +#define ASC_CONTROL 2 /* Control, Write */ /* * ASC Status Port */ -#define INT_IM 0x80 /* Interrupt Image Flag */ -#define CMD_RDY 0x40 /* Command Port Ready */ -#define CMD_REJ 0x20 /* Command Port Byte Rejected */ -#define ASC_INIT 0x10 /* ASC Initialized Flag */ +#define INT_IM 0x80 /* Interrupt Image Flag */ +#define CMD_RDY 0x40 /* Command Port Ready */ +#define CMD_REJ 0x20 /* Command Port Byte Rejected */ +#define ASC_INIT 0x10 /* ASC Initialized Flag */ #define ASC_STATMASK 0xf0 /* The lower 4 Bytes are reserved */ -/* COMMAND opcodes +/* + * COMMAND opcodes * * Unfortunately, I have no idea how to properly use some of these commands, * as the OEM manual does not make it clear. I have not been able to use @@ -305,39 +345,38 @@ static const Signature signatures[] = { * discernible effect whatsoever. 
I think they may be related to certain * ICB commands, but again, the OEM manual doesn't make that clear. */ -#define NO_OP 0 /* NO-OP toggles CMD_RDY bit in ASC_STAT */ -#define INITIALIZATION 1 /* initialization (10 bytes) */ -#define DISABLE_UNS_INTR 2 /* disable unsolicited interrupts */ -#define ENABLE_UNS_INTR 3 /* enable unsolicited interrupts */ -#define INTR_ON_FREE_OGMB 4 /* interrupt on free OGMB */ -#define SOFT_RESET 5 /* SCSI bus soft reset */ -#define HARD_RESET_ACK 6 /* SCSI bus hard reset acknowledge */ -#define START_OGMB 0x80 /* start command in OGMB (n) */ +#define NO_OP 0 /* NO-OP toggles CMD_RDY bit in ASC_STAT */ +#define INITIALIZATION 1 /* initialization (10 bytes) */ +#define DISABLE_UNS_INTR 2 /* disable unsolicited interrupts */ +#define ENABLE_UNS_INTR 3 /* enable unsolicited interrupts */ +#define INTR_ON_FREE_OGMB 4 /* interrupt on free OGMB */ +#define SOFT_RESET 5 /* SCSI bus soft reset */ +#define HARD_RESET_ACK 6 /* SCSI bus hard reset acknowledge */ +#define START_OGMB 0x80 /* start command in OGMB (n) */ #define SCAN_OGMBS 0xc0 /* start multiple commands, signature (n) */ - /* where (n) = lower 6 bits */ -/* For INITIALIZATION: + /* where (n) = lower 6 bits */ +/* + * For INITIALIZATION: */ typedef struct initCmd { - unchar op; /* command opcode (= 1) */ - unchar ID; /* Adapter's SCSI ID */ - unchar bus_on; /* Bus on time, x 125ns (see below) */ - unchar bus_off; /* Bus off time, "" "" */ - unchar rsvd; /* Reserved */ - unchar mailboxes[3]; /* Address of Mailboxes, MSB first */ - unchar ogmbs; /* Number of outgoing MBs, max 64, 0,1 = 1 */ - unchar icmbs; /* Number of incoming MBs, "" "" */ + unchar op; /* command opcode (= 1) */ + unchar ID; /* Adapter's SCSI ID */ + unchar bus_on; /* Bus on time, x 125ns (see below) */ + unchar bus_off; /* Bus off time, "" "" */ + unchar rsvd; /* Reserved */ + unchar mailboxes[3]; /* Address of Mailboxes, MSB first */ + unchar ogmbs; /* Number of outgoing MBs, max 64, 0,1 = 1 */ + unchar icmbs; 
/* Number of incoming MBs, "" "" */ } InitCmd; -#define BUS_ON 64 /* x 125ns = 8000ns (BIOS default) */ -#define BUS_OFF 15 /* x 125ns = 1875ns (BIOS default) */ - -/* Interrupt Status Port - also returns diagnostic codes at ASC reset +/* + * Interrupt Status Port - also returns diagnostic codes at ASC reset * * if msb is zero, the lower bits are diagnostic status * Diagnostics: - * 01 No diagnostic error occurred - * 02 RAM failure - * 03 FIFO R/W failed + * 01 No diagnostic error occurred + * 02 RAM failure + * 03 FIFO R/W failed * 04 SBIC register read/write failed * 05 Initialization D-FF failed * 06 Host IRQ D-FF failed @@ -346,19 +385,20 @@ typedef struct initCmd { * 10NNNNNN outgoing mailbox NNNNNN is free * 11NNNNNN incoming mailbox NNNNNN needs service */ -#define MB_INTR 0xC0 /* Mailbox Service possible/required */ -#define IMB_INTR 0x40 /* 1 Incoming / 0 Outgoing */ -#define MB_MASK 0x3f /* mask for mailbox number */ +#define MB_INTR 0xC0 /* Mailbox Service possible/required */ +#define IMB_INTR 0x40 /* 1 Incoming / 0 Outgoing */ +#define MB_MASK 0x3f /* mask for mailbox number */ -/* CONTROL port bits +/* + * CONTROL port bits */ -#define INT_EN 0x08 /* Interrupt Enable */ -#define DMA_EN 0x04 /* DMA Enable */ -#define SCSI_RES 0x02 /* SCSI Reset */ -#define ASC_RES 0x01 /* ASC Reset */ +#define INT_EN 0x08 /* Interrupt Enable */ +#define DMA_EN 0x04 /* DMA Enable */ +#define SCSI_RES 0x02 /* SCSI Reset */ +#define ASC_RES 0x01 /* ASC Reset */ /* - * Driver data structures: + * Driver data structures: * - mb and scbs are required for interfacing with the host adapter. 
* An SCB has extra fields not visible to the adapter; mb's * _cannot_ do this, since the adapter assumes they are contiguous in @@ -387,28 +427,28 @@ typedef struct initCmd { */ typedef struct sgb { unchar len[3]; - unchar ptr[3]; /* Also SCSI-style - MSB first */ + unchar ptr[3]; /* Also SCSI-style - MSB first */ } Sgb; -typedef struct scb { /* Command Control Block 5.4.1 */ - unchar op; /* Command Control Block Operation Code */ - unchar idlun; /* op=0,2:Target Id, op=1:Initiator Id */ - /* Outbound data transfer, length is checked*/ - /* Inbound data transfer, length is checked */ - /* Logical Unit Number */ - unchar cdb[12]; /* SCSI Command Block */ - volatile unchar status; /* SCSI Return Status */ - volatile unchar vue; /* Vendor Unique Error Code */ - unchar maxlen[3]; /* Maximum Data Transfer Length */ - unchar dataptr[3]; /* SCSI Data Block Pointer */ - unchar linkptr[3]; /* Next Command Link Pointer */ - unchar direc; /* Transfer Direction */ - unchar reserved2[6]; /* SCSI Command Descriptor Block */ - /* end of hardware SCB */ - Scsi_Cmnd *SCpnt; /* Scsi_Cmnd using this SCB */ - Sgb sgb[WD7000_SG]; /* Scatter/gather list for this SCB */ - Adapter *host; /* host adapter */ - struct scb *next; /* for lists of scbs */ +typedef struct scb { /* Command Control Block 5.4.1 */ + unchar op; /* Command Control Block Operation Code */ + unchar idlun; /* op=0,2:Target Id, op=1:Initiator Id */ + /* Outbound data transfer, length is checked */ + /* Inbound data transfer, length is checked */ + /* Logical Unit Number */ + unchar cdb[12]; /* SCSI Command Block */ + volatile unchar status; /* SCSI Return Status */ + volatile unchar vue; /* Vendor Unique Error Code */ + unchar maxlen[3]; /* Maximum Data Transfer Length */ + unchar dataptr[3]; /* SCSI Data Block Pointer */ + unchar linkptr[3]; /* Next Command Link Pointer */ + unchar direc; /* Transfer Direction */ + unchar reserved2[6]; /* SCSI Command Descriptor Block */ + /* end of hardware SCB */ + Scsi_Cmnd *SCpnt; 
/* Scsi_Cmnd using this SCB */ + Sgb sgb[WD7000_SG]; /* Scatter/gather list for this SCB */ + Adapter *host; /* host adapter */ + struct scb *next; /* for lists of scbs */ } Scb; /* @@ -422,110 +462,110 @@ typedef struct scb { /* Command Control Block 5.4.1 */ * (notably, get/set unsolicited interrupt status) in my copy of the OEM * manual, and others are ambiguous/hard to follow. */ -#define ICB_OP_MASK 0x80 /* distinguishes scbs from icbs */ -#define ICB_OP_OPEN_RBUF 0x80 /* open receive buffer */ -#define ICB_OP_RECV_CMD 0x81 /* receive command from initiator */ -#define ICB_OP_RECV_DATA 0x82 /* receive data from initiator */ -#define ICB_OP_RECV_SDATA 0x83 /* receive data with status from init. */ -#define ICB_OP_SEND_DATA 0x84 /* send data with status to initiator */ -#define ICB_OP_SEND_STAT 0x86 /* send command status to initiator */ - /* 0x87 is reserved */ -#define ICB_OP_READ_INIT 0x88 /* read initialization bytes */ -#define ICB_OP_READ_ID 0x89 /* read adapter's SCSI ID */ -#define ICB_OP_SET_UMASK 0x8A /* set unsolicited interrupt mask */ -#define ICB_OP_GET_UMASK 0x8B /* read unsolicited interrupt mask */ -#define ICB_OP_GET_REVISION 0x8C /* read firmware revision level */ -#define ICB_OP_DIAGNOSTICS 0x8D /* execute diagnostics */ -#define ICB_OP_SET_EPARMS 0x8E /* set execution parameters */ -#define ICB_OP_GET_EPARMS 0x8F /* read execution parameters */ +#define ICB_OP_MASK 0x80 /* distinguishes scbs from icbs */ +#define ICB_OP_OPEN_RBUF 0x80 /* open receive buffer */ +#define ICB_OP_RECV_CMD 0x81 /* receive command from initiator */ +#define ICB_OP_RECV_DATA 0x82 /* receive data from initiator */ +#define ICB_OP_RECV_SDATA 0x83 /* receive data with status from init. 
*/ +#define ICB_OP_SEND_DATA 0x84 /* send data with status to initiator */ +#define ICB_OP_SEND_STAT 0x86 /* send command status to initiator */ + /* 0x87 is reserved */ +#define ICB_OP_READ_INIT 0x88 /* read initialization bytes */ +#define ICB_OP_READ_ID 0x89 /* read adapter's SCSI ID */ +#define ICB_OP_SET_UMASK 0x8A /* set unsolicited interrupt mask */ +#define ICB_OP_GET_UMASK 0x8B /* read unsolicited interrupt mask */ +#define ICB_OP_GET_REVISION 0x8C /* read firmware revision level */ +#define ICB_OP_DIAGNOSTICS 0x8D /* execute diagnostics */ +#define ICB_OP_SET_EPARMS 0x8E /* set execution parameters */ +#define ICB_OP_GET_EPARMS 0x8F /* read execution parameters */ typedef struct icbRecvCmd { - unchar op; - unchar IDlun; /* Initiator SCSI ID/lun */ - unchar len[3]; /* command buffer length */ - unchar ptr[3]; /* command buffer address */ - unchar rsvd[7]; /* reserved */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + unchar IDlun; /* Initiator SCSI ID/lun */ + unchar len[3]; /* command buffer length */ + unchar ptr[3]; /* command buffer address */ + unchar rsvd[7]; /* reserved */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbRecvCmd; typedef struct icbSendStat { - unchar op; - unchar IDlun; /* Target SCSI ID/lun */ - unchar stat; /* (outgoing) completion status byte 1 */ - unchar rsvd[12]; /* reserved */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + unchar IDlun; /* Target SCSI ID/lun */ + unchar stat; /* (outgoing) completion status byte 1 */ + unchar rsvd[12]; /* reserved */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned 
(icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbSendStat; typedef struct icbRevLvl { - unchar op; - volatile unchar primary; /* primary revision level (returned) */ - volatile unchar secondary; /* secondary revision level (returned) */ - unchar rsvd[12]; /* reserved */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + volatile unchar primary; /* primary revision level (returned) */ + volatile unchar secondary; /* secondary revision level (returned) */ + unchar rsvd[12]; /* reserved */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbRevLvl; -typedef struct icbUnsMask { /* I'm totally guessing here */ - unchar op; - volatile unchar mask[14]; /* mask bits */ -#if 0 - unchar rsvd[12]; /* reserved */ +typedef struct icbUnsMask { /* I'm totally guessing here */ + unchar op; + volatile unchar mask[14]; /* mask bits */ +#if 0 /* disabled: op + mask[14] already occupy the 15 bytes before vue */ + unchar rsvd[12]; /* reserved */ #endif - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbUnsMask; typedef struct icbDiag { - unchar op; - unchar type; /* diagnostics type code (0-3) */ - unchar len[3]; /* buffer length */ - unchar ptr[3]; /* buffer address */ - unchar rsvd[7]; /* reserved */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + unchar type; /* diagnostics type code (0-3) */ + unchar len[3]; /* buffer length */ + unchar ptr[3]; /* buffer address 
*/ + unchar rsvd[7]; /* reserved */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbDiag; -#define ICB_DIAG_POWERUP 0 /* Power-up diags only */ -#define ICB_DIAG_WALKING 1 /* walking 1's pattern */ -#define ICB_DIAG_DMA 2 /* DMA - system memory diags */ -#define ICB_DIAG_FULL 3 /* do both 1 & 2 */ +#define ICB_DIAG_POWERUP 0 /* Power-up diags only */ +#define ICB_DIAG_WALKING 1 /* walking 1's pattern */ +#define ICB_DIAG_DMA 2 /* DMA - system memory diags */ +#define ICB_DIAG_FULL 3 /* do both 1 & 2 */ typedef struct icbParms { - unchar op; - unchar rsvd1; /* reserved */ - unchar len[3]; /* parms buffer length */ - unchar ptr[3]; /* parms buffer address */ - unchar idx[2]; /* index (MSB-LSB) */ - unchar rsvd2[5]; /* reserved */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + unchar rsvd1; /* reserved */ + unchar len[3]; /* parms buffer length */ + unchar ptr[3]; /* parms buffer address */ + unchar idx[2]; /* index (MSB-LSB) */ + unchar rsvd2[5]; /* reserved */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbParms; typedef struct icbAny { - unchar op; - unchar data[14]; /* format-specific data */ - volatile unchar vue; /* vendor-unique error code */ - volatile unchar status; /* returned (icmb) status */ - volatile unchar phase; /* used by interrupt handler */ + unchar op; + unchar data[14]; /* format-specific data */ + volatile unchar vue; /* vendor-unique error code */ + volatile unchar status; /* returned (icmb) status */ + volatile unchar phase; /* used by interrupt handler */ } IcbAny; typedef union icb { - unchar op; /* ICB opcode */ - IcbRecvCmd recv_cmd; /* format for receive command */ 
- IcbSendStat send_stat; /* format for send status */ - IcbRevLvl rev_lvl; /* format for get revision level */ - IcbDiag diag; /* format for execute diagnostics */ - IcbParms eparms; /* format for get/set exec parms */ - IcbAny icb; /* generic format */ - unchar data[18]; + unchar op; /* ICB opcode */ + IcbRecvCmd recv_cmd; /* format for receive command */ + IcbSendStat send_stat; /* format for send status */ + IcbRevLvl rev_lvl; /* format for get revision level */ + IcbDiag diag; /* format for execute diagnostics */ + IcbParms eparms; /* format for get/set exec parms */ + IcbAny icb; /* generic format */ + unchar data[18]; } Icb; @@ -536,27 +576,34 @@ typedef union icb { * structure is not part of the Adapter structure. */ static Scb scbs[MAX_SCBS]; -static Scb *scbfree = NULL; /* free list */ -static int freescbs = MAX_SCBS; /* free list counter */ - -/* - * - */ -static short wd7000_setupIRQ[NUM_CONFIGS]; -static short wd7000_setupDMA[NUM_CONFIGS]; -static short wd7000_setupIO[NUM_CONFIGS]; -static short wd7000_card_num = 0; +static Scb *scbfree = NULL; /* free list */ +static int freescbs = MAX_SCBS; /* free list counter */ /* * END of data/declarations - code follows. */ +static void setup_error (char *mesg, int *ints) +{ + if (ints[0] == 3) + printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> %s\n", + ints[1], ints[2], ints[3], mesg); + else if (ints[0] == 4) + printk ("wd7000_setup: \"wd7000=%d,%d,0x%x,%d\" -> %s\n", + ints[1], ints[2], ints[3], ints[4], mesg); + else + printk ("wd7000_setup: \"wd7000=%d,%d,0x%x,%d,%d\" -> %s\n", + ints[1], ints[2], ints[3], ints[4], ints[5], mesg); +} /* * Note: You can now set these options from the kernel's "command line". * The syntax is: * - * wd7000=IRQ,DMA,IO + * wd7000=<IRQ>,<DMA>,<IO>[,<BUS_ON>[,<BUS_OFF>]] + * + * where BUS_ON and BUS_OFF are in nanoseconds. BIOS default values + * are 8000ns for BUS_ON and 1875ns for BUS_OFF. 
* eg: * wd7000=7,6,0x350 * @@ -565,82 +612,99 @@ static short wd7000_card_num = 0; */ void wd7000_setup (char *str, int *ints) { + static short wd7000_card_num = 0; short i, j; if (wd7000_card_num >= NUM_CONFIGS) { - printk ("wd7000_setup: Too many \"wd7000=\" configurations in " - "command line!\n"); - - return; + printk ("wd7000_setup: Too many \"wd7000=\" configurations in " + "command line!\n"); + return; } - if (ints[0] != 3) + if ((ints[0] < 3) || (ints[0] > 5)) printk ("wd7000_setup: Error in command line! " - "Usage: wd7000=IRQ,DMA,IO\n"); + "Usage: wd7000=<IRQ>,<DMA>,<IO>[,<BUS_ON>[,<BUS_OFF>]]\n"); else { for (i = 0; i < NUM_IRQS; i++) if (ints[1] == wd7000_irq[i]) break; if (i == NUM_IRQS) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "invalid IRQ.\n", ints[1], ints[2], ints[3]); + setup_error ("invalid IRQ.", ints); return; } else - wd7000_setupIRQ[wd7000_card_num] = ints[1]; + configs[wd7000_card_num].irq = ints[1]; for (i = 0; i < NUM_DMAS; i++) if (ints[2] == wd7000_dma[i]) break; if (i == NUM_DMAS) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "invalid DMA channel.\n", ints[1], ints[2], ints[3]); + setup_error ("invalid DMA channel.", ints); return; } else - wd7000_setupDMA[wd7000_card_num] = ints[2]; + configs[wd7000_card_num].dma = ints[2]; for (i = 0; i < NUM_IOPORTS; i++) if (ints[3] == wd7000_iobase[i]) break; if (i == NUM_IOPORTS) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "invalid I/O base address.\n", ints[1], ints[2], ints[3]); + setup_error ("invalid I/O base address.", ints); return; } else - wd7000_setupIO[wd7000_card_num] = ints[3]; + configs[wd7000_card_num].iobase = ints[3]; + + if (ints[0] > 3) { + if ((ints[4] < 500) || (ints[4] > 31875)) { + setup_error ("BUS_ON value is out of range (500 to 31875 nanoseconds)!", + ints); + configs[wd7000_card_num].bus_on = BUS_ON; + } + else + configs[wd7000_card_num].bus_on = ints[4] / 125; /* ns -> 125ns units; integer math, no FP in kernel code */ + } + else + configs[wd7000_card_num].bus_on = BUS_ON; + + if (ints[0] > 4) { + if ((ints[5] < 500) 
|| (ints[5] > 31875)) { + setup_error ("BUS_OFF value is out of range (500 to 31875 nanoseconds)!", + ints); + configs[wd7000_card_num].bus_off = BUS_OFF; + } + else + configs[wd7000_card_num].bus_off = ints[5] / 125; /* ns -> 125ns units; integer math, no FP in kernel code */ + } + else + configs[wd7000_card_num].bus_off = BUS_OFF; if (wd7000_card_num) for (i = 0; i < (wd7000_card_num - 1); i++) - for (j = i + 1; j < wd7000_card_num; j++) - if (wd7000_setupIRQ[i] == wd7000_setupIRQ[j]) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "duplicated IRQ!\n", - ints[1], ints[2], ints[3]); - return; - } - else if (wd7000_setupDMA[i] == wd7000_setupDMA[j]) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "duplicated DMA channel!\n", - ints[1], ints[2], ints[3]); - return; - } - else if (wd7000_setupIO[i] == wd7000_setupIO[j]) { - printk ("wd7000_setup: \"wd7000=%d,%d,0x%x\" -> " - "duplicated I/O base address!\n", - ints[1], ints[2], ints[3]); - return; - } - -#ifdef DEBUG - printk ("wd7000_setup: IRQ=%d, DMA=%d, I/O=0x%x\n", - wd7000_setupIRQ[wd7000_card_num], - wd7000_setupDMA[wd7000_card_num], - wd7000_setupIO[wd7000_card_num]); + for (j = i + 1; j < wd7000_card_num; j++) + if (configs[i].irq == configs[j].irq) { + setup_error ("duplicated IRQ!", ints); + return; + } + else if (configs[i].dma == configs[j].dma) { + setup_error ("duplicated DMA channel!", ints); + return; + } + else if (configs[i].iobase == configs[j].iobase) { + setup_error ("duplicated I/O base address!", ints); + return; + } + +#ifdef WD7000_DEBUG + printk ("wd7000_setup: IRQ=%d, DMA=%d, I/O=0x%x, BUS_ON=%dns, BUS_OFF=%dns\n", + configs[wd7000_card_num].irq, + configs[wd7000_card_num].dma, + configs[wd7000_card_num].iobase, + configs[wd7000_card_num].bus_on * 125, + configs[wd7000_card_num].bus_off * 125); #endif wd7000_card_num++; @@ -650,21 +714,20 @@ void wd7000_setup (char *str, int *ints) #ifdef ANY2SCSI_INLINE /* - Since they're used a lot, I've redone the following from the macros - formerly in wd7000.h, hopefully to speed them up by 
getting rid of - all the shifting (it may not matter; GCC might have done as well anyway). - - xany2scsi and xscsi2int were not being used, and are no longer defined. - (They were simply 4-byte versions of these routines). -*/ - -typedef union { /* let's cheat... */ - int i; - unchar u[sizeof(int)]; /* the sizeof(int) makes it more portable */ + * Since they're used a lot, I've redone the following from the macros + * formerly in wd7000.h, hopefully to speed them up by getting rid of + * all the shifting (it may not matter; GCC might have done as well anyway). + * + * xany2scsi and xscsi2int were not being used, and are no longer defined. + * (They were simply 4-byte versions of these routines). + */ +typedef union { /* let's cheat... */ + int i; + unchar u[sizeof (int)]; /* the sizeof(int) makes it more portable */ } i_u; -static inline void any2scsi( unchar *scsi, int any ) +static inline void any2scsi (unchar * scsi, int any) { *scsi++ = ((i_u) any).u[2]; *scsi++ = ((i_u) any).u[1]; @@ -672,49 +735,50 @@ static inline void any2scsi( unchar *scsi, int any ) } -static inline int scsi2int( unchar *scsi ) +static inline int scsi2int (unchar * scsi) { i_u result; - result.i = 0; /* clears unused bytes */ - *(result.u+2) = *scsi++; - *(result.u+1) = *scsi++; - *(result.u) = *scsi++; - return result.i; + result.i = 0; /* clears unused bytes */ + result.u[2] = *scsi++; + result.u[1] = *scsi++; + result.u[0] = *scsi++; + + return (result.i); } #else /* - These are the old ones - I've just moved them here... -*/ + * These are the old ones - I've just moved them here... 
+ */ #undef any2scsi -#define any2scsi(up, p) \ -(up)[0] = (((unsigned long)(p)) >> 16); \ -(up)[1] = ((unsigned long)(p)) >> 8; \ -(up)[2] = ((unsigned long)(p)); +#define any2scsi(up, p) (up)[0] = (((unsigned long) (p)) >> 16); \ + (up)[1] = ((unsigned long) (p)) >> 8; \ + (up)[2] = ((unsigned long) (p)); #undef scsi2int -#define scsi2int(up) ( (((unsigned long)*(up)) << 16) + \ - (((unsigned long)(up)[1]) << 8) + ((unsigned long)(up)[2]) ) +#define scsi2int(up) ( (((unsigned long) *(up)) << 16) + \ + (((unsigned long) (up)[1]) << 8) + \ + ((unsigned long) (up)[2]) ) #endif - -static inline void wd7000_enable_intr(Adapter *host) + +static inline void wd7000_enable_intr (Adapter *host) { host->control |= INT_EN; - outb(host->control, host->iobase+ASC_CONTROL); + outb (host->control, host->iobase + ASC_CONTROL); } -static inline void wd7000_enable_dma(Adapter *host) +static inline void wd7000_enable_dma (Adapter *host) { host->control |= DMA_EN; - outb(host->control,host->iobase+ASC_CONTROL); - set_dma_mode(host->dma, DMA_MODE_CASCADE); - enable_dma(host->dma); + outb (host->control, host->iobase + ASC_CONTROL); + set_dma_mode (host->dma, DMA_MODE_CASCADE); + enable_dma (host->dma); } -#define WAITnexttimeout 200 /* 2 seconds */ +#define WAITnexttimeout 200 /* 2 seconds */ static inline short WAIT (unsigned port, unsigned mask, unsigned allof, unsigned noneof) { @@ -722,7 +786,7 @@ static inline short WAIT (unsigned port, unsigned mask, unsigned allof, unsigned register unsigned long WAITtimeout = jiffies + WAITnexttimeout; while (jiffies <= WAITtimeout) { - WAITbits = inb (port) & mask; + WAITbits = inb (port) & mask; if (((WAITbits & allof) == allof) && ((WAITbits & noneof) == 0)) return (0); @@ -732,31 +796,32 @@ static inline short WAIT (unsigned port, unsigned mask, unsigned allof, unsigned } -static inline void delay( unsigned how_long ) +static inline void delay (unsigned how_long) { - register unsigned long time = jiffies + how_long; + register unsigned 
long time = jiffies + how_long; - while (jiffies < time); + while (jiffies < time); } -static inline int command_out(Adapter *host, unchar *cmd, int len) +static inline int command_out (Adapter * host, unchar * cmd, int len) { - if (! WAIT (host->iobase+ASC_STAT,ASC_STATMASK,CMD_RDY,0)) { - while (len--) { - do { - outb(*cmd, host->iobase+ASC_COMMAND); - WAIT(host->iobase+ASC_STAT, ASC_STATMASK, CMD_RDY, 0); - } while (inb(host->iobase+ASC_STAT) & CMD_REJ); + if (!WAIT (host->iobase + ASC_STAT, ASC_STATMASK, CMD_RDY, 0)) { + while (len--) { + do { + outb (*cmd, host->iobase + ASC_COMMAND); + WAIT (host->iobase + ASC_STAT, ASC_STATMASK, CMD_RDY, 0); + } while (inb (host->iobase + ASC_STAT) & CMD_REJ); cmd++; } - return 1; + return (1); } - printk("wd7000 command_out: WAIT failed(%d)\n", len+1); - return 0; + printk ("wd7000 command_out: WAIT failed(%d)\n", len + 1); + + return (0); } @@ -770,7 +835,7 @@ static inline int command_out(Adapter *host, unchar *cmd, int len) * the satisfiability of a request is not dependent on the size of the * request. 
*/ -static inline Scb *alloc_scbs(int needed) +static inline Scb *alloc_scbs (int needed) { register Scb *scb, *p; register unsigned long flags; @@ -779,84 +844,89 @@ static inline Scb *alloc_scbs(int needed) static int busy = 0; int i; - if (needed <= 0) return NULL; /* sanity check */ + if (needed <= 0) + return (NULL); /* sanity check */ - save_flags(flags); - cli(); - while (busy) { /* someone else is allocating */ - sti(); /* Yes this is really needed here */ - now = jiffies; while (jiffies == now) /* wait a jiffy */; - cli(); + save_flags (flags); + cli (); + while (busy) { /* someone else is allocating */ + sti (); /* Yes this is really needed here */ + for (now = jiffies; now == jiffies; ); /* wait a jiffy */ + cli (); } - busy = 1; /* not busy now; it's our turn */ + busy = 1; /* not busy now; it's our turn */ - while (freescbs < needed) { + while (freescbs < needed) { timeout = jiffies + WAITnexttimeout; do { - sti(); /* Yes this is really needed here */ - now = jiffies; - while (jiffies == now); /* wait a jiffy */ - cli(); - } while (freescbs < needed && jiffies <= timeout); + sti (); /* Yes this is really needed here */ + for (now = jiffies; now == jiffies; ); /* wait a jiffy */ + cli (); + } while (freescbs < needed && jiffies <= timeout); /* * If we get here with enough free Scbs, we can take them. * Otherwise, we timed out and didn't get enough. 
*/ - if (freescbs < needed) { + if (freescbs < needed) { busy = 0; - panic("wd7000: can't get enough free SCBs.\n"); - restore_flags(flags); - return NULL; + panic ("wd7000: can't get enough free SCBs.\n"); + restore_flags (flags); + return (NULL); } } - scb = scbfree; freescbs -= needed; - for (i = 0; i < needed; i++) { p = scbfree; scbfree = p->next; } + scb = scbfree; + freescbs -= needed; + for (i = 0; i < needed; i++) { + p = scbfree; + scbfree = p->next; + } p->next = NULL; - - busy = 0; /* we're done */ + busy = 0; /* we're done */ - restore_flags(flags); + restore_flags (flags); - return scb; + return (scb); } -static inline void free_scb( Scb *scb ) +static inline void free_scb (Scb *scb) { register unsigned long flags; - save_flags(flags); - cli(); + save_flags (flags); + cli (); - memset(scb, 0, sizeof(Scb)); - scb->next = scbfree; scbfree = scb; + memset (scb, 0, sizeof (Scb)); + scb->next = scbfree; + scbfree = scb; freescbs++; - restore_flags(flags); + restore_flags (flags); } -static inline void init_scbs(void) +static inline void init_scbs (void) { int i; unsigned long flags; - save_flags(flags); - cli(); + save_flags (flags); + cli (); scbfree = &(scbs[0]); - memset(scbs, 0, sizeof(scbs)); - for (i = 0; i < MAX_SCBS-1; i++) { - scbs[i].next = &(scbs[i+1]); scbs[i].SCpnt = NULL; + memset (scbs, 0, sizeof (scbs)); + for (i = 0; i < MAX_SCBS - 1; i++) { + scbs[i].next = &(scbs[i + 1]); + scbs[i].SCpnt = NULL; } - scbs[MAX_SCBS-1].next = NULL; - scbs[MAX_SCBS-1].SCpnt = NULL; + scbs[MAX_SCBS - 1].next = NULL; + scbs[MAX_SCBS - 1].SCpnt = NULL; + + restore_flags (flags); +} - restore_flags(flags); -} - -static int mail_out( Adapter *host, Scb *scbptr ) +static int mail_out (Adapter *host, Scb *scbptr) /* * Note: this can also be used for ICBs; just cast to the parm type. 
*/ @@ -866,30 +936,35 @@ static int mail_out( Adapter *host, Scb *scbptr ) unchar start_ogmb; Mailbox *ogmbs = host->mb.ogmb; int *next_ogmb = &(host->next_ogmb); -#ifdef DEBUG - printk("wd7000 mail_out: 0x%06lx",(long) scbptr); + +#ifdef WD7000_DEBUG + printk ("wd7000_mail_out: 0x%06lx", (long) scbptr); #endif + /* We first look for a free outgoing mailbox */ - save_flags(flags); - cli(); + save_flags (flags); + cli (); ogmb = *next_ogmb; for (i = 0; i < OGMB_CNT; i++) { - if (ogmbs[ogmb].status == 0) { -#ifdef DEBUG - printk(" using OGMB 0x%x",ogmb); + if (ogmbs[ogmb].status == 0) { +#ifdef WD7000_DEBUG + printk (" using OGMB 0x%x", ogmb); #endif ogmbs[ogmb].status = 1; - any2scsi((unchar *) ogmbs[ogmb].scbptr, (int) scbptr); + any2scsi ((unchar *) ogmbs[ogmb].scbptr, (int) scbptr); - *next_ogmb = (ogmb+1) % OGMB_CNT; + *next_ogmb = (ogmb + 1) % OGMB_CNT; break; - } else + } + else ogmb = (++ogmb) % OGMB_CNT; } - restore_flags(flags); -#ifdef DEBUG - printk(", scb is 0x%06lx",(long) scbptr); + restore_flags (flags); + +#ifdef WD7000_DEBUG + printk (", scb is 0x%06lx", (long) scbptr); #endif + if (i >= OGMB_CNT) { /* * Alternatively, we might issue the "interrupt on free OGMB", @@ -899,171 +974,182 @@ static int mail_out( Adapter *host, Scb *scbptr ) * that marks OGMB's free, waiting even with interrupts off * should work, since they are freed very quickly in most cases. 
*/ - #ifdef DEBUG - printk(", no free OGMBs.\n"); +#ifdef WD7000_DEBUG + printk (", no free OGMBs.\n"); #endif - return 0; + return (0); } - wd7000_enable_intr(host); + wd7000_enable_intr (host); start_ogmb = START_OGMB | ogmb; - command_out( host, &start_ogmb, 1 ); -#ifdef DEBUG - printk(", awaiting interrupt.\n"); + command_out (host, &start_ogmb, 1); + +#ifdef WD7000_DEBUG + printk (", awaiting interrupt.\n"); #endif - return 1; + + return (1); } -int make_code(unsigned hosterr, unsigned scsierr) -{ -#ifdef DEBUG +int make_code (unsigned hosterr, unsigned scsierr) +{ +#ifdef WD7000_DEBUG int in_error = hosterr; #endif - switch ((hosterr>>8)&0xff){ - case 0: /* Reserved */ - hosterr = DID_ERROR; - break; - case 1: /* Command Complete, no errors */ - hosterr = DID_OK; - break; - case 2: /* Command complete, error logged in scb status (scsierr) */ - hosterr = DID_OK; - break; - case 4: /* Command failed to complete - timeout */ - hosterr = DID_TIME_OUT; - break; - case 5: /* Command terminated; Bus reset by external device */ - hosterr = DID_RESET; - break; - case 6: /* Unexpected Command Received w/ host as target */ - hosterr = DID_BAD_TARGET; - break; + switch ((hosterr >> 8) & 0xff) { + case 0: /* Reserved */ + hosterr = DID_ERROR; + break; + case 1: /* Command Complete, no errors */ + hosterr = DID_OK; + break; + case 2: /* Command complete, error logged in scb status (scsierr) */ + hosterr = DID_OK; + break; + case 4: /* Command failed to complete - timeout */ + hosterr = DID_TIME_OUT; + break; + case 5: /* Command terminated; Bus reset by external device */ + hosterr = DID_RESET; + break; + case 6: /* Unexpected Command Received w/ host as target */ + hosterr = DID_BAD_TARGET; + break; case 80: /* Unexpected Reselection */ case 81: /* Unexpected Selection */ - hosterr = DID_BAD_INTR; - break; + hosterr = DID_BAD_INTR; + break; case 82: /* Abort Command Message */ - hosterr = DID_ABORT; - break; + hosterr = DID_ABORT; + break; case 83: /* SCSI Bus Software 
Reset */ case 84: /* SCSI Bus Hardware Reset */ - hosterr = DID_RESET; - break; + hosterr = DID_RESET; + break; default: /* Reserved */ - hosterr = DID_ERROR; - break; - } -#ifdef DEBUG - if (scsierr||hosterr) - printk("\nSCSI command error: SCSI 0x%02x host 0x%04x return %d\n", - scsierr,in_error,hosterr); + hosterr = DID_ERROR; + } +#ifdef WD7000_DEBUG + if (scsierr || hosterr) + printk ("\nSCSI command error: SCSI 0x%02x host 0x%04x return %d\n", + scsierr, in_error, hosterr); #endif - return scsierr | (hosterr << 16); + return (scsierr | (hosterr << 16)); } -static void wd7000_scsi_done(Scsi_Cmnd * SCpnt) +static void wd7000_scsi_done (Scsi_Cmnd *SCpnt) { -#ifdef DEBUG +#ifdef WD7000_DEBUG printk ("wd7000_scsi_done: 0x%06lx\n", (long) SCpnt); #endif + SCpnt->SCp.phase = 0; } -#define wd7000_intr_ack(host) outb(0,host->iobase+ASC_INTR_ACK) +#define wd7000_intr_ack(host) outb (0, host->iobase + ASC_INTR_ACK) -void wd7000_intr_handle(int irq, void *dev_id, struct pt_regs * regs) +void wd7000_intr_handle (int irq, void *dev_id, struct pt_regs *regs) { register int flag, icmb, errstatus, icmb_status; register int host_error, scsi_error; - register Scb *scb; /* for SCSI commands */ - register IcbAny *icb; /* for host commands */ + register Scb *scb; /* for SCSI commands */ + register IcbAny *icb; /* for host commands */ register Scsi_Cmnd *SCpnt; - Adapter *host = irq2host[irq]; /* This MUST be set!!! */ + Adapter *host = (Adapter *) wd7000_host[irq - IRQ_MIN]->hostdata; /* This MUST be set!!! 
*/ Mailbox *icmbs = host->mb.icmb; -#ifdef DEBUG - printk("wd7000_intr_handle: irq = %d, host = 0x%06lx\n", irq, (long) host); + host->int_counter++; + +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: irq = %d, host = 0x%06lx\n", irq, (long) host); #endif - flag = inb(host->iobase+ASC_INTR_STAT); -#ifdef DEBUG - printk("wd7000_intr_handle: intr stat = 0x%02x\n",flag); + flag = inb (host->iobase + ASC_INTR_STAT); + +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: intr stat = 0x%02x\n", flag); #endif - if (!(inb(host->iobase+ASC_STAT) & INT_IM)) { + if (!(inb (host->iobase + ASC_STAT) & INT_IM)) { /* NB: these are _very_ possible if IRQ 15 is being used, since - it's the "garbage collector" on the 2nd 8259 PIC. Specifically, - any interrupt signal into the 8259 which can't be identified - comes out as 7 from the 8259, which is 15 to the host. Thus, it - is a good thing the WD7000 has an interrupt status port, so we - can sort these out. Otherwise, electrical noise and other such - problems would be indistinguishable from valid interrupts... - */ -#ifdef DEBUG - printk("wd7000_intr_handle: phantom interrupt...\n"); + * it's the "garbage collector" on the 2nd 8259 PIC. Specifically, + * any interrupt signal into the 8259 which can't be identified + * comes out as 7 from the 8259, which is 15 to the host. Thus, it + * is a good thing the WD7000 has an interrupt status port, so we + * can sort these out. Otherwise, electrical noise and other such + * problems would be indistinguishable from valid interrupts... 
+ */ +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: phantom interrupt...\n"); #endif - wd7000_intr_ack(host); - return; + wd7000_intr_ack (host); + return; } - if (flag & MB_INTR) { + if (flag & MB_INTR) { /* The interrupt is for a mailbox */ if (!(flag & IMB_INTR)) { -#ifdef DEBUG - printk("wd7000_intr_handle: free outgoing mailbox\n"); +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: free outgoing mailbox\n"); #endif /* * If sleep_on() and the "interrupt on free OGMB" command are * used in mail_out(), wake_up() should correspondingly be called * here. For now, we don't need to do anything special. */ - wd7000_intr_ack(host); + wd7000_intr_ack (host); return; - } else { + } + else { /* The interrupt is for an incoming mailbox */ icmb = flag & MB_MASK; icmb_status = icmbs[icmb].status; - if (icmb_status & 0x80) { /* unsolicited - result in ICMB */ -#ifdef DEBUG - printk("wd7000_intr_handle: unsolicited interrupt 0x%02xh\n", - icmb_status); + if (icmb_status & 0x80) { /* unsolicited - result in ICMB */ +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: unsolicited interrupt 0x%02x\n", + icmb_status); #endif - wd7000_intr_ack(host); + wd7000_intr_ack (host); return; } - scb = (struct scb *) scsi2int((unchar *)icmbs[icmb].scbptr); + /* Aaaargh! 
(Zaga) */ + scb = (struct scb *) (scsi2int ((unchar *) icmbs[icmb].scbptr) | PAGE_OFFSET); icmbs[icmb].status = 0; - if (!(scb->op & ICB_OP_MASK)) { /* an SCB is done */ + if (!(scb->op & ICB_OP_MASK)) { /* an SCB is done */ SCpnt = scb->SCpnt; - if (--(SCpnt->SCp.phase) <= 0) { /* all scbs are done */ + if (--(SCpnt->SCp.phase) <= 0) { /* all scbs are done */ host_error = scb->vue | (icmb_status << 8); scsi_error = scb->status; - errstatus = make_code(host_error,scsi_error); + errstatus = make_code (host_error, scsi_error); SCpnt->result = errstatus; - free_scb(scb); + free_scb (scb); - SCpnt->scsi_done(SCpnt); + SCpnt->scsi_done (SCpnt); } - } else { /* an ICB is done */ + } + else { /* an ICB is done */ icb = (IcbAny *) scb; icb->status = icmb_status; - icb->phase = 0; + icb->phase = 0; } - } /* incoming mailbox */ + } /* incoming mailbox */ } - wd7000_intr_ack(host); - return; + wd7000_intr_ack (host); + +#ifdef WD7000_DEBUG + printk ("wd7000_intr_handle: return from interrupt handler\n"); +#endif } -int wd7000_queuecommand(Scsi_Cmnd * SCpnt, void (*done)(Scsi_Cmnd *)) +int wd7000_queuecommand (Scsi_Cmnd *SCpnt, void (*done) (Scsi_Cmnd *)) { register Scb *scb; register Sgb *sgb; @@ -1076,94 +1162,105 @@ int wd7000_queuecommand(Scsi_Cmnd * SCpnt, void (*done)(Scsi_Cmnd *)) idlun = ((SCpnt->target << 5) & 0xe0) | (SCpnt->lun & 7); SCpnt->scsi_done = done; SCpnt->SCp.phase = 1; - scb = alloc_scbs(1); + scb = alloc_scbs (1); scb->idlun = idlun; - memcpy(scb->cdb, cdb, cdblen); + memcpy (scb->cdb, cdb, cdblen); scb->direc = 0x40; /* Disable direction check */ - scb->SCpnt = SCpnt; /* so we can find stuff later */ + scb->SCpnt = SCpnt; /* so we can find stuff later */ SCpnt->host_scribble = (unchar *) scb; scb->host = host; - if (SCpnt->use_sg) { + if (SCpnt->use_sg) { struct scatterlist *sg = (struct scatterlist *) SCpnt->request_buffer; unsigned i; - if (SCpnt->host->sg_tablesize == SG_NONE) { - panic("wd7000_queuecommand: scatter/gather not supported.\n"); + if 
(SCpnt->host->sg_tablesize == SG_NONE) { + panic ("wd7000_queuecommand: scatter/gather not supported.\n"); } -#ifdef DEBUG - printk("Using scatter/gather with %d elements.\n",SCpnt->use_sg); +#ifdef WD7000_DEBUG + printk ("Using scatter/gather with %d elements.\n", SCpnt->use_sg); #endif sgb = scb->sgb; - scb->op = 1; - any2scsi(scb->dataptr, (int) sgb); - any2scsi(scb->maxlen, SCpnt->use_sg * sizeof (Sgb) ); + scb->op = 1; + any2scsi (scb->dataptr, (int) sgb); + any2scsi (scb->maxlen, SCpnt->use_sg * sizeof (Sgb)); - for (i = 0; i < SCpnt->use_sg; i++) { - any2scsi(sgb[i].ptr, (int) sg[i].address); - any2scsi(sgb[i].len, sg[i].length); + for (i = 0; i < SCpnt->use_sg; i++) { + any2scsi (sgb[i].ptr, (int) sg[i].address); + any2scsi (sgb[i].len, sg[i].length); } - } else { + } + else { scb->op = 0; - any2scsi(scb->dataptr, (int) SCpnt->request_buffer); - any2scsi(scb->maxlen, SCpnt->request_bufflen); + any2scsi (scb->dataptr, (int) SCpnt->request_buffer); + any2scsi (scb->maxlen, SCpnt->request_bufflen); } - while (!mail_out(host, scb)) /* keep trying */; - return 1; + while (!mail_out (host, scb)); /* keep trying */ + + return (1); } -int wd7000_command(Scsi_Cmnd *SCpnt) +int wd7000_command (Scsi_Cmnd *SCpnt) { - wd7000_queuecommand(SCpnt, wd7000_scsi_done); + wd7000_queuecommand (SCpnt, wd7000_scsi_done); - while (SCpnt->SCp.phase > 0) barrier(); /* phase counts scbs down to 0 */ + while (SCpnt->SCp.phase > 0) + barrier (); /* phase counts scbs down to 0 */ - return SCpnt->result; + return (SCpnt->result); } -int wd7000_diagnostics( Adapter *host, int code ) +int wd7000_diagnostics (Adapter *host, int code) { static IcbDiag icb = {ICB_OP_DIAGNOSTICS}; static unchar buf[256]; unsigned long timeout; icb.type = code; - any2scsi(icb.len, sizeof(buf)); - any2scsi(icb.ptr, (int) &buf); + any2scsi (icb.len, sizeof (buf)); + any2scsi (icb.ptr, (int) &buf); icb.phase = 1; /* * This routine is only called at init, so there should be OGMBs * available. I'm assuming so here. 
If this is going to * fail, I can just let the timeout catch the failure. */ - mail_out(host, (struct scb *) &icb); - timeout = jiffies + WAITnexttimeout; /* wait up to 2 seconds */ + mail_out (host, (struct scb *) &icb); + timeout = jiffies + WAITnexttimeout; /* wait up to 2 seconds */ while (icb.phase && jiffies < timeout) - barrier(); /* wait for completion */ + barrier (); /* wait for completion */ - if (icb.phase) { - printk("wd7000_diagnostics: timed out.\n"); - return 0; + if (icb.phase) { + printk ("wd7000_diagnostics: timed out.\n"); + return (0); } - if (make_code(icb.vue|(icb.status << 8),0)) { - printk("wd7000_diagnostics: failed (0x%02x,0x%02x)\n", - icb.vue, icb.status); - return 0; + if (make_code (icb.vue | (icb.status << 8), 0)) { + printk ("wd7000_diagnostics: failed (0x%02x,0x%02x)\n", + icb.vue, icb.status); + return (0); } - return 1; + return (1); } -int wd7000_init( Adapter *host ) +int wd7000_init (Adapter *host) { - InitCmd init_cmd = { - INITIALIZATION, 7, BUS_ON, BUS_OFF, 0, {0,0,0}, OGMB_CNT, ICMB_CNT + InitCmd init_cmd = + { + INITIALIZATION, + 7, + host->bus_on, + host->bus_off, + 0, + { 0, 0, 0 }, + OGMB_CNT, + ICMB_CNT }; int diag; @@ -1171,84 +1268,78 @@ int wd7000_init( Adapter *host ) * Reset the adapter - only. The SCSI bus was initialized at power-up, * and we need to do this just so we control the mailboxes, etc. 
*/ - outb(ASC_RES, host->iobase+ASC_CONTROL); - delay(1); /* reset pulse: this is 10ms, only need 25us */ - outb(0,host->iobase+ASC_CONTROL); - host->control = 0; /* this must always shadow ASC_CONTROL */ - - if (WAIT (host->iobase+ASC_STAT, ASC_STATMASK, CMD_RDY, 0)) { - printk ("wd7000_init: WAIT timed out.\n"); - return 0; /* 0 = not ok */ + outb (ASC_RES, host->iobase + ASC_CONTROL); + delay (1); /* reset pulse: this is 10ms, only need 25us */ + outb (0, host->iobase + ASC_CONTROL); + host->control = 0; /* this must always shadow ASC_CONTROL */ + + if (WAIT (host->iobase + ASC_STAT, ASC_STATMASK, CMD_RDY, 0)) { + printk ("wd7000_init: WAIT timed out.\n"); + return (0); /* 0 = not ok */ } - if ((diag = inb(host->iobase+ASC_INTR_STAT)) != 1) { - printk("wd7000_init: "); - - switch (diag) { - case 2: - printk("RAM failure.\n"); - break; - case 3: - printk("FIFO R/W failed\n"); - break; - case 4: - printk("SBIC register R/W failed\n"); - break; - case 5: - printk("Initialization D-FF failed.\n"); - break; - case 6: - printk("Host IRQ D-FF failed.\n"); - break; - case 7: - printk("ROM checksum error.\n"); - break; - default: - printk("diagnostic code 0x%02Xh received.\n", diag); + if ((diag = inb (host->iobase + ASC_INTR_STAT)) != 1) { + printk ("wd7000_init: "); + + switch (diag) { + case 2: printk ("RAM failure.\n"); + break; + case 3: printk ("FIFO R/W failed\n"); + break; + case 4: printk ("SBIC register R/W failed\n"); + break; + case 5: printk ("Initialization D-FF failed.\n"); + break; + case 6: printk ("Host IRQ D-FF failed.\n"); + break; + case 7: printk ("ROM checksum error.\n"); + break; + default: printk ("diagnostic code 0x%02Xh received.\n", diag); } - return 0; + return (0); } - + /* Clear mailboxes */ - memset(&(host->mb), 0, sizeof(host->mb)); + memset (&(host->mb), 0, sizeof (host->mb)); /* Execute init command */ - any2scsi((unchar *) &(init_cmd.mailboxes), (int) &(host->mb)); - if (!command_out(host, (unchar *) &init_cmd, sizeof(init_cmd))) { - 
printk("wd7000_init: adapter initialization failed.\n"); - return 0; + any2scsi ((unchar *) & (init_cmd.mailboxes), (int) &(host->mb)); + if (!command_out (host, (unchar *) &init_cmd, sizeof (init_cmd))) { + printk ("wd7000_init: adapter initialization failed.\n"); + return (0); } - if (WAIT (host->iobase+ASC_STAT, ASC_STATMASK, ASC_INIT, 0)) { - printk ("wd7000_init: WAIT timed out.\n"); - return 0; + if (WAIT (host->iobase + ASC_STAT, ASC_STATMASK, ASC_INIT, 0)) { + printk ("wd7000_init: WAIT timed out.\n"); + return (0); } - if (request_irq(host->irq, wd7000_intr_handle, SA_INTERRUPT, "wd7000", NULL)) { - printk("wd7000_init: can't get IRQ %d.\n", host->irq); - return 0; + if (request_irq (host->irq, wd7000_intr_handle, SA_INTERRUPT, "wd7000", NULL)) { + printk ("wd7000_init: can't get IRQ %d.\n", host->irq); + return (0); } - if (request_dma(host->dma,"wd7000")) { - printk("wd7000_init: can't get DMA channel %d.\n", host->dma); - free_irq(host->irq, NULL); - return 0; + if (request_dma (host->dma, "wd7000")) { + printk ("wd7000_init: can't get DMA channel %d.\n", host->dma); + free_irq (host->irq, NULL); + return (0); } - wd7000_enable_dma(host); - wd7000_enable_intr(host); + wd7000_enable_dma (host); + wd7000_enable_intr (host); - if (!wd7000_diagnostics(host,ICB_DIAG_FULL)) { - free_dma(host->dma); - free_irq(host->irq, NULL); - return 0; + if (!wd7000_diagnostics (host, ICB_DIAG_FULL)) { + free_dma (host->dma); + free_irq (host->irq, NULL); + return (0); } - return 1; + return (1); } -void wd7000_revision(Adapter *host) +void wd7000_revision (Adapter *host) { - static IcbRevLvl icb = {ICB_OP_GET_REVISION}; + static IcbRevLvl icb = + {ICB_OP_GET_REVISION}; icb.phase = 1; /* @@ -1257,14 +1348,160 @@ void wd7000_revision(Adapter *host) * the only damage will be that the revision will show up as 0.0, * which in turn means that scatter/gather will be disabled. 
*/ - mail_out(host, (struct scb *) &icb); + mail_out (host, (struct scb *) &icb); while (icb.phase) - barrier(); /* wait for completion */ + barrier (); /* wait for completion */ host->rev1 = icb.primary; host->rev2 = icb.secondary; } +#undef SPRINTF +#define SPRINTF(args...) { if (pos < (buffer + length)) pos += sprintf (pos, ## args); } + +int wd7000_set_info (char *buffer, int length, struct Scsi_Host *host) +{ + unsigned long flags; + + save_flags (flags); + cli (); + +#ifdef WD7000_DEBUG + printk ("Buffer = <%.*s>, length = %d\n", length, buffer, length); +#endif + + /* + * Currently this is a no-op + */ + printk ("Sorry, this function is currently out of order...\n"); + + restore_flags (flags); + + return (length); +} + + +int wd7000_proc_info (char *buffer, char **start, off_t offset, int length, int hostno, int inout) +{ + struct Scsi_Host *host = NULL; + Scsi_Device *scd; + Adapter *adapter; + unsigned long flags; + char *pos = buffer; + short i; + +#ifdef WD7000_DEBUG + Mailbox *ogmbs, *icmbs; + short count; +#endif + + /* + * Find the specified host board. + */ + for (i = 0; i < IRQS; i++) + if (wd7000_host[i] && (wd7000_host[i]->host_no == hostno)) { + host = wd7000_host[i]; + + break; + } + + /* + * Host not found! + */ + if (! host) + return (-ESRCH); + + /* + * Has data been written to the file ? 
+ */ + if (inout) + return (wd7000_set_info (buffer, length, host)); + + adapter = (Adapter *) host->hostdata; + + save_flags (flags); + cli (); + + SPRINTF ("Host scsi%d: Western Digital WD-7000 (rev %d.%d)\n", hostno, adapter->rev1, adapter->rev2); + SPRINTF (" IO base: 0x%x\n", adapter->iobase); + SPRINTF (" IRQ: %d\n", adapter->irq); + SPRINTF (" DMA channel: %d\n", adapter->dma); + SPRINTF (" Interrupts: %d\n", adapter->int_counter); + SPRINTF (" BUS_ON time: %d nanoseconds\n", adapter->bus_on * 125); + SPRINTF (" BUS_OFF time: %d nanoseconds\n", adapter->bus_off * 125); + +#ifdef WD7000_DEBUG + ogmbs = adapter->mb.ogmb; + icmbs = adapter->mb.icmb; + + SPRINTF ("\nControl port value: 0x%x\n", adapter->control); + SPRINTF ("Incoming mailbox:\n"); + SPRINTF (" size: %d\n", ICMB_CNT); + SPRINTF (" queued messages: "); + + for (i = count = 0; i < ICMB_CNT; i++) + if (icmbs[i].status) { + count++; + SPRINTF ("0x%x ", i); + } + + SPRINTF (count ? "\n" : "none\n"); + + SPRINTF ("Outgoing mailbox:\n"); + SPRINTF (" size: %d\n", OGMB_CNT); + SPRINTF (" next message: 0x%x\n", adapter->next_ogmb); + SPRINTF (" queued messages: "); + + for (i = count = 0; i < OGMB_CNT; i++) + if (ogmbs[i].status) { + count++; + SPRINTF ("0x%x ", i); + } + + SPRINTF (count ? "\n" : "none\n"); +#endif + + /* + * Display driver information for each device attached to the board. + */ + scd = host->host_queue; + + SPRINTF ("\nAttached devices: %s\n", scd ? "" : "none"); + + for ( ; scd; scd = scd->next) + if (scd->host->host_no == hostno) { + SPRINTF (" [Channel: %02d, Id: %02d, Lun: %02d] ", + scd->channel, scd->id, scd->lun); + SPRINTF ("%s ", (scd->type < MAX_SCSI_DEVICE_CODE) ? 
+ scsi_device_types[(short) scd->type] : "Unknown device"); + + for (i = 0; (i < 8) && (scd->vendor[i] >= 0x20); i++) + SPRINTF ("%c", scd->vendor[i]); + SPRINTF (" "); + + for (i = 0; (i < 16) && (scd->model[i] >= 0x20); i++) + SPRINTF ("%c", scd->model[i]); + SPRINTF ("\n"); + } + + SPRINTF ("\n"); + + restore_flags (flags); + + /* + * Calculate start of next buffer, and return value. + */ + *start = buffer + offset; + + if ((pos - buffer) < offset) + return (0); + else if ((pos - buffer - offset) < length) + return (pos - buffer - offset); + else + return (length); +} + + /* * Returns the number of adapters this driver is supporting. * @@ -1277,84 +1514,102 @@ void wd7000_revision(Adapter *host) */ int wd7000_detect (Scsi_Host_Template *tpnt) { - short present = 0, biosaddr_ptr, cfg_ptr, sig_ptr, i, pass; + short present = 0, biosaddr_ptr, sig_ptr, i, pass; short biosptr[NUM_CONFIGS]; unsigned iobase; Adapter *host = NULL; struct Scsi_Host *sh; - for (i = 0; i < NUM_CONFIGS; biosptr[i++] = -1); +#ifdef WD7000_DEBUG + printk ("wd7000_detect: started\n"); +#endif + + for (i = 0; i < IRQS; wd7000_host[i++] = NULL) ; + for (i = 0; i < NUM_CONFIGS; biosptr[i++] = -1) ; tpnt->proc_dir = &proc_scsi_wd7000; + tpnt->proc_info = &wd7000_proc_info; /* * Set up SCB free list, which is shared by all adapters */ init_scbs (); - for (pass = 0, cfg_ptr = 0; pass < NUM_CONFIGS; pass++) { - for (biosaddr_ptr = 0; biosaddr_ptr < NUM_ADDRS; biosaddr_ptr++) + for (pass = 0; pass < NUM_CONFIGS; pass++) { + /* + * First, search for BIOS SIGNATURE... 
+ */ + for (biosaddr_ptr = 0; biosaddr_ptr < NUM_ADDRS; biosaddr_ptr++) for (sig_ptr = 0; sig_ptr < NUM_SIGNATURES; sig_ptr++) { - for (i = 0; i < pass; i++) - if (biosptr[i] == biosaddr_ptr) - break; - - if ((i == pass) && - check_signature(wd7000_biosaddr[biosaddr_ptr] + - signatures[sig_ptr].ofs, - signatures[sig_ptr].sig, - signatures[sig_ptr].len)) - goto bios_matched; - } + for (i = 0; i < pass; i++) + if (biosptr[i] == biosaddr_ptr) + break; -bios_matched: + if (i == pass) { +#if (LINUX_VERSION_CODE < 0x020100) +#else + void *biosaddr = ioremap (wd7000_biosaddr[biosaddr_ptr] + + signatures[sig_ptr].ofs, + signatures[sig_ptr].len); +#endif + short bios_match = memcmp ((char *) biosaddr, signatures[sig_ptr].sig, + signatures[sig_ptr].len); + +#if (LINUX_VERSION_CODE < 0x020100) +#else + iounmap (biosaddr); +#endif -#ifdef DEBUG + if (! bios_match) + goto bios_matched; + } + } + + bios_matched: + /* + * BIOS SIGNATURE has been found. + */ +#ifdef WD7000_DEBUG printk ("wd7000_detect: pass %d\n", pass + 1); - if (biosaddr_ptr == NUM_ADDRS) + if (biosaddr_ptr == NUM_ADDRS) printk ("WD-7000 SST BIOS not detected...\n"); else printk ("WD-7000 SST BIOS detected at 0x%lx: checking...\n", - wd7000_biosaddr[biosaddr_ptr]); + wd7000_biosaddr[biosaddr_ptr]); #endif - if (wd7000_card_num) - iobase = wd7000_setupIO[wd7000_card_num - 1]; - else { - if (configs[cfg_ptr++].irq < 0) - continue; + if (configs[pass].irq < 0) + continue; - iobase = configs[cfg_ptr - 1].iobase; - } + iobase = configs[pass].iobase; -#ifdef DEBUG - printk ("wd7000_detect: check IO 0x%x region...\n", iobase); +#ifdef WD7000_DEBUG + printk ("wd7000_detect: check IO 0x%x region...\n", iobase); #endif - if (! check_region (iobase, 4)) { + if (!check_region (iobase, 4)) { -#ifdef DEBUG - printk ("wd7000_detect: ASC reset (IO 0x%x) ...", iobase); +#ifdef WD7000_DEBUG + printk ("wd7000_detect: ASC reset (IO 0x%x) ...", iobase); #endif - - /* - * ASC reset... 
- */ - outb (ASC_RES, iobase + ASC_CONTROL); - delay (1); - outb (0, iobase + ASC_CONTROL); - - if (WAIT (iobase + ASC_STAT, ASC_STATMASK, CMD_RDY, 0)) -#ifdef DEBUG - { - printk ("failed!\n"); - continue; - } - else - printk ("ok!\n"); + /* + * ASC reset... + */ + outb (ASC_RES, iobase + ASC_CONTROL); + delay (1); + outb (0, iobase + ASC_CONTROL); + + if (WAIT (iobase + ASC_STAT, ASC_STATMASK, CMD_RDY, 0)) +#ifdef WD7000_DEBUG + { + printk ("failed!\n"); + continue; + } + else + printk ("ok!\n"); #else - continue; + continue; #endif if (inb (iobase + ASC_INTR_STAT) == 1) { @@ -1368,33 +1623,27 @@ bios_matched: sh = scsi_register (tpnt, sizeof (Adapter)); host = (Adapter *) sh->hostdata; -#ifdef DEBUG - printk ("wd7000_detect: adapter allocated at 0x%x\n", - (int) host); +#ifdef WD7000_DEBUG + printk ("wd7000_detect: adapter allocated at 0x%x\n", (int) host); #endif memset (host, 0, sizeof (Adapter)); - if (wd7000_card_num) { - host->irq = wd7000_setupIRQ[--wd7000_card_num]; - host->dma = wd7000_setupDMA[wd7000_card_num]; - } - else { - host->irq = configs[cfg_ptr - 1].irq; - host->dma = configs[cfg_ptr - 1].dma; - } - - host->sh = sh; - host->iobase = iobase; - irq2host[host->irq] = host; + host->irq = configs[pass].irq; + host->dma = configs[pass].dma; + host->iobase = iobase; + host->int_counter = 0; + host->bus_on = configs[pass].bus_on; + host->bus_off = configs[pass].bus_off; + host->sh = wd7000_host[host->irq - IRQ_MIN] = sh; -#ifdef DEBUG +#ifdef WD7000_DEBUG printk ("wd7000_detect: Trying init WD-7000 card at IO " "0x%x, IRQ %d, DMA %d...\n", host->iobase, host->irq, host->dma); #endif - if (! wd7000_init (host)) { /* Initialization failed */ + if (!wd7000_init (host)) { /* Initialization failed */ scsi_unregister (sh); continue; @@ -1403,7 +1652,7 @@ bios_matched: /* * OK from here - we'll use this adapter/configuration. 
*/ - wd7000_revision (host); /* important for scatter/gather */ + wd7000_revision (host); /* important for scatter/gather */ /* * Register our ports. @@ -1411,34 +1660,34 @@ bios_matched: request_region (host->iobase, 4, "wd7000"); /* - * For boards before rev 6.0, scatter/gather - * isn't supported. + * For boards before rev 6.0, scatter/gather isn't supported. */ if (host->rev1 < 6) sh->sg_tablesize = SG_NONE; - present++; /* count it */ + present++; /* count it */ if (biosaddr_ptr != NUM_ADDRS) biosptr[pass] = biosaddr_ptr; printk ("Western Digital WD-7000 (rev %d.%d) ", host->rev1, host->rev2); - printk ("using IO 0x%x, IRQ %d, DMA %d.\n", + printk ("using IO 0x%x, IRQ %d, DMA %d.\n", host->iobase, host->irq, host->dma); + printk (" BUS_ON time: %dns, BUS_OFF time: %dns\n", + host->bus_on * 125, host->bus_off * 125); } } -#ifdef DEBUG +#ifdef WD7000_DEBUG else - printk ("wd7000_detect: IO 0x%x region already allocated!\n", - iobase); + printk ("wd7000_detect: IO 0x%x region already allocated!\n", iobase); #endif } - if (! present) - printk ("Failed initialization of WD-7000 SCSI card!\n"); + if (!present) + printk ("Failed initialization of WD-7000 SCSI card!\n"); return (present); } @@ -1447,42 +1696,81 @@ bios_matched: /* * I have absolutely NO idea how to do an abort with the WD7000... */ -int wd7000_abort(Scsi_Cmnd * SCpnt) +int wd7000_abort (Scsi_Cmnd *SCpnt) { Adapter *host = (Adapter *) SCpnt->host->hostdata; - if (inb(host->iobase+ASC_STAT) & INT_IM) { - printk("wd7000_abort: lost interrupt\n"); - wd7000_intr_handle(host->irq, NULL, NULL); - return SCSI_ABORT_SUCCESS; + if (inb (host->iobase + ASC_STAT) & INT_IM) { + printk ("wd7000_abort: lost interrupt\n"); + wd7000_intr_handle (host->irq, NULL, NULL); + + return (SCSI_ABORT_SUCCESS); } - return SCSI_ABORT_SNOOZE; + return (SCSI_ABORT_SNOOZE); } /* * I also have no idea how to do a reset... 
*/ -int wd7000_reset(Scsi_Cmnd * SCpnt, unsigned int ignored) +int wd7000_reset (Scsi_Cmnd *SCpnt, unsigned int unused) { - return SCSI_RESET_PUNT; + return (SCSI_RESET_PUNT); } /* - * This was borrowed directly from aha1542.c, but my disks are organized - * this way, so I think it will work OK. Someone who is ambitious can - * borrow a newer or more complete version from another driver. + * This was borrowed directly from aha1542.c. (Zaga) */ -int wd7000_biosparam(Disk * disk, kdev_t dev, int* ip) +int wd7000_biosparam (Disk *disk, kdev_t dev, int *ip) { - int size = disk->capacity; - ip[0] = 64; - ip[1] = 32; - ip[2] = size >> 11; -/* if (ip[2] >= 1024) ip[2] = 1024; */ - return 0; +#ifdef WD7000_DEBUG + printk ("wd7000_biosparam: dev=%s, size=%d, ", kdevname (dev), disk->capacity); +#endif + + /* + * try default translation + */ + ip[0] = 64; + ip[1] = 32; + ip[2] = disk->capacity / (64 * 32); + + /* + * for disks >1GB do some guessing + */ + if (ip[2] >= 1024) { + int info[3]; + + /* + * try to figure out the geometry from the partition table + */ + if ((scsicam_bios_param (disk, dev, info) < 0) || + !(((info[0] == 64) && (info[1] == 32)) || + ((info[0] == 255) && (info[1] == 63)))) { + printk ("wd7000_biosparam: unable to verify geometry for disk with >1GB.\n" + " using extended translation.\n"); + + ip[0] = 255; + ip[1] = 63; + ip[2] = disk->capacity / (255 * 63); + } + else { + ip[0] = info[0]; + ip[1] = info[1]; + ip[2] = info[2]; + + if (info[0] == 255) + printk ("wd7000_biosparam: current partition table is using extended translation.\n"); + } + } + +#ifdef WD7000_DEBUG + printk ("bios geometry: head=%d, sec=%d, cyl=%d\n", ip[0], ip[1], ip[2]); + printk ("WARNING: check, if the bios geometry is correct.\n"); +#endif + + return (0); } #ifdef MODULE diff --git a/drivers/scsi/wd7000.h b/drivers/scsi/wd7000.h index 8835e5f508d6..73679c24a4da 100644 --- a/drivers/scsi/wd7000.h +++ b/drivers/scsi/wd7000.h @@ -8,17 +8,23 @@ * This file has been reduced to only 
the definitions needed for the * WD7000 host structure. * + * Revision by Miroslav Zagorac Jun 1997. */ #include #include -int wd7000_detect(Scsi_Host_Template *); -int wd7000_command(Scsi_Cmnd *); -int wd7000_queuecommand(Scsi_Cmnd *, void (*done)(Scsi_Cmnd *)); -int wd7000_abort(Scsi_Cmnd *); -int wd7000_reset(Scsi_Cmnd *, unsigned int); -int wd7000_biosparam(Disk *, kdev_t, int *); +extern struct proc_dir_entry proc_scsi_wd7000; + + +int wd7000_set_info (char *buffer, int length, struct Scsi_Host *host); +int wd7000_proc_info (char *buffer, char **start, off_t offset, int length, int hostno, int inout); +int wd7000_detect (Scsi_Host_Template *); +int wd7000_command (Scsi_Cmnd *); +int wd7000_queuecommand (Scsi_Cmnd *, void (*done)(Scsi_Cmnd *)); +int wd7000_abort (Scsi_Cmnd *); +int wd7000_reset (Scsi_Cmnd *, unsigned int); +int wd7000_biosparam (Disk *, kdev_t, int *); #ifndef NULL #define NULL 0L @@ -38,18 +44,22 @@ int wd7000_biosparam(Disk *, kdev_t, int *); #define WD7000_Q 16 #define WD7000_SG 16 -#define WD7000 { \ - name: "Western Digital WD-7000", \ - detect: wd7000_detect, \ - command: wd7000_command, \ - queuecommand: wd7000_queuecommand, \ - abort: wd7000_abort, \ - reset: wd7000_reset, \ - bios_param: wd7000_biosparam, \ - can_queue: WD7000_Q, \ - this_id: 7, \ - sg_tablesize: WD7000_SG, \ - cmd_per_lun: 1, \ - unchecked_isa_dma: 1, \ - use_clustering: ENABLE_CLUSTERING} +#define WD7000 { \ + proc_dir: &proc_scsi_wd7000, \ + proc_info: wd7000_proc_info, \ + name: "Western Digital WD-7000", \ + detect: wd7000_detect, \ + command: wd7000_command, \ + queuecommand: wd7000_queuecommand, \ + abort: wd7000_abort, \ + reset: wd7000_reset, \ + bios_param: wd7000_biosparam, \ + can_queue: WD7000_Q, \ + this_id: 7, \ + sg_tablesize: WD7000_SG, \ + cmd_per_lun: 1, \ + unchecked_isa_dma: 1, \ + use_clustering: ENABLE_CLUSTERING, \ + use_new_eh_code: 0 \ +} #endif diff --git a/drivers/sound/dev_table.c b/drivers/sound/dev_table.c index 666d7765ee56..7f3957bfdf98 
100644 --- a/drivers/sound/dev_table.c +++ b/drivers/sound/dev_table.c @@ -148,9 +148,6 @@ void sound_unload_drivers(void) } } - for (i=0;iraw_buf_phys = virt_to_bus(start_addr); for (i = MAP_NR(start_addr); i <= MAP_NR(end_addr); i++) - set_bit(PG_reserved, &mem_map[i].flags);; + set_bit(PG_reserved, &mem_map[i].flags); return 0; } @@ -115,8 +115,6 @@ static void sound_free_dmap(struct dma_buffparms *dmap) if (dmap->raw_buf == NULL) return; - if (dmap->mapping_flags & DMA_MAP_MAPPED) - return; /* Don't free mmapped buffer. Will use it next time */ for (sz = 0, size = PAGE_SIZE; size < dmap->buffsize; sz++, size <<= 1); start_addr = (unsigned long) dmap->raw_buf; @@ -206,6 +204,7 @@ static void close_dmap(struct audio_operations *adev, struct dma_buffparms *dmap dmap->dma_mode = DMODE_NONE; dmap->flags &= ~DMA_BUSY; disable_dma(dmap->dma); + sound_free_dmap(dmap); } @@ -1211,18 +1210,4 @@ unsigned int DMAbuf_poll(struct file * file, int dev, poll_table *wait) return poll_input(file, dev, wait) | poll_output(file, dev, wait); } -void DMAbuf_deinit(int dev) -{ - struct audio_operations *adev = audio_devs[dev]; - /* This routine is called when driver is being unloaded */ - if (!adev) - return; -#ifdef RUNTIME_DMA_ALLOC - sound_free_dmap(adev->dmap_out); - - if (adev->flags & DMA_DUPLEX) - sound_free_dmap(adev->dmap_in); -#endif -} - #endif diff --git a/drivers/sound/sound_calls.h b/drivers/sound/sound_calls.h index aed677c897d9..0ed168e1f319 100644 --- a/drivers/sound/sound_calls.h +++ b/drivers/sound/sound_calls.h @@ -11,7 +11,6 @@ int DMAbuf_start_output(int dev, int buff_no, int l); int DMAbuf_move_wrpointer(int dev, int l); /* int DMAbuf_ioctl(int dev, unsigned int cmd, caddr_t arg, int local); */ void DMAbuf_init(int dev, int dma1, int dma2); -void DMAbuf_deinit(int dev); int DMAbuf_start_dma (int dev, unsigned long physaddr, int count, int dma_mode); int DMAbuf_open_dma (int dev); void DMAbuf_close_dma (int dev); diff --git a/drivers/video/fbcon.c 
b/drivers/video/fbcon.c index 1341d31965ba..872d77928d39 100644 --- a/drivers/video/fbcon.c +++ b/drivers/video/fbcon.c @@ -74,8 +74,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include @@ -196,7 +196,7 @@ static void fbcon_bmove_rec(struct display *p, int sy, int sx, int dy, int dx, static struct display_switch *probe_list(struct display_switch *dispsw, struct display *disp); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD static void request_driver(struct display *disp, int is_accel); #endif static struct display_switch *fbcon_get_driver(struct display *disp); @@ -1368,7 +1368,7 @@ static struct display_switch *probe_list(struct display_switch *dispsw, } -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD static void request_driver(struct display *disp, int is_accel) { char modname[30]; @@ -1400,7 +1400,7 @@ static void request_driver(struct display *disp, int is_accel) len += sprintf(modname+len, "-%d", disp->var.accel); request_module(modname); } -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ static struct display_switch *fbcon_get_driver(struct display *disp) @@ -1410,7 +1410,7 @@ static struct display_switch *fbcon_get_driver(struct display *disp) if (disp->var.accel != FB_ACCEL_NONE) { /* First try an accelerated driver */ dispsw = probe_list(accel_drivers, disp); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!dispsw) { request_driver(disp, 1); dispsw = probe_list(accel_drivers, disp); @@ -1422,7 +1422,7 @@ static struct display_switch *fbcon_get_driver(struct display *disp) /* Then try an unaccelerated driver */ dispsw = probe_list(drivers, disp); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!dispsw) { request_driver(disp, 0); dispsw = probe_list(drivers, disp); diff --git a/fs/devices.c b/fs/devices.c index 9cae0441e712..6a8a60627276 100644 --- a/fs/devices.c +++ b/fs/devices.c @@ -6,6 +6,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall + * 
(changed to kmod) */ #include @@ -16,12 +17,12 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #include -/* serial module kerneld load support */ +/* serial module kmod load support */ struct tty_driver *get_tty_driver(kdev_t device); #define isa_tty_dev(ma) (ma == TTY_MAJOR || ma == TTYAUX_MAJOR) #define need_serial(ma,mi) (get_tty_driver(MKDEV(ma,mi)) == NULL) @@ -74,12 +75,12 @@ static struct file_operations * get_fops( struct file_operations *ret = NULL; if (major < maxdev){ -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD /* * I do get request for device 0. I have no idea why. It happen * at shutdown time for one. Without the following test, the * kernel will happily trigger a request_module() which will - * trigger kerneld and modprobe for nothing (since there + * trigger kmod and modprobe for nothing (since there * is no device with major number == 0. And furthermore * it locks the reboot process :-( * @@ -87,7 +88,7 @@ static struct file_operations * get_fops( * * A. 
Haritsis : fix for serial module * though we need the minor here to check if serial dev, - * we pass only the normal major char dev to kerneld + * we pass only the normal major char dev to kmod * as there is no other loadable dev on these majors */ if ((isa_tty_dev(major) && need_serial(major,minor)) || diff --git a/fs/exec.c b/fs/exec.c index 50df2f60d7b9..7d4b2f3b7b77 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -49,8 +49,9 @@ #include #include -#ifdef CONFIG_KERNELD -#include + +#ifdef CONFIG_KMOD +#include #endif asmlinkage int sys_exit(int exit_code); @@ -693,7 +694,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } if (retval != -ENOEXEC) { break; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD }else{ #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) char modname[20]; diff --git a/fs/fcntl.c b/fs/fcntl.c index 8743916eef9d..5cfeb3658ddc 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -22,18 +23,32 @@ extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg); static inline int dupfd(unsigned int fd, unsigned int arg) { struct files_struct * files = current->files; + struct file * file; + int error; - if (fd >= NR_OPEN || !files->fd[fd]) - return -EBADF; + error = -EINVAL; if (arg >= NR_OPEN) - return -EINVAL; + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + error = -EMFILE; arg = find_next_zero_bit(&files->open_fds, NR_OPEN, arg); if (arg >= current->rlim[RLIMIT_NOFILE].rlim_cur) - return -EMFILE; + goto out_putf; FD_SET(arg, &files->open_fds); FD_CLR(arg, &files->close_on_exec); - (files->fd[arg] = files->fd[fd])->f_count++; - return arg; + fd_install(arg, file); + error = arg; +out: + return error; + +out_putf: + fput(file); + goto out; } asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd) @@ -41,7 +56,7 @@ asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd) int 
err = -EBADF; lock_kernel(); - if (oldfd >= NR_OPEN || !current->files->fd[oldfd]) + if (!fcheck(oldfd)) goto out; err = newfd; if (newfd == oldfd) @@ -51,7 +66,7 @@ asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd) goto out; /* following POSIX.1 6.2.1 */ sys_close(newfd); - err = dupfd(oldfd,newfd); + err = dupfd(oldfd, newfd); out: unlock_kernel(); return err; @@ -62,7 +77,7 @@ asmlinkage int sys_dup(unsigned int fildes) int ret; lock_kernel(); - ret = dupfd(fildes,0); + ret = dupfd(fildes, 0); unlock_kernel(); return ret; } @@ -101,12 +116,13 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) long err = -EBADF; lock_kernel(); - if (fd >= NR_OPEN || !(filp = current->files->fd[fd])) + filp = fget(fd); + if (!filp) goto out; err = 0; switch (cmd) { case F_DUPFD: - err = dupfd(fd,arg); + err = dupfd(fd, arg); break; case F_GETFD: err = FD_ISSET(fd, ¤t->files->close_on_exec); @@ -158,6 +174,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } + fput(filp); out: unlock_kernel(); return err; diff --git a/fs/filesystems.c b/fs/filesystems.c index 996b42279823..602a077ebecd 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -29,8 +29,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include #include @@ -187,12 +187,12 @@ asmlinkage sys_nfsservctl(int cmd, void *argp, void *resp) ret = do_nfsservctl(cmd, argp, resp); goto out; } -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (request_module ("nfsd") == 0) { if (do_nfsservctl) ret = do_nfsservctl(cmd, argp, resp); } -#endif /* CONFIG_KERNELD */ +#endif /* CONFIG_KMOD */ out: unlock_kernel(); return ret; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 8424b2ec7b3a..bec1c55a23f1 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -112,7 +112,7 @@ static int ncp_hash_dentry(struct dentry *, struct qstr *); static int ncp_compare_dentry (struct dentry *, struct qstr 
*, struct qstr *); static void ncp_delete_dentry(struct dentry *); -static struct dentry_operations ncp_dentry_operations = +struct dentry_operations ncp_dentry_operations = { ncp_lookup_validate, /* d_validate(struct dentry *) */ ncp_hash_dentry, /* d_hash */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index f11f4640c4d4..b535e30e7d42 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -25,9 +25,6 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include -#endif #include #include "ncplib_kernel.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8300fee67030..2de790e42fcc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -650,10 +650,31 @@ _nfs_revalidate_inode(struct nfs_server *server, struct dentry *dentry) inode->i_ino); status = nfs_proc_getattr(server, NFS_FH(dentry), &fattr); if (status) { + int error; + u32 *fh; + struct nfs_fh fhandle; #ifdef NFS_PARANOIA printk("nfs_revalidate_inode: %s/%s getattr failed, ino=%ld, error=%d\n", dentry->d_parent->d_name.name, dentry->d_name.name, inode->i_ino, status); #endif + if (status != -ESTALE) + goto out; + /* + * A "stale filehandle" error ... show the current fh + * and find out what the filehandle should be. 
+ */ + fh = (u32 *) NFS_FH(dentry); + printk("NFS: bad fh %08x%08x%08x%08x%08x%08x%08x%08x\n", + fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7]); + error = nfs_proc_lookup(server, NFS_FH(dentry->d_parent), + dentry->d_name.name, &fhandle, &fattr); + if (error) { + printk("NFS: lookup failed, error=%d\n", error); + goto out; + } + fh = (u32 *) &fhandle; + printk(" %08x%08x%08x%08x%08x%08x%08x%08x\n", + fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7]); goto out; } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 216aafb80882..1c6a74a71799 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -22,6 +22,9 @@ #include #include +/* Uncomment this to support servers requiring longword lengths */ +#define NFS_PAD_WRITES 1 + #define NFSDBG_FACILITY NFSDBG_XDR /* #define NFS_PARANOIA 1 */ @@ -181,7 +184,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args) /* * Arguments to a READ call. Since we read data directly into the page * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page wewant to fetch. + * exactly to the page we want to fetch. */ static int nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args) @@ -258,18 +261,38 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) static int nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) { + u32 count = args->count; + p = xdr_encode_fhandle(p, args->fh); *p++ = htonl(args->offset); *p++ = htonl(args->offset); - *p++ = htonl(args->count); - *p++ = htonl(args->count); + *p++ = htonl(count); + *p++ = htonl(count); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); req->rq_svec[1].iov_base = (void *) args->buffer; - req->rq_svec[1].iov_len = args->count; - req->rq_slen += args->count; + req->rq_svec[1].iov_len = count; + req->rq_slen += count; req->rq_snr = 2; +#ifdef NFS_PAD_WRITES + /* + * Some old servers require that the message length + * be a multiple of 4, so we pad it here if needed. 
+ */ + count = ((count + 3) & ~3) - count; + if (count) { +#if 0 +printk("nfs_writeargs: padding write, len=%d, slen=%d, pad=%d\n", +req->rq_svec[1].iov_len, req->rq_slen, count); +#endif + req->rq_svec[2].iov_base = (void *) "\0\0\0"; + req->rq_svec[2].iov_len = count; + req->rq_slen += count; + req->rq_snr = 3; + } +#endif + return 0; } @@ -334,12 +357,21 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args) static int nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_task *task = req->rq_task; + struct rpc_auth *auth = task->tk_auth; + u32 bufsiz = args->bufsiz; int replen; + /* + * Some servers (e.g. HP OS 9.5) seem to expect the buffer size + * to be in longwords ... check whether to convert the size. + */ + if (task->tk_client->cl_flags & NFS_CLNTF_BUFSIZE) + bufsiz = bufsiz >> 2; + p = xdr_encode_fhandle(p, args->fh); *p++ = htonl(args->cookie); - *p++ = htonl(args->bufsiz); + *p++ = htonl(bufsiz); /* see above */ req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); /* set up reply iovec */ @@ -380,10 +412,9 @@ static int nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs_readdirres *res) { struct iovec *iov = req->rq_rvec; - int status, nr, len; + int status, nr; char *string, *start; - u32 *end; - __u32 fileid, cookie, *entry; + u32 *end, *entry, len, fileid, cookie; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); @@ -398,17 +429,25 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs_readdirres *res) end = (u32 *) ((u8 *) p + iov[1].iov_len); /* Get start and end of dirent buffer */ - entry = (__u32 *) res->buffer; + entry = (u32 *) res->buffer; start = (char *) res->buffer; string = (char *) res->buffer + res->bufsiz; for (nr = 0; *p++; nr++) { fileid = ntohl(*p++); len = ntohl(*p++); + /* + * Check whether the server has exceeded our reply buffer, + * and set a flag to convert the size to longwords. 
+ */ if ((p + QUADLEN(len) + 3) > end) { - printk(KERN_WARNING "NFS: short readdir reply! " - "nr=%d, slots=%d, len=%d\n", + struct rpc_clnt *clnt = req->rq_task->tk_client; + printk(KERN_WARNING + "NFS: server %s, readdir reply truncated\n", + clnt->cl_server); + printk(KERN_WARNING "NFS: nr=%d, slots=%d, len=%d\n", nr, (end - p), len); + clnt->cl_flags |= NFS_CLNTF_BUFSIZE; break; } if (len > NFS_MAXNAMLEN) { diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 33e6dfd26fe2..afc219838736 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -12,8 +12,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include @@ -205,7 +205,7 @@ struct nls_table *find_nls(char *charset) struct nls_table *load_nls(char *charset) { struct nls_table *nls; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD char buf[40]; int ret; #endif @@ -216,7 +216,7 @@ struct nls_table *load_nls(char *charset) return nls; } -#ifndef CONFIG_KERNELD +#ifndef CONFIG_KMOD return NULL; #else if (strlen(charset) > sizeof(buf) - sizeof("nls_")) { diff --git a/fs/ntfs/fs.c b/fs/ntfs/fs.c index aa6a7c40cb3f..d190b21e4a7c 100644 --- a/fs/ntfs/fs.c +++ b/fs/ntfs/fs.c @@ -776,13 +776,13 @@ struct super_block * ntfs_read_super(struct super_block *sb, struct buffer_head *bh; int i; - /* When the driver is compiled as a module, kerneld must know when it + /* When the driver is compiled as a module, kmod must know when it * can safely remove it from memory. To do this, each module owns a * reference counter. */ MOD_INC_USE_COUNT; /* Don't put ntfs_debug() before MOD_INC_USE_COUNT, printk() can block - * so this could lead to a race condition with kerneld. + * so this could lead to a race condition with kmod. 
*/ ntfs_debug(DEBUG_OTHER, "ntfs_read_super\n"); @@ -939,7 +939,7 @@ __initfunc(int init_ntfs_fs(void)) #ifdef MODULE /* A module is a piece of code which can be inserted in and removed * from the running kernel whenever you want using lsmod, or on demand using - * kerneld + * kmod */ /* No function of this module is needed by another module */ @@ -956,7 +956,7 @@ MODULE_PARM_DESC(ntdebug, "Debug level"); /* When this code is compiled as a module, if you use mount -t ntfs when no * ntfs filesystem is registered (see /proc/filesystems), get_fs_type() in - * fs/super.c asks kerneld to load the module named ntfs in memory. + * fs/super.c asks kmod to load the module named ntfs in memory. * * Therefore, this function is the main entry point in this case */ @@ -965,7 +965,7 @@ int init_module(void) return init_ntfs_fs(); } -/* Called by kerneld just before the kernel removes the module from memory */ +/* Called by kmod just before the kernel removes the module from memory */ void cleanup_module(void) { SYSCTL(0); diff --git a/fs/open.c b/fs/open.c index 5b0ff99244f8..204294cc3380 100644 --- a/fs/open.c +++ b/fs/open.c @@ -681,24 +681,37 @@ out: } /* - * Find an empty file descriptor entry, and mark it busy + * Find an empty file descriptor entry, and mark it busy. */ int get_unused_fd(void) { - int fd; struct files_struct * files = current->files; + int fd, error; + error = -EMFILE; fd = find_first_zero_bit(&files->open_fds, NR_OPEN); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. 
*/ - if (fd < current->rlim[RLIMIT_NOFILE].rlim_cur) { - FD_SET(fd, &files->open_fds); - FD_CLR(fd, &files->close_on_exec); - return fd; + if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + goto out; + + /* Check here for fd > files->max_fds to do dynamic expansion */ + + FD_SET(fd, &files->open_fds); + FD_CLR(fd, &files->close_on_exec); +#if 1 + /* Sanity check */ + if (files->fd[fd] != NULL) { + printk("get_unused_fd: slot %d not NULL!\n", fd); + files->fd[fd] = NULL; } - return -EMFILE; +#endif + error = fd; + +out: + return error; } inline void put_unused_fd(unsigned int fd) @@ -796,15 +809,15 @@ asmlinkage int sys_close(unsigned int fd) { int error; struct file * filp; - struct files_struct * files; lock_kernel(); - files = current->files; error = -EBADF; - if (fd < NR_OPEN && (filp = files->fd[fd]) != NULL) { + filp = fcheck(fd); + if (filp) { + struct files_struct * files = current->files; + files->fd[fd] = NULL; put_unused_fd(fd); FD_CLR(fd, &files->close_on_exec); - files->fd[fd] = NULL; error = close_fp(filp, files); } unlock_kernel(); diff --git a/fs/proc/root.c b/fs/proc/root.c index 3e344bd09777..124e15e152a5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -14,8 +14,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif /* @@ -234,7 +234,7 @@ proc_openprom_deregister(void) } #endif -#if defined(CONFIG_SUN_OPENPROMFS_MODULE) && defined(CONFIG_KERNELD) +#if defined(CONFIG_SUN_OPENPROMFS_MODULE) && defined(CONFIG_KMOD) static int proc_openprom_defreaddir(struct inode * inode, struct file * filp, void * dirent, filldir_t filldir) diff --git a/fs/super.c b/fs/super.c index 84ef3ffb8b2c..50a6cb9a6270 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,14 +37,14 @@ #include #include -#ifdef CONFIG_KERNELD -#include -#endif - #include #include #include +#ifdef CONFIG_KMOD +#include +#endif + /* * We use a semaphore to synchronize all mount/umount * activity - imagine the mess if we have a race between @@ -405,7 
+405,7 @@ struct file_system_type *get_fs_type(const char *name) return fs; for (fs = file_systems; fs && strcmp(fs->name, name); fs = fs->next) ; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (!fs && (request_module(name) == 0)) { for (fs = file_systems; fs && strcmp(fs->name, name); fs = fs->next) ; diff --git a/include/linux/file.h b/include/linux/file.h index 3f3870b9ed84..240a5039c89e 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -1,19 +1,41 @@ +/* + * Wrapper functions for accessing the file_struct fd array. + */ + #ifndef __LINUX_FILE_H #define __LINUX_FILE_H -extern inline struct file * fget(unsigned long fd) +extern int __fput(struct file *); +extern void insert_file_free(struct file *file); + +/* + * Check whether the specified fd has an open file. + */ +extern inline struct file * fcheck(unsigned int fd) { struct file * file = NULL; - if (fd < NR_OPEN) { + + if (fd < NR_OPEN) file = current->files->fd[fd]; - if (file) - file->f_count++; - } return file; } -extern int __fput(struct file *); -extern void insert_file_free(struct file *file); +extern inline struct file * fget(unsigned int fd) +{ + struct file * file = fcheck(fd); + + if (file) + file->f_count++; + return file; +} + +/* + * Install a file pointer in the fd array. + */ +extern inline void fd_install(unsigned int fd, struct file *file) +{ + current->files->fd[fd] = file; +} /* It does not matter which list it is on. */ extern inline void remove_filp(struct file *file) @@ -47,12 +69,4 @@ extern inline void put_filp(struct file *file) } } -/* - * Install a file pointer in the files structure. 
- */ -extern inline void fd_install(unsigned long fd, struct file *file) -{ - current->files->fd[fd] = file; -} - #endif diff --git a/include/linux/kerneld.h b/include/linux/kerneld.h deleted file mode 100644 index b2db5f8c7cf9..000000000000 --- a/include/linux/kerneld.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef _LINUX_KERNELD_H -#define _LINUX_KERNELD_H - -#define KERNELD_SYSTEM 1 -#define KERNELD_REQUEST_MODULE 2 /* "insmod" */ -#define KERNELD_RELEASE_MODULE 3 /* "rmmod" */ -#define KERNELD_DELAYED_RELEASE_MODULE 4 /* "rmmod" */ -#define KERNELD_CANCEL_RELEASE_MODULE 5 /* "rmmod" */ -#define KERNELD_REQUEST_ROUTE 6 /* from net/ipv4/route.c */ -#define KERNELD_BLANKER 7 /* from drivers/char/console.c */ -#define KERNELD_PNP 8 /* from drivers/pnp/kerneld.c */ -#define KERNELD_ARP 256 /* from net/ipv4/arp.c */ - -/* - * Uncomment the following line for the new kerneld protocol - * This includes the pid of the kernel level requester into the kerneld header - */ -/* -#define NEW_KERNELD_PROTOCOL - */ -#ifdef NEW_KERNELD_PROTOCOL -#define OLDIPC_KERNELD 00040000 /* use the kerneld message channel */ -#define IPC_KERNELD 00140000 /* use the kerneld message channel, new protocol */ -#define KDHDR (sizeof(long) + sizeof(short) + sizeof(short)) -#define NULL_KDHDR 0, 2, 0 -#else -#define IPC_KERNELD 00040000 /* use the kerneld message channel */ -#define KDHDR (sizeof(long)) -#define NULL_KDHDR 0 -#endif -#define KERNELD_MAXCMD 0x7ffeffff -#define KERNELD_MINSEQ 0x7fff0000 /* "commands" legal up to 0x7ffeffff */ -#define KERNELD_WAIT 0x80000000 -#define KERNELD_NOWAIT 0 - -struct kerneld_msg { - long mtype; - long id; -#ifdef NEW_KERNELD_PROTOCOL - short version; - short pid; -#endif -#ifdef __KERNEL__ - char *text; -#else - char text[1]; -#endif /* __KERNEL__ */ -}; - -#ifdef __KERNEL__ -#include - -extern int kerneld_send(int msgtype, int ret_size, int msgsz, - const char *text, const char *ret_val); - -/* - * Request that a module should be loaded. 
- * Wait for the exit status from insmod/modprobe. - * If it fails, it fails... at least we tried... - */ -static inline int request_module(const char *name) -{ - return kerneld_send(KERNELD_REQUEST_MODULE, - 0 | KERNELD_WAIT, - strlen(name), name, NULL); -} - -/* - * Request the removal of a module, maybe don't wait for it. - * It doesn't matter if the removal fails, now does it? - */ -static inline int release_module(const char *name, int waitflag) -{ - return kerneld_send(KERNELD_RELEASE_MODULE, - 0 | (waitflag?KERNELD_WAIT:KERNELD_NOWAIT), - strlen(name), name, NULL); -} - -/* - * Request a delayed removal of a module, but don't wait for it. - * The delay is done by kerneld (default: 60 seconds) - */ -static inline int delayed_release_module(const char *name) -{ - return kerneld_send(KERNELD_DELAYED_RELEASE_MODULE, - 0 | KERNELD_NOWAIT, - strlen(name), name, NULL); -} - -/* - * Attempt to cancel a previous request for removal of a module, - * but don't wait for it. - * This call can be made if the kernel wants to prevent a delayed - * unloading of a module. - */ -static inline int cancel_release_module(const char *name) -{ - return kerneld_send(KERNELD_CANCEL_RELEASE_MODULE, - 0 | KERNELD_NOWAIT, - strlen(name), name, NULL); -} - -/* - * Perform an "inverted" system call, maybe return the exit status - */ -static inline int ksystem(const char *cmd, int waitflag) -{ - return kerneld_send(KERNELD_SYSTEM, - 0 | (waitflag?KERNELD_WAIT:KERNELD_NOWAIT), - strlen(cmd), cmd, NULL); -} - -/* - * Try to create a route, possibly by opening a ppp-connection - */ -static inline int kerneld_route(const char *ip_route) -{ - return kerneld_send(KERNELD_REQUEST_ROUTE, - 0 | KERNELD_WAIT, - strlen(ip_route), ip_route, NULL); -} - -/* - * Handle an external screen blanker - */ -static inline int kerneld_blanker(int on_off) -{ - char *s = on_off ? "on" : "off"; - return kerneld_send(KERNELD_BLANKER, - 0 | (on_off ? 
KERNELD_NOWAIT : KERNELD_WAIT), - strlen(s), s, NULL); -} - -#endif /* __KERNEL__ */ -#endif diff --git a/include/linux/kmod.h b/include/linux/kmod.h new file mode 100644 index 000000000000..876c7f222a4a --- /dev/null +++ b/include/linux/kmod.h @@ -0,0 +1,4 @@ +/* + kmod header +*/ +extern int request_module(const char * name); diff --git a/include/linux/module.h b/include/linux/module.h index 475c6885403a..ad3d10baf6cb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -148,7 +148,7 @@ const char __module_author[] __attribute__((section(".modinfo"))) = \ const char __module_description[] __attribute__((section(".modinfo"))) = \ "description=" desc -/* Could potentially be used by kerneld... */ +/* Could potentially be used by kmod... */ #define MODULE_SUPPORTED_DEVICE(dev) \ const char __module_device[] __attribute__((section(".modinfo"))) = \ diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 55193867d55e..b57519b72895 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -217,7 +217,7 @@ extern int pim_rcv(struct sk_buff * , unsigned short); extern int pim_rcv_v1(struct sk_buff * , unsigned short len); struct rtmsg; -extern int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm); +extern int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait); #endif #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 72430508a0b0..d1c005c7092e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -125,6 +125,9 @@ struct net_device_stats unsigned long tx_heartbeat_errors; unsigned long tx_window_errors; + /* for cslip etc */ + unsigned long rx_compressed; + unsigned long tx_compressed; }; #ifdef CONFIG_NET_FASTROUTE @@ -352,6 +355,7 @@ extern __inline__ int unregister_gifconf(unsigned int family) #define HAVE_NETIF_RX 1 extern void netif_rx(struct sk_buff *skb); extern void net_bh(void); +extern void dev_tint(struct device *dev); extern int dev_get_info(char *buffer, 
char **start, off_t offset, int length, int dummy); extern int dev_ioctl(unsigned int cmd, void *); extern int dev_change_flags(struct device *, unsigned); @@ -423,7 +427,7 @@ extern int dev_mc_add(struct device *dev, void *addr, int alen, int newonly); extern void dev_mc_discard(struct device *dev); extern void dev_set_promiscuity(struct device *dev, int inc); extern void dev_set_allmulti(struct device *dev, int inc); -/* Load a device via the kerneld */ +/* Load a device via the kmod */ extern void dev_load(const char *name); extern void dev_mcast_init(void); extern int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a7b51b977c4b..b72ad4ed14db 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -85,6 +85,9 @@ do { \ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* Flags in the RPC client structure */ +#define NFS_CLNTF_BUFSIZE 0x0001 /* readdir buffer in longwords */ + #ifdef __KERNEL__ /* diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4a309eb91586..8c6467010023 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -66,7 +66,7 @@ struct rtattr #define RTA_ALIGNTO 4 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) -#define RTA_OK(rta,len) ((rta)->rta_len > sizeof(struct rtattr) && \ +#define RTA_OK(rta,len) ((rta)->rta_len >= sizeof(struct rtattr) && \ (rta)->rta_len <= (len)) #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b290572efe..a42eb4feaff1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -86,6 +86,12 @@ extern int last_pid; #define SCHED_FIFO 1 #define SCHED_RR 2 +/* + * This is an additional bit set when we want to + * yield the CPU for one re-schedule.. 
+ */ +#define SCHED_YIELD 0x10 + struct sched_param { int sched_priority; }; @@ -113,19 +119,24 @@ extern void trap_init(void); asmlinkage void schedule(void); -/* Open file table structure */ + +/* + * Open file table structure + */ struct files_struct { int count; + int max_fds; + struct file ** fd; /* current fd array */ fd_set close_on_exec; fd_set open_fds; - struct file * fd[NR_OPEN]; }; #define INIT_FILES { \ 1, \ + NR_OPEN, \ + &init_fd_array[0], \ { { 0, } }, \ - { { 0, } }, \ - { NULL, } \ + { { 0, } } \ } struct fs_struct { @@ -571,19 +582,6 @@ extern void exit_sighand(struct task_struct *); extern int do_execve(char *, char **, char **, struct pt_regs *); extern int do_fork(unsigned long, unsigned long, struct pt_regs *); -/* See if we have a valid user level fd. - * If it makes sense, return the file structure it references. - * Otherwise return NULL. - */ -extern inline struct file *file_from_fd(const unsigned int fd) -{ - - if (fd >= NR_OPEN) - return NULL; - /* either valid or null */ - return current->files->fd[fd]; -} - /* * The wait-queues are circular lists, and you have to be *very* sure * to keep them correct. Use only these two functions to add/remove diff --git a/include/linux/socket.h b/include/linux/socket.h index 160fc1063362..7b2ed215ce23 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -149,6 +149,7 @@ struct ucred { #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ +#define AF_ASH 18 /* Ash */ #define AF_MAX 32 /* For now.. */ /* Protocol families, same as address families. 
*/ @@ -172,6 +173,7 @@ struct ucred { #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET +#define PF_ASH AF_ASH #define PF_MAX AF_MAX diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 60fb2d74f1ee..da2b2cdd19dc 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -30,6 +30,7 @@ struct rpc_portmap { * The high-level client handle */ struct rpc_clnt { + unsigned int cl_users; /* number of references */ struct rpc_xprt * cl_xprt; /* transport */ struct rpc_procinfo * cl_procinfo; /* procedure info */ u32 cl_maxproc; /* max procedure number */ @@ -37,7 +38,6 @@ struct rpc_clnt { char * cl_server; /* server machine name */ char * cl_protname; /* protocol name */ struct rpc_auth * cl_auth; /* authenticator */ - struct rpc_portmap cl_pmap; /* port mapping */ struct rpc_stat * cl_stats; /* statistics */ unsigned int cl_softrtry : 1,/* soft timeouts */ @@ -47,10 +47,11 @@ struct rpc_clnt { cl_binding : 1,/* doing a getport() */ cl_oneshot : 1,/* dispose after use */ cl_dead : 1;/* abandoned */ + unsigned int cl_flags; /* misc client flags */ unsigned long cl_hardmax; /* max hard timeout */ + struct rpc_portmap cl_pmap; /* port mapping */ struct rpc_wait_queue cl_bindwait; /* waiting on getport() */ - unsigned int cl_users; /* number of references */ }; #define cl_timeout cl_xprt->timeout #define cl_prog cl_pmap.pm_prog diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 865bdd1dd427..dd1ed43e8000 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -70,7 +70,9 @@ enum KERN_PRINTK, /* sturct: control printk logging parameters */ KERN_NAMETRANS, /* Name translation */ KERN_STATINODE, - KERN_DENTRY /* dentry statistics */ + KERN_DENTRY, /* dentry statistics */ + KERN_MODPROBE, + KERN_KMOD_UNLOAD_DELAY }; @@ -118,6 +120,7 @@ enum NET_CORE_FASTROUTE, NET_CORE_MSG_COST, NET_CORE_MSG_BURST, + NET_CORE_OPTMEM_MAX, }; /* /proc/sys/net/ethernet */ diff --git 
a/include/net/dst.h b/include/net/dst.h index b879bb05901c..dc4b8ce670a4 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -120,6 +120,8 @@ extern void dst_destroy(struct dst_entry * dst); extern __inline__ void dst_free(struct dst_entry * dst) { + if (dst->obsolete > 0) + return; if (!atomic_read(&dst->use)) { dst_destroy(dst); return; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 42233aadf182..863037b2355c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,7 +114,7 @@ extern __inline__ void ip6_dst_store(struct sock *sk, struct dst_entry *dst) struct rt6_info *rt; np = &sk->net_pinfo.af_inet6; - sk->dst_cache = dst; + dst_release(xchg(&sk->dst_cache,dst)); rt = (struct rt6_info *) dst; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b6055ae44b66..1a322a49813f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -4,7 +4,7 @@ * Authors: * Pedro Roque * - * $Id: ipv6.h,v 1.8 1997/12/29 19:52:09 kuznet Exp $ + * $Id: ipv6.h,v 1.9 1998/03/08 05:55:20 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/net/route.h b/include/net/route.h index 338e158fd99d..624fd233af3a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -102,6 +102,7 @@ extern unsigned inet_addr_type(u32 addr); extern void ip_rt_multicast_event(struct in_device *); extern int ip_rt_ioctl(unsigned int cmd, void *arg); extern void ip_rt_get_source(u8 *src, struct rtable *rt); +extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); extern __inline__ void ip_rt_put(struct rtable * rt) diff --git a/include/net/sock.h b/include/net/sock.h index c225a00150d6..8ad6a22cc64c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -191,36 +191,73 @@ struct raw_opt { struct tcp_opt { + /* TCP bind bucket hash linkage. 
*/ + struct sock *bind_next; + struct sock **bind_pprev; + int tcp_header_len; /* Bytes of tcp header to send */ + +/* + * Header prediction flags + * 0x5?10 << 16 + snd_wnd in net byte order + */ + __u32 pred_flags; + /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ __u32 rcv_nxt; /* What we want to receive next */ - __u32 rcv_up; /* The urgent point (may not be valid) */ - __u32 rcv_wnd; /* Current receiver window */ __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ - __u32 snd_up; /* Outgoing urgent pointer */ - __u32 snd_wl1; /* Sequence for window update */ - __u32 snd_wl2; /* Ack sequence for update */ + __u32 rcv_tstamp; /* timestamp of last received packet */ + __u32 lrcvtime; /* timestamp of last received data packet*/ + __u32 srtt; /* smoothed round trip time << 3 */ - __u32 rcv_wup; /* rcv_nxt on last window update sent */ + __u32 ato; /* delayed ack timeout */ + __u32 snd_wl1; /* Sequence for window update */ - __u32 fin_seq; /* XXX This one should go, we don't need it. 
-DaveM */ + __u32 snd_wl2; /* Ack sequence for update */ + __u32 snd_wnd; /* The window we expect to receive */ + __u16 max_window; + __u8 pending; /* pending events */ + __u8 retransmits; + __u32 last_ack_sent; /* last ack we sent */ - __u32 srtt; /* smothed round trip time << 3 */ + __u32 backoff; /* backoff */ __u32 mdev; /* medium deviation */ + __u32 snd_cwnd; /* Sending congestion window */ __u32 rto; /* retransmit timeout */ - __u32 backoff; /* backoff */ + + __u32 packets_out; /* Packets which are "in flight" */ + __u32 high_seq; /* highest sequence number sent by onset of congestion */ /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ - __u32 snd_cwnd; /* Sending congestion window */ __u32 snd_ssthresh; /* Slow start size threshold */ __u16 snd_cwnd_cnt; - __u16 max_window; + __u8 dup_acks; /* Consequetive duplicate acks seen from other end */ + __u8 delayed_acks; + + /* Two commonly used timers in both sender and receiver paths. */ + struct timer_list retransmit_timer; /* Resend (no ack) */ + struct timer_list delack_timer; /* Ack delay */ + + struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ + + struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ + struct sk_buff *send_head; /* Front of stuff to transmit */ + struct sk_buff *retrans_head; /* retrans head can be + * different to the head of + * write queue if we are doing + * fast retransmit + */ + __u32 rcv_wnd; /* Current receiver window */ + __u32 rcv_wup; /* rcv_nxt on last window update sent */ + __u32 write_seq; + __u32 copied_seq; /* * Options received (usually on last packet, some only on SYN packets). 
*/ @@ -235,60 +272,23 @@ struct tcp_opt __u32 rcv_tsecr; /* Time stamp echo reply */ __u32 ts_recent; /* Time stamp to echo next */ __u32 ts_recent_stamp;/* Time we stored ts_recent (for aging) */ - __u32 last_ack_sent; /* last ack we sent */ int sacks; /* Number of SACK blocks if any */ __u32 left_sack[4]; /* Left edges of blocks */ __u32 right_sack[4]; /* Right edges of blocks */ - int tcp_header_len; /* Bytes of tcp header to send */ -/* - * Timers used by the TCP protocol layer - */ - struct timer_list delack_timer; /* Ack delay */ - struct timer_list idle_timer; /* Idle watch */ - struct timer_list completion_timer; /* Up/Down timer */ struct timer_list probe_timer; /* Probes */ - struct timer_list retransmit_timer; /* Resend (no ack) */ - - __u32 basertt; /* Vegas baseRTT */ - __u32 packets_out; /* Packets which are "in flight" */ - __u32 window_clamp; /* XXX Document this... -DaveM */ - - __u8 pending; /* pending events */ - __u8 delayed_acks; - __u8 dup_acks; /* Consequetive duplicate acks seen from other end */ - __u8 retransmits; - - __u32 lrcvtime; /* timestamp of last received data packet */ - __u32 rcv_tstamp; /* timestamp of last received packet */ - __u32 iat_mdev; /* interarrival time medium deviation */ - __u32 iat; /* interarrival time */ - __u32 ato; /* delayed ack timeout */ - __u32 high_seq; /* highest sequence number sent by onset of congestion */ - -/* - * new send pointers - */ - struct sk_buff * send_head; - struct sk_buff * retrans_head; /* retrans head can be - * different to the head of - * write queue if we are doing - * fast retransmit - */ -/* - * Header prediction flags - * 0x5?10 << 16 + snd_wnd in net byte order - */ - __u32 pred_flags; - __u32 snd_wnd; /* The window we expect to receive */ - - __u32 probes_out; /* unanswered 0 window probes */ + __u32 basertt; /* Vegas baseRTT */ + __u32 window_clamp; /* XXX Document this... 
-DaveM */ + __u32 probes_out; /* unanswered 0 window probes */ + __u32 syn_seq; + __u32 fin_seq; + __u32 urg_seq; + __u32 urg_data; struct open_request *syn_wait_queue; struct open_request **syn_wait_last; int syn_backlog; - struct tcp_func *af_specific; }; @@ -347,73 +347,69 @@ struct sock struct sock *sklist_next; struct sock *sklist_prev; - atomic_t wmem_alloc; - atomic_t rmem_alloc; - unsigned long allocation; /* Allocation mode */ + /* Main hash linkage for various protocol lookup tables. */ + struct sock *next; + struct sock **pprev; - /* The following stuff should probably move to the tcp private area */ - __u32 write_seq; - __u32 copied_seq; - __u32 syn_seq; - __u32 urg_seq; - __u32 urg_data; - unsigned char delayed_acks; - /* End of block to move */ + /* Socket demultiplex comparisons on incoming packets. */ + __u32 daddr; /* Foreign IPv4 addr */ + __u32 rcv_saddr; /* Bound local IPv4 addr */ + int bound_dev_if; /* Bound device index if != 0 */ + unsigned short num; /* Local port */ + volatile unsigned char state, /* Connection state */ + zapped; /* In ax25 & ipx means not linked */ + struct tcphdr dummy_th; /* TCP header template */ - int sock_readers; /* user count */ + int sock_readers; /* user count */ + int rcvbuf; + + struct wait_queue **sleep; + struct dst_entry *dst_cache; /* Destination cache */ + atomic_t rmem_alloc; /* Receive queue bytes committed */ + struct sk_buff_head receive_queue; /* Incoming packets */ + atomic_t wmem_alloc; /* Transmit queue bytes committed */ + struct sk_buff_head write_queue; /* Packet sending queue */ + atomic_t omem_alloc; /* "o" is "option" or "other" */ + __u32 saddr; /* Sending source */ + unsigned int allocation; /* Allocation mode */ + int sndbuf; + struct sock *prev; /* * Not all are volatile, but some are, so we * might as well say they all are. 
*/ volatile char dead, - urginline, done, + urginline, reuse, keepopen, linger, destroy, no_check, - zapped, /* In ax25 & ipx means not linked */ broadcast, nonagle, bsdism; - int bound_dev_if; - unsigned long lingertime; + unsigned char debug; int proc; + unsigned long lingertime; - struct sock *next; - struct sock **pprev; - struct sock *bind_next; - struct sock **bind_pprev; - struct sock *prev; int hashent; struct sock *pair; - struct sk_buff_head back_log; - - struct sk_buff_head write_queue, - receive_queue, - out_of_order_queue, + /* Error and backlog packet queues, rarely used. */ + struct sk_buff_head back_log, error_queue; unsigned short family; struct proto *prot; - struct wait_queue **sleep; - - __u32 daddr; - __u32 saddr; /* Sending source */ - __u32 rcv_saddr; /* Bound address */ - struct dst_entry *dst_cache; /* * mss is min(mtu, max_window) */ unsigned short mtu; /* mss negotiated in the syn's */ unsigned short mss; /* current eff. mss - can change */ unsigned short user_mss; /* mss requested by user in ioctl */ - unsigned short num; - unsigned short shutdown; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -436,16 +432,12 @@ struct sock cause failure but are the cause of a persistent failure not just 'timed out' */ - unsigned char protocol; - volatile unsigned char state; unsigned short ack_backlog; unsigned short max_ack_backlog; - unsigned char debug; __u32 priority; - int rcvbuf; - int sndbuf; unsigned short type; unsigned char localroute; /* Route locally only */ + unsigned char protocol; struct ucred peercred; #ifdef CONFIG_FILTER @@ -472,11 +464,6 @@ struct sock #if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) struct packet_opt *af_packet; #endif -#ifdef CONFIG_INET -#ifdef CONFIG_NUTCP - struct tcp_opt af_tcp; -#endif -#endif #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) x25_cb *x25; #endif @@ -503,7 +490,6 @@ struct sock int ip_ttl; /* TTL setting */ int ip_tos; /* TOS */ unsigned ip_cmsg_flags; - struct 
tcphdr dummy_th; struct ip_options *opt; unsigned char ip_hdrincl; /* Include headers ? */ __u8 ip_mc_ttl; /* Multicasting TTL */ @@ -731,7 +717,7 @@ here: } /* - * This might not be the most apropriate place for this two + * This might not be the most appropriate place for this two * but since they are used by a lot of the net related code * at least they get declared on a include that is common to all */ @@ -750,7 +736,7 @@ static __inline__ int max(unsigned int a, unsigned int b) return a; } -extern struct sock * sk_alloc(int family, int priority); +extern struct sock * sk_alloc(int family, int priority, int zero_it); extern void sk_free(struct sock *sk); extern void destroy_sock(struct sock *sk); @@ -884,7 +870,6 @@ extern __inline__ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) return -ENOMEM; - skb_set_owner_r(skb, sk); #ifdef CONFIG_FILTER if (sk->filter) @@ -894,7 +879,8 @@ extern __inline__ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } #endif /* CONFIG_FILTER */ - skb_queue_tail(&sk->receive_queue,skb); + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->receive_queue, skb); if (!sk->dead) sk->data_ready(sk,skb->len); return 0; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4c445ca1af04..cf0a2ee5ae4d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,13 +27,13 @@ * New scheme, half the table is for TIME_WAIT, the other half is * for the rest. I'll experiment with dynamic table growth later. */ -#define TCP_HTABLE_SIZE 1024 +#define TCP_HTABLE_SIZE 512 /* This is for listening sockets, thus all sockets which possess wildcards. */ #define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ /* This is for all sockets, to keep track of the local port allocations. 
*/ -#define TCP_BHTABLE_SIZE 64 +#define TCP_BHTABLE_SIZE 512 /* tcp_ipv4.c: These need to be shared by v4 and v6 because the lookup * and hashing code needs to work with different AF's yet @@ -41,47 +41,118 @@ */ extern struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; extern struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; -extern struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; -/* tcp_ipv4.c: These sysctl variables need to be shared between v4 and v6 - * because the v6 tcp code to intialize a connection needs to interoperate - * with the v4 code using the same variables. - * FIXME: It would be better to rewrite the connection code to be - * address family independent and just leave one copy in the ipv4 section. - * This would also clean up some code duplication. -- erics +/* There are a few simple rules, which allow for local port reuse by + * an application. In essence: + * + * 1) Sockets bound to different interfaces may share a local port. + * Failing that, goto test 2. + * 2) If all sockets have sk->reuse set, and none of them are in + * TCP_LISTEN state, the port may be shared. + * Failing that, goto test 3. + * 3) If all sockets are bound to a specific sk->rcv_saddr local + * address, and none of them are the same, the port may be + * shared. + * Failing this, the port cannot be shared. + * + * The interesting point, is test #2. This is what an FTP server does + * all day. To optimize this case we use a specific flag bit defined + * below. As we add sockets to a bind bucket list, we perform a + * check of: (newsk->reuse && (newsk->state != TCP_LISTEN)) + * As long as all sockets added to a bind bucket pass this test, + * the flag bit will be set. + * The resulting situation is that tcp_v[46]_verify_bind() can just check + * for this flag bit, if it is set and the socket trying to bind has + * sk->reuse set, we don't even have to walk the owners list at all, + * we return that it is ok to bind this socket to the requested local port. 
+ * + * Sounds like a lot of work, but it is worth it. In a more naive + * implementation (ie. current FreeBSD etc.) the entire list of ports + * must be walked for each data port opened by an ftp server. Needless + * to say, this does not scale at all. With a couple thousand FTP + * users logged onto your box, isn't it nice to know that new data + * ports are created in O(1) time? I thought so. ;-) -DaveM */ -extern int sysctl_tcp_sack; -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; +struct tcp_bind_bucket { + unsigned short port; + unsigned short flags; +#define TCPB_FLAG_LOCKED 0x0001 +#define TCPB_FLAG_FASTREUSE 0x0002 + + struct tcp_bind_bucket *next; + struct sock *owners; + struct tcp_bind_bucket **pprev; +}; -/* These are AF independent. */ -static __inline__ int tcp_bhashfn(__u16 lport) +extern struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; +extern kmem_cache_t *tcp_bucket_cachep; +extern struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum); +extern void tcp_bucket_unlock(struct sock *sk); +extern int tcp_port_rover; + +/* Level-1 socket-demux cache. */ +#define TCP_NUM_REGS 32 +extern struct sock *tcp_regs[TCP_NUM_REGS]; + +#define TCP_RHASH_FN(__fport) \ + ((((__fport) >> 7) ^ (__fport)) & (TCP_NUM_REGS - 1)) +#define TCP_RHASH(__fport) tcp_regs[TCP_RHASH_FN((__fport))] +#define TCP_SK_RHASH_FN(__sock) TCP_RHASH_FN((__sock)->dummy_th.dest) +#define TCP_SK_RHASH(__sock) tcp_regs[TCP_SK_RHASH_FN((__sock))] + +static __inline__ void tcp_reg_zap(struct sock *sk) { - return (lport ^ (lport >> 7)) & (TCP_BHTABLE_SIZE - 1); + struct sock **rpp; + + rpp = &(TCP_SK_RHASH(sk)); + if(*rpp == sk) + *rpp = NULL; } -/* Find the next port that hashes h that is larger than lport. - * If you change the hash, change this function to match, or you will - * break TCP port selection. This function must also NOT wrap around - * when the next number exceeds the largest possible port (2^16-1). 
- */ -static __inline__ int tcp_bhashnext(__u16 lport, __u16 h) +/* These are AF independent. */ +static __inline__ int tcp_bhashfn(__u16 lport) { - __u32 s; /* don't change this to a smaller type! */ - - s = (lport ^ (h ^ tcp_bhashfn(lport))); - if (s > lport) - return s; - s = lport + TCP_BHTABLE_SIZE; - return (s ^ (h ^ tcp_bhashfn(s))); + return (lport & (TCP_BHTABLE_SIZE - 1)); } -static __inline__ int tcp_sk_bhashfn(struct sock *sk) +static __inline__ void tcp_sk_bindify(struct sock *sk) { - __u16 lport = sk->num; - return tcp_bhashfn(lport); + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb->port != snum; tb = tb->next) + ; + /* Update bucket flags: FASTREUSE stays set only while every owner has sk->reuse set and is not listening. */ + if(tb->owners == NULL) { + /* We're the first. */ + if(sk->reuse && sk->state != TCP_LISTEN) + tb->flags = TCPB_FLAG_FASTREUSE; + else + tb->flags = 0; + } else { + if((tb->flags & TCPB_FLAG_FASTREUSE) && + ((sk->reuse == 0) || (sk->state == TCP_LISTEN))) + tb->flags &= ~TCPB_FLAG_FASTREUSE; + } + if((sk->tp_pinfo.af_tcp.bind_next = tb->owners) != NULL) + tb->owners->tp_pinfo.af_tcp.bind_pprev = + &sk->tp_pinfo.af_tcp.bind_next; + tb->owners = sk; + sk->tp_pinfo.af_tcp.bind_pprev = &tb->owners; + sk->prev = (struct sock *) tb; } +/* tcp_ipv4.c: These sysctl variables need to be shared between v4 and v6 + * because the v6 tcp code to initialize a connection needs to interoperate + * with the v4 code using the same variables. + * FIXME: It would be better to rewrite the connection code to be + * address family independent and just leave one copy in the ipv4 section. + * This would also clean up some code duplication. -- erics + */ +extern int sysctl_tcp_sack; +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; + /* These can have wildcards, don't try too hard. 
*/ static __inline__ int tcp_lhashfn(unsigned short num) { @@ -93,28 +164,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) return tcp_lhashfn(sk->num); } -/* Only those holding the sockhash lock call these two things here. - * Note the slightly gross overloading of sk->prev, AF_UNIX is the - * only other main benefactor of that member of SK, so who cares. - */ -static __inline__ void tcp_sk_bindify(struct sock *sk) -{ - int hashent = tcp_sk_bhashfn(sk); - struct sock **htable = &tcp_bound_hash[hashent]; - - if((sk->bind_next = *htable) != NULL) - (*htable)->bind_pprev = &sk->bind_next; - *htable = sk; - sk->bind_pprev = htable; -} - -static __inline__ void tcp_sk_unbindify(struct sock *sk) -{ - if(sk->bind_next) - sk->bind_next->bind_pprev = sk->bind_pprev; - *(sk->bind_pprev) = sk->bind_next; -} - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #define NETHDR_SIZE sizeof(struct ipv6hdr) #else @@ -186,6 +235,8 @@ static __inline__ void tcp_sk_unbindify(struct sock *sk) * we tell the LL layer that it is something * wrong (e.g. 
that it can expire redirects) */ +#define TCP_BUCKETGC_PERIOD (HZ) + /* * TCP option */ @@ -427,6 +478,10 @@ extern int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 isn); +extern struct sock * tcp_create_openreq_child(struct sock *sk, + struct open_request *req, + struct sk_buff *skb); + extern struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, @@ -460,7 +515,7 @@ extern void tcp_send_fin(struct sock *sk); extern int tcp_send_synack(struct sock *); extern void tcp_send_skb(struct sock *, struct sk_buff *); extern void tcp_send_ack(struct sock *sk); -extern void tcp_send_delayed_ack(struct sock *sk, int max_timeout); +extern void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout); /* CONFIG_IP_TRANSPARENT_PROXY */ extern int tcp_chkaddr(struct sk_buff *); @@ -492,40 +547,56 @@ struct tcp_sl_timer { #define TCP_SLT_SYNACK 0 #define TCP_SLT_KEEPALIVE 1 -#define TCP_SLT_MAX 2 +#define TCP_SLT_BUCKETGC 2 +#define TCP_SLT_MAX 3 extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX]; -/* - * FIXME: this method of choosing when to send a window update - * does not seem correct to me. -- erics - */ -static __inline__ unsigned short tcp_raise_window(struct sock *sk) +/* Compute the actual receive window we are currently advertising. */ +static __inline__ u32 tcp_receive_window(struct tcp_opt *tp) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - long cur_win; - int res = 0; - - /* - * compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ + return tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd); +} - cur_win = tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd); +/* Choose a new window, without checks for shrinking, and without + * scaling applied to the result. The caller does these things + * if necessary. This is a "raw" window selection. 
+ */ +extern u32 __tcp_select_window(struct sock *sk); +/* Choose a new window to advertise, update state in tcp_opt for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +extern __inline__ u16 tcp_select_window(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 new_win = __tcp_select_window(sk); + u32 cur_win = tcp_receive_window(tp); - /* - * We need to send an ack right away if - * our rcv window is blocking the sender and - * we have more free space to offer. - */ + /* Never shrink the offered window */ + if(new_win < cur_win) + new_win = cur_win; + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; - if (cur_win < (sk->mss << 1)) - res = 1; - return res; + /* RFC1323 scaling applied */ + return new_win >> tp->rcv_wscale; } -extern unsigned short tcp_select_window(struct sock *sk); +/* See if we can advertise non-zero, and if so how much we + * can increase our advertisement. If it becomes more than + * twice what we are talking about right now, return true. 
+ */ +extern __inline__ int tcp_raise_window(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 new_win = __tcp_select_window(sk); + u32 cur_win = tcp_receive_window(tp); + + return (new_win && (new_win > (cur_win << 1))); +} /* * List all states of a TCP socket that can be viewed as a "connected" @@ -581,11 +652,12 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) case TCP_CLOSE: /* Should be about 2 rtt's */ net_reset_timer(sk, TIME_DONE, min(tp->srtt * 2, TCP_DONE_TIME)); + sk->prot->unhash(sk); /* fall through */ default: if (oldstate==TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab--; - if (state == TCP_TIME_WAIT || state == TCP_CLOSE) + if (state == TCP_TIME_WAIT) sk->prot->rehash(sk); } } @@ -624,7 +696,7 @@ static inline void tcp_build_header_data(struct tcphdr *th, struct sock *sk, int struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(sk->write_seq); + th->seq = htonl(tp->write_seq); if (!push) th->psh = 1; tcp_build_options((__u32*)(th+1), tp); @@ -635,7 +707,7 @@ static inline void tcp_build_header(struct tcphdr *th, struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(sk->write_seq); + th->seq = htonl(tp->write_seq); th->ack_seq = htonl(tp->last_ack_sent = tp->rcv_nxt); th->window = htons(tcp_select_window(sk)); tcp_build_options((__u32 *)(th+1), tp); @@ -724,7 +796,7 @@ extern __inline__ void tcp_select_initial_window(__u32 space, __u16 mss, (*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp); } -#define SYNQ_DEBUG 1 +/* #define SYNQ_DEBUG 1 */ extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, struct open_request *prev) { @@ -799,6 +871,18 @@ extern __inline__ void tcp_dec_slow_timer(int timer) atomic_dec(&slt->count); } +/* This needs to use a slow timer, so it is here. 
*/ +static __inline__ void tcp_sk_unbindify(struct sock *sk) +{ + struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *) sk->prev; + if(sk->tp_pinfo.af_tcp.bind_next) + sk->tp_pinfo.af_tcp.bind_next->tp_pinfo.af_tcp.bind_pprev = + sk->tp_pinfo.af_tcp.bind_pprev; + *(sk->tp_pinfo.af_tcp.bind_pprev) = sk->tp_pinfo.af_tcp.bind_next; + if(tb->owners == NULL) + tcp_inc_slow_timer(TCP_SLT_BUCKETGC); +} + extern const char timer_bug_msg[]; static inline void tcp_clear_xmit_timer(struct sock *sk, int what) @@ -820,7 +904,8 @@ static inline void tcp_clear_xmit_timer(struct sock *sk, int what) printk(timer_bug_msg); return; }; - del_timer(timer); + if(timer->prev != NULL) + del_timer(timer); } static inline int tcp_timer_is_set(struct sock *sk, int what) @@ -829,13 +914,13 @@ static inline int tcp_timer_is_set(struct sock *sk, int what) switch (what) { case TIME_RETRANS: - return tp->retransmit_timer.next != NULL; + return tp->retransmit_timer.prev != NULL; break; case TIME_DACK: - return tp->delack_timer.next != NULL; + return tp->delack_timer.prev != NULL; break; case TIME_PROBE0: - return tp->probe_timer.next != NULL; + return tp->probe_timer.prev != NULL; break; default: printk(timer_bug_msg); diff --git a/init/main.c b/init/main.c index 30e02d7ff260..c309adfa3161 100644 --- a/init/main.c +++ b/init/main.c @@ -278,7 +278,7 @@ extern void nfs_root_setup(char *str, int *ints); extern void ftape_setup(char *str, int *ints); #endif -#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD) +#if defined(CONFIG_SYSVIPC) extern void ipc_init(void); #endif @@ -1061,7 +1061,7 @@ __initfunc(asmlinkage void start_kernel(void)) inode_init(); file_table_init(); sock_init(); -#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD) +#if defined(CONFIG_SYSVIPC) ipc_init(); #endif dquot_init(); @@ -1183,6 +1183,13 @@ static int init(void * unused) } #endif +#ifdef CONFIG_KMOD + { + extern int kmod_init(void); + kmod_init(); + } +#endif + setup(1); if (open("/dev/console", O_RDWR, 0) < 0) 
diff --git a/ipc/Makefile b/ipc/Makefile index 4240520177d1..4e947582b4b2 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -10,10 +10,6 @@ O_TARGET := ipc.o O_OBJS := util.o -ifdef CONFIG_KERNELD -CONFIG_SYSVIPC=1 -endif - ifdef CONFIG_SYSVIPC O_OBJS += msg.o sem.o shm.o endif diff --git a/ipc/msg.c b/ipc/msg.c index 6ff658b02386..1f1c4fd3c0d6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1,10 +1,6 @@ /* * linux/ipc/msg.c * Copyright (C) 1992 Krishna Balasubramanian - * - * Kerneld extensions by Bjorn Ekwall in May 1995, and May 1996 - * - * See for the (optional) new kerneld protocol */ #include @@ -13,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -34,11 +29,6 @@ static unsigned short msg_seq = 0; static int used_queues = 0; static int max_msqid = 0; static struct wait_queue *msg_lock = NULL; -static int kerneld_msqid = -1; - -#define MAX_KERNELDS 20 -static int kerneld_arr[MAX_KERNELDS]; -static int n_kernelds = 0; __initfunc(void msg_init (void)) { @@ -97,20 +87,12 @@ static int real_msgsnd (int msqid, struct msgbuf *msgp, size_t msgsz, int msgflg return -EINVAL; if (!msgp) return -EFAULT; - /* - * Calls from kernel level (IPC_KERNELD set) - * have the message somewhere in kernel space already! - */ - if ((msgflg & IPC_KERNELD)) - mtype = msgp->mtype; - else { - err = verify_area (VERIFY_READ, msgp->mtext, msgsz); - if (err) - return err; - get_user(mtype, &msgp->mtype); - if (mtype < 1) - return -EINVAL; - } + err = verify_area (VERIFY_READ, msgp->mtext, msgsz); + if (err) + return err; + get_user(mtype, &msgp->mtype); + if (mtype < 1) + return -EINVAL; id = (unsigned int) msqid % MSGMNI; msq = msgque [id]; if (msq == IPC_UNUSED || msq == IPC_NOID) @@ -120,29 +102,17 @@ static int real_msgsnd (int msqid, struct msgbuf *msgp, size_t msgsz, int msgflg slept: if (msq->msg_perm.seq != (unsigned int) msqid / MSGMNI) return -EIDRM; - /* - * Non-root kernel level processes may send to kerneld! - * i.e. 
no permission check if called from the kernel - * otoh we don't want user level non-root snoopers... - */ - if ((msgflg & IPC_KERNELD) == 0) - if (ipcperms(ipcp, S_IWUGO)) - return -EACCES; + + if (ipcperms(ipcp, S_IWUGO)) + return -EACCES; if (msgsz + msq->msg_cbytes > msq->msg_qbytes) { - if ((kerneld_msqid != -1) && (kerneld_msqid == msqid)) - flush_msg(msq); /* flush the kerneld channel only */ if (msgsz + msq->msg_cbytes > msq->msg_qbytes) { /* still no space in queue */ if (msgflg & IPC_NOWAIT) return -EAGAIN; if (signal_pending(current)) return -EINTR; - if (in_interrupt()) { - /* Very unlikely, but better safe than sorry */ - printk(KERN_WARNING "Ouch, kerneld:msgsnd buffers full!\n"); - return -EINTR; - } interruptible_sleep_on (&msq->wwait); goto slept; } @@ -154,22 +124,7 @@ static int real_msgsnd (int msqid, struct msgbuf *msgp, size_t msgsz, int msgflg return -ENOMEM; msgh->msg_spot = (char *) (msgh + 1); - /* - * Calls from kernel level (IPC_KERNELD set) - * have the message somewhere in kernel space already! - */ - if (msgflg & IPC_KERNELD) { - struct kerneld_msg *kdmp = (struct kerneld_msg *)msgp; - - /* - * Note that the kernel supplies a pointer - * but the user-level kerneld uses a char array... 
- */ - memcpy(msgh->msg_spot, (char *)(&(kdmp->id)), KDHDR); - memcpy(msgh->msg_spot + KDHDR, kdmp->text, msgsz - KDHDR); - } - else - copy_from_user (msgh->msg_spot, msgp->mtext, msgsz); + copy_from_user (msgh->msg_spot, msgp->mtext, msgsz); if (msgque[id] == IPC_UNUSED || msgque[id] == IPC_NOID || msq->msg_perm.seq != (unsigned int) msqid / MSGMNI) { @@ -201,42 +156,8 @@ static int real_msgsnd (int msqid, struct msgbuf *msgp, size_t msgsz, int msgflg return 0; } -/* - * Take care of missing kerneld, especially in case of multiple daemons - */ -#define KERNELD_TIMEOUT 1 * (HZ) -#define DROP_TIMER del_timer(&kd_timer) -/*#define DROP_TIMER if ((msgflg & IPC_KERNELD) && kd_timer.next && kd_timer.prev) del_timer(&kd_timer)*/ - -static void kd_timeout(unsigned long msgid) -{ - struct msqid_ds *msq; - struct msg *tmsg; - unsigned long flags; - - msq = msgque [ (unsigned int) kerneld_msqid % MSGMNI ]; - if (msq == IPC_NOID || msq == IPC_UNUSED) - return; - - save_flags(flags); - cli(); - for (tmsg = msq->msg_first; tmsg; tmsg = tmsg->msg_next) - if (*(long *)(tmsg->msg_spot) == msgid) - break; - restore_flags(flags); - if (tmsg) { /* still there! */ - struct kerneld_msg kmsp = { msgid, NULL_KDHDR, "" }; - - printk(KERN_ALERT "Ouch, no kerneld for message %ld\n", msgid); - kmsp.id = -ENODEV; - real_msgsnd(kerneld_msqid, (struct msgbuf *)&kmsp, KDHDR, - S_IRUSR | S_IWUSR | IPC_KERNELD | MSG_NOERROR); - } -} - static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgtyp, int msgflg) { - struct timer_list kd_timer = { NULL, NULL, 0, 0, 0}; struct msqid_ds *msq; struct ipc_perm *ipcp; struct msg *tmsg, *leastp = NULL; @@ -248,15 +169,10 @@ static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgty return -EINVAL; if (!msgp || !msgp->mtext) return -EFAULT; - /* - * Calls from kernel level (IPC_KERNELD set) - * wants the message put in kernel space! 
- */ - if ((msgflg & IPC_KERNELD) == 0) { - err = verify_area (VERIFY_WRITE, msgp->mtext, msgsz); - if (err) - return err; - } + + err = verify_area (VERIFY_WRITE, msgp->mtext, msgsz); + if (err) + return err; id = (unsigned int) msqid % MSGMNI; msq = msgque [id]; @@ -264,16 +180,6 @@ static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgty return -EINVAL; ipcp = &msq->msg_perm; - /* - * Start timer for missing kerneld - */ - if (msgflg & IPC_KERNELD) { - kd_timer.data = (unsigned long)msgtyp; - kd_timer.expires = jiffies + KERNELD_TIMEOUT; - kd_timer.function = kd_timeout; - add_timer(&kd_timer); - } - /* * find message of correct type. * msgtyp = 0 => get first. @@ -282,19 +188,10 @@ static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgty */ while (!nmsg) { if (msq->msg_perm.seq != (unsigned int) msqid / MSGMNI) { - DROP_TIMER; return -EIDRM; } - if ((msgflg & IPC_KERNELD) == 0) { - /* - * All kernel level processes may receive from kerneld! - * i.e. no permission check if called from the kernel - * otoh we don't want user level non-root snoopers... - */ - if (ipcperms (ipcp, S_IRUGO)) { - DROP_TIMER; /* Not needed, but doesn't hurt */ - return -EACCES; - } + if (ipcperms (ipcp, S_IRUGO)) { + return -EACCES; } save_flags(flags); @@ -326,7 +223,6 @@ static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgty restore_flags(flags); if (nmsg) { /* done finding a message */ - DROP_TIMER; if ((msgsz < nmsg->msg_ts) && !(msgflg & MSG_NOERROR)) { return -E2BIG; } @@ -354,43 +250,20 @@ static int real_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, long msgty msq->msg_cbytes -= nmsg->msg_ts; restore_flags(flags); wake_up (&msq->wwait); - /* - * Calls from kernel level (IPC_KERNELD set) - * wants the message copied to kernel space! 
- */ - if (msgflg & IPC_KERNELD) { - struct kerneld_msg *kdmp = (struct kerneld_msg *) msgp; - - memcpy((char *)(&(kdmp->id)), - nmsg->msg_spot, KDHDR); - /* - * Note that kdmp->text is a pointer - * when called from kernel space! - */ - if ((msgsz > KDHDR) && kdmp->text) - memcpy(kdmp->text, - nmsg->msg_spot + KDHDR, - msgsz - KDHDR); - } - else { - put_user (nmsg->msg_type, &msgp->mtype); - copy_to_user (msgp->mtext, nmsg->msg_spot, msgsz); - } + put_user (nmsg->msg_type, &msgp->mtype); + copy_to_user (msgp->mtext, nmsg->msg_spot, msgsz); kfree(nmsg); return msgsz; } else { /* did not find a message */ if (msgflg & IPC_NOWAIT) { - DROP_TIMER; return -ENOMSG; } if (signal_pending(current)) { - DROP_TIMER; return -EINTR; } interruptible_sleep_on (&msq->rwait); } } /* end while */ - DROP_TIMER; return -1; } @@ -398,9 +271,8 @@ asmlinkage int sys_msgsnd (int msqid, struct msgbuf *msgp, size_t msgsz, int msg { int ret; - /* IPC_KERNELD is used as a marker for kernel level calls */ lock_kernel(); - ret = real_msgsnd(msqid, msgp, msgsz, msgflg & ~IPC_KERNELD); + ret = real_msgsnd(msqid, msgp, msgsz, msgflg); unlock_kernel(); return ret; } @@ -410,9 +282,8 @@ asmlinkage int sys_msgrcv (int msqid, struct msgbuf *msgp, size_t msgsz, { int ret; - /* IPC_KERNELD is used as a marker for kernel level calls */ lock_kernel(); - ret = real_msgrcv (msqid, msgp, msgsz, msgtyp, msgflg & ~IPC_KERNELD); + ret = real_msgrcv (msqid, msgp, msgsz, msgtyp, msgflg); unlock_kernel(); return ret; } @@ -479,36 +350,7 @@ asmlinkage int sys_msgget (key_t key, int msgflg) int id, ret = -EPERM; struct msqid_ds *msq; - /* - * If the IPC_KERNELD flag is set, the key is forced to IPC_PRIVATE, - * and a designated kerneld message queue is created/referred to - */ lock_kernel(); - if ((msgflg & IPC_KERNELD)) { - int i; - if (!suser()) - goto out; -#ifdef NEW_KERNELD_PROTOCOL - if ((msgflg & IPC_KERNELD) == OLDIPC_KERNELD) { - printk(KERN_ALERT "Please recompile your kerneld daemons!\n"); - goto out; - 
} -#endif - ret = -ENOSPC; - if ((kerneld_msqid == -1) && (kerneld_msqid = - newque(IPC_PRIVATE, msgflg & S_IRWXU)) < 0) - goto out; - for (i = 0; i < MAX_KERNELDS; ++i) { - if (kerneld_arr[i] == 0) { - kerneld_arr[i] = current->pid; - ++n_kernelds; - ret = kerneld_msqid; - goto out; - } - } - goto out; - } - /* else it is a "normal" request */ if (key == IPC_PRIVATE) ret = newque(key, msgflg); else if ((id = findkey (key)) == -1) { /* key not used */ @@ -527,7 +369,6 @@ asmlinkage int sys_msgget (key_t key, int msgflg) else ret = (unsigned int) msq->msg_perm.seq * MSGMNI + id; } -out: unlock_kernel(); return ret; } @@ -687,12 +528,7 @@ asmlinkage int sys_msgctl (int msqid, int cmd, struct msqid_ds *buf) if (current->euid != ipcp->cuid && current->euid != ipcp->uid && !suser()) goto out; - /* - * There is only one kerneld message queue, - * mark it as non-existent - */ - if ((kerneld_msqid >= 0) && (msqid == kerneld_msqid)) - kerneld_msqid = -1; + freeque (id); err = 0; goto out; @@ -705,104 +541,3 @@ out: return err; } -/* - * We do perhaps need a "flush" for waiting processes, - * so that if they are terminated, a call from do_exit - * will minimize the possibility of orphaned received - * messages in the queue. For now we just make sure - * that the queue is shut down whenever all kernelds have died. - */ -void kerneld_exit(void) -{ - int i; - - if (kerneld_msqid == -1) - return; - for (i = 0; i < MAX_KERNELDS; ++i) { - if (kerneld_arr[i] == current->pid) { - kerneld_arr[i] = 0; - --n_kernelds; - if (n_kernelds == 0) - sys_msgctl(kerneld_msqid, IPC_RMID, NULL); - break; - } - } -} - -/* - * Kerneld internal message format/syntax: - * - * The message type from the kernel to kerneld is used to specify _what_ - * function we want kerneld to perform. - * - * The "normal" message area is divided into a header, followed by a char array. 
- * The header is used to hold the sequence number of the request, which will - * be used as the return message type from kerneld back to the kernel. - * In the return message, the header will be used to store the exit status - * of the kerneld "job", or task. - * The character array is used to pass parameters to kerneld and (optional) - * return information from kerneld back to the kernel. - * It is the responsibility of kerneld and the kernel level caller - * to set usable sizes on the parameter/return value array, since - * that information is _not_ included in the message format - */ - -/* - * The basic kernel level entry point to kerneld. - * msgtype should correspond to a task type for (a) kerneld - * ret_size is the size of the (optional) return _value, - * OR-ed with KERNELD_WAIT if we want an answer - * msgsize is the size (in bytes) of the message, not including - * the header that is always sent first in a kerneld message - * text is the parameter for the kerneld specific task - * ret_val is NULL or the kernel address where an expected answer - * from kerneld should be placed. - * - * See for usage (inline convenience functions) - * - */ -int kerneld_send(int msgtype, int ret_size, int msgsz, - const char *text, const char *ret_val) -{ - int status = -ENOSYS; -#ifdef CONFIG_KERNELD - static int id = KERNELD_MINSEQ; - struct kerneld_msg kmsp = { msgtype, NULL_KDHDR, (char *)text }; - int msgflg = S_IRUSR | S_IWUSR | IPC_KERNELD | MSG_NOERROR; - unsigned long flags; - - if (kerneld_msqid == -1) - return -ENODEV; - - /* Do not wait for an answer at interrupt-time! 
*/ - if (in_interrupt()) - ret_size &= ~KERNELD_WAIT; -#ifdef NEW_KERNELD_PROTOCOL - else - kmsp.pid = current->pid; -#endif - - msgsz += KDHDR; - if (ret_size & KERNELD_WAIT) { - save_flags(flags); - cli(); - if (++id <= 0) /* overflow */ - id = KERNELD_MINSEQ; - kmsp.id = id; - restore_flags(flags); - } - - status = real_msgsnd(kerneld_msqid, (struct msgbuf *)&kmsp, msgsz, msgflg); - if ((status >= 0) && (ret_size & KERNELD_WAIT)) { - ret_size &= ~KERNELD_WAIT; - kmsp.text = (char *)ret_val; - status = real_msgrcv(kerneld_msqid, (struct msgbuf *)&kmsp, - KDHDR + ((ret_val)?ret_size:0), - kmsp.id, msgflg); - if (status > 0) /* a valid answer contains at least a long */ - status = kmsp.id; - } - -#endif /* CONFIG_KERNELD */ - return status; -} diff --git a/ipc/util.c b/ipc/util.c index f0ba7feddc69..eacea2b6239d 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -20,7 +20,7 @@ #include -#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD) +#if defined(CONFIG_SYSVIPC) extern void sem_init (void), msg_init (void), shm_init (void); @@ -123,7 +123,4 @@ asmlinkage int sys_shmctl (int shmid, int cmd, struct shmid_ds *buf) return -ENOSYS; } -void kerneld_exit(void) -{ -} #endif /* CONFIG_SYSVIPC */ diff --git a/kernel/Makefile b/kernel/Makefile index ff908f68a879..4e0a1d87d8f5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,6 +17,10 @@ O_OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \ OX_OBJS += signal.o +ifeq ($(CONFIG_KMOD),y) +O_OBJS += kmod.o +endif + ifeq ($(CONFIG_MODULES),y) OX_OBJS += ksyms.o endif diff --git a/kernel/exit.c b/kernel/exit.c index dfa4c1c464b5..27dd89772e6d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -27,7 +27,6 @@ #include extern void sem_exit (void); -extern void kerneld_exit(void); int getrusage(struct task_struct *, int, struct rusage *); @@ -157,7 +156,7 @@ static inline void close_files(struct files_struct * files) unsigned long set = files->open_fds.fds_bits[j]; i = j * __NFDBITS; j++; - if (i >= NR_OPEN) + if 
(i >= files->max_fds) break; while (set) { if (set & 1) { @@ -183,6 +182,13 @@ static inline void __exit_files(struct task_struct *tsk) tsk->files = NULL; if (!--files->count) { close_files(files); + /* + * Free the fd array as appropriate ... + */ + if (NR_OPEN * sizeof(struct file *) == PAGE_SIZE) + free_page((unsigned long) files->fd); + else + kfree(files->fd); kmem_cache_free(files_cachep, files); } } @@ -328,7 +334,6 @@ fake_volatile: acct_process(code); del_timer(&current->real_timer); sem_exit(); - kerneld_exit(); __exit_mm(current); #if CONFIG_AP1000 exit_msc(current); diff --git a/kernel/fork.c b/kernel/fork.c index 38c98b0a8a92..18735c3df991 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -375,44 +375,66 @@ static inline int copy_fdset(fd_set *dst, fd_set *src) return __copy_fdset(dst->fds_bits, src->fds_bits); } -static inline int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { - int i; struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; + int size, i, error = 0; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) - return 0; + goto out; if (clone_flags & CLONE_FILES) { oldf->count++; - return 0; + goto out; } + tsk->files = NULL; + error = -ENOMEM; newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - tsk->files = newf; if (!newf) - return -1; + goto out; + + /* + * Allocate the fd array, using get_free_page() if possible. + * Eventually we want to make the array size variable ... 
+ */ + size = NR_OPEN * sizeof(struct file *); + if (size == PAGE_SIZE) + new_fds = (struct file **) __get_free_page(GFP_KERNEL); + else + new_fds = (struct file **) kmalloc(size, GFP_KERNEL); + if (!new_fds) + goto out_release; + memset((void *) new_fds, 0, size); newf->count = 1; + newf->max_fds = NR_OPEN; + newf->fd = new_fds; newf->close_on_exec = oldf->close_on_exec; - i = copy_fdset(&newf->open_fds,&oldf->open_fds); + i = copy_fdset(&newf->open_fds, &oldf->open_fds); old_fds = oldf->fd; - new_fds = newf->fd; for (; i != 0; i--) { struct file * f = *old_fds; old_fds++; *new_fds = f; - new_fds++; if (f) f->f_count++; + new_fds++; } - return 0; + tsk->files = newf; + error = 0; +out: + return error; + +out_release: + kmem_cache_free(files_cachep, newf); + goto out; } static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) diff --git a/kernel/kmod.c b/kernel/kmod.c new file mode 100644 index 000000000000..30cddf5564b0 --- /dev/null +++ b/kernel/kmod.c @@ -0,0 +1,149 @@ +/* + kmod, the new module loader (replaces kerneld) + Kirk Petersen +*/ + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include + +static inline _syscall1(int,delete_module,const char *,name_user) + +/* + kmod_unload_delay and modprobe_path are set via /proc/sys. +*/ +int kmod_unload_delay = 60; +char modprobe_path[256] = "/sbin/modprobe"; +char module_name[64] = ""; +char * argv[] = { "modprobe", "-k", NULL, NULL, }; +char * envp[] = { "HOME=/", "TERM=linux", NULL, }; + +/* + kmod_queue synchronizes the kmod thread and the rest of the system + kmod_unload_timer is what we use to unload modules + after kmod_unload_delay seconds +*/ +struct wait_queue * kmod_queue = NULL; +struct timer_list kmod_unload_timer; + +/* + kmod_thread is the thread that does most of the work. kmod_unload and + request_module tell it to wake up and do work. 
+*/ +int kmod_thread(void * data) +{ + int pid; + + /* + Initialize basic thread information + */ + current->session = 1; + current->pgrp = 1; + sprintf(current->comm, "kmod"); + sigfillset(&current->blocked); + + /* + This is the main kmod_thread loop. It first sleeps, then + handles requests from request_module or kmod_unload. + */ + + while (1) { + interruptible_sleep_on(&kmod_queue); + + /* + If request_module woke us up, we should try to + load module_name. If not, kmod_unload woke us up, + do call delete_module. + (if somehow both want us to do something, ignore the + delete_module request) + */ + if (module_name[0] == '\0') { + delete_module(NULL); + } else { + pid = fork(); + if (pid > 0) { + waitpid(pid, NULL, 0); + module_name[0] = '\0'; + wake_up(&kmod_queue); + } else + if (pid == 0) { + + /* + Call modprobe with module_name. If execve returns, + print out an error. + */ + argv[2] = module_name; + execve(modprobe_path, argv, envp); + + printk("kmod: failed to load module %s\n", module_name); + _exit(0); + } else { + printk("error, fork failed in kmod\n"); + } + } + } + + return 0; /* Never reached. 
*/ +} + +/* + kmod_unload is the function that the kernel calls when + the kmod_unload_timer expires +*/ +void kmod_unload(unsigned long x) +{ + /* + wake up the kmod thread, which does the work + (we can't call delete_module, as it locks the kernel and + we are in the bottom half of the kernel (right?)) + once it is awake, reset the timer + */ + wake_up(&kmod_queue); + kmod_unload_timer.expires = jiffies + (kmod_unload_delay * 100); + add_timer(&kmod_unload_timer); +} + +int kmod_init(void) +{ + printk ("Starting kmod\n"); + + kernel_thread(kmod_thread, NULL, 0); + + kmod_unload_timer.next = NULL; + kmod_unload_timer.prev = NULL; + kmod_unload_timer.expires = jiffies + (300 * 100); + kmod_unload_timer.data = 0L; + kmod_unload_timer.function = kmod_unload; + add_timer(&kmod_unload_timer); + + return 0; +} + +/* + request_module, the function that everyone calls when they need a + module to be loaded +*/ +int request_module(const char * name) +{ + /* first, copy the name of the module into module_name */ + /* then wake_up() the kmod daemon */ + /* wait for the kmod daemon to finish (it will wake us up) */ + + /* + kmod_thread is sleeping, so start by copying the name of + the module into module_name. Once that is done, wake up + kmod_thread. + */ + strcpy(module_name, name); + wake_up(&kmod_queue); + + /* + Now that we have told kmod_thread what to do, we want to + go to sleep and let it do its work. It will wake us up, + at which point we will be done (the module will be loaded). 
+ */ + interruptible_sleep_on(&kmod_queue); + return 0; +} diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 869e5e5bbf21..7ff40d7bd303 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -61,8 +61,8 @@ extern unsigned char aux_device_present, kbd_read_mask; #if defined(CONFIG_PROC_FS) #include #endif -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include #ifdef __SMP__ @@ -91,12 +91,13 @@ __attribute__((section("__ksymtab"))) = { #endif +#ifdef CONFIG_KMOD +EXPORT_SYMBOL(request_module); +#endif + #ifdef CONFIG_MODULES EXPORT_SYMBOL(get_module_symbol); #endif -#ifdef CONFIG_KERNELD -EXPORT_SYMBOL(kerneld_send); -#endif EXPORT_SYMBOL(get_options); #ifdef CONFIG_PCI diff --git a/kernel/module.c b/kernel/module.c index afbbb4c72299..ffd73ef85a15 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -324,7 +324,7 @@ sys_init_module(const char *name_user, struct module *mod_user) dep->next_ref = d->refs; d->refs = dep; /* Being referenced by a dependant module counts as a - use as far as kerneld is concerned. */ + use as far as kmod is concerned. */ d->flags |= MOD_USED_ONCE; } diff --git a/kernel/sched.c b/kernel/sched.c index f48f520ff8ce..fc3b3770e278 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,14 +201,20 @@ static void process_timeout(unsigned long __data) */ static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu) { + int policy = p->policy; int weight; + if (policy & SCHED_YIELD) { + p->policy = policy & ~SCHED_YIELD; + return 0; + } + /* * Realtime process, select the first one on the * runqueue (taking priorities within processes * into account). */ - if (p->policy != SCHED_OTHER) + if (policy != SCHED_OTHER) return 1000 + p->rt_priority; /* @@ -228,9 +234,10 @@ static inline int goodness(struct task_struct * p, struct task_struct * prev, in weight += PROC_CHANGE_PENALTY; #endif - /* .. and a slight advantage to the current process */ - if (p == prev) + /* .. 
and a slight advantage to the current thread */ + if (p->mm == prev->mm) weight += 1; + weight += p->priority; } return weight; @@ -1351,13 +1358,9 @@ asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param) asmlinkage int sys_sched_yield(void) { - /* - * This is not really right. We'd like to reschedule - * just _once_ with this process having a zero count. - */ - current->counter = 0; spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); + current->policy |= SCHED_YIELD; move_last_runqueue(current); spin_unlock_irq(&runqueue_lock); spin_unlock(&scheduler_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b93ad7bd6ef..171a9dc74b37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -41,6 +41,10 @@ extern int console_loglevel, C_A_D, swapout_interval; extern int bdf_prm[], bdflush_min[], bdflush_max[]; extern char binfmt_java_interpreter[], binfmt_java_appletviewer[]; extern int sysctl_overcommit_memory; +#ifdef CONFIG_KMOD +extern char modprobe_path[]; +extern int kmod_unload_delay; +#endif #ifdef __sparc__ extern char reboot_command []; @@ -174,6 +178,12 @@ static ctl_table kern_table[] = { 0644, NULL, &proc_dointvec}, {KERN_PRINTK, "printk", &console_loglevel, 4*sizeof(int), 0644, NULL, &proc_dointvec}, +#ifdef CONFIG_KMOD + {KERN_MODPROBE, "modprobe", &modprobe_path, 256, + 0644, NULL, &proc_dostring, &sysctl_string }, + {KERN_KMOD_UNLOAD_DELAY, "kmod_unload_delay", &kmod_unload_delay, + sizeof(int), 0644, NULL, &proc_dointvec}, +#endif {0} }; diff --git a/mm/slab.c b/mm/slab.c index 65fc3e7fe04b..ae7734a4c979 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1808,7 +1808,9 @@ next: } spin_lock_irq(&best_cachep->c_spinlock); - if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) { + while (!best_cachep->c_growing && + !(slabp = best_cachep->c_lastp)->s_inuse && + slabp != kmem_slab_end(best_cachep)) { if (gfp_mask & GFP_DMA) { do { if (slabp->s_dma) @@ -1832,7 +1834,7 @@ 
good_dma: */ spin_unlock_irq(&best_cachep->c_spinlock); kmem_slab_destroy(best_cachep, slabp); - return; + spin_lock_irq(&best_cachep->c_spinlock); } dma_fail: spin_unlock_irq(&best_cachep->c_spinlock); diff --git a/mm/swap_state.c b/mm/swap_state.c index 4ebc5c05f761..b575877ff154 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -64,13 +64,13 @@ int add_to_swap_cache(struct page *page, unsigned long entry) #endif if (PageTestandSetSwapCache(page)) { printk("swap_cache: replacing non-empty entry %08lx " - "on page %08lx", + "on page %08lx\n", page->offset, page_address(page)); return 0; } if (page->inode) { printk("swap_cache: replacing page-cached entry " - "on page %08lx", page_address(page)); + "on page %08lx\n", page_address(page)); return 0; } atomic_inc(&page->count); @@ -138,18 +138,18 @@ void remove_from_swap_cache(struct page *page) { if (!page->inode) { printk ("VM: Removing swap cache page with zero inode hash " - "on page %08lx", page_address(page)); + "on page %08lx\n", page_address(page)); return; } if (page->inode != &swapper_inode) { printk ("VM: Removing swap cache page with wrong inode hash " - "on page %08lx", page_address(page)); + "on page %08lx\n", page_address(page)); } /* * This will be a legal case once we have a more mature swap cache. 
*/ if (atomic_read(&page->count) == 1) { - printk ("VM: Removing page cache on unshared page %08lx", + printk ("VM: Removing page cache on unshared page %08lx\n", page_address(page)); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index ebef7a362b88..0a4a016ee66f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -542,11 +542,11 @@ int kswapd(void *unused) while (1) { int tries; + current->state = TASK_INTERRUPTIBLE; kswapd_awake = 0; flush_signals(current); run_task_queue(&tq_disk); schedule(); - current->state = TASK_INTERRUPTIBLE; kswapd_awake = 1; swapstats.wakeups++; /* Do the background pageout: @@ -583,14 +583,6 @@ int kswapd(void *unused) run_task_queue(&tq_disk); } -#if 0 - /* - * Report failure if we couldn't even reach min_free_pages. - */ - if (nr_free_pages < min_free_pages) - printk("kswapd: failed, got %d of %d\n", - nr_free_pages, min_free_pages); -#endif } /* As if we could ever get here - maybe we want to make this killable */ remove_wait_queue(&kswapd_wait, &wait); diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c index f97141d3c725..0773d4c8eefe 100644 --- a/net/802/sysctl_net_802.c +++ b/net/802/sysctl_net_802.c @@ -23,5 +23,7 @@ extern int sysctl_tr_rif_timeout; ctl_table tr_table[] = { {NET_TR_RIF_TIMEOUT, "rif_timeout", &sysctl_tr_rif_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} + {0} }; #endif diff --git a/net/802/tr.c b/net/802/tr.c index bf6cd83d73c4..0e044833556b 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -510,10 +510,16 @@ int rif_get_info(char *buffer,char **start, off_t offset, int length, int dummy) * Called during bootup. We don't actually have to initialise * too much for this. 
*/ - + +static struct proc_dir_entry tr_rif_proc = { + PROC_NET_TR_RIF, 6, "tr_rif", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rif_get_info +}; + __initfunc(void rif_init(struct net_proto *unused)) { - rif_timer.expires = RIF_TIMEOUT; rif_timer.data = 0L; rif_timer.function = rif_check_expire; @@ -521,11 +527,5 @@ __initfunc(void rif_init(struct net_proto *unused)) add_timer(&rif_timer); #ifdef CONFIG_PROC_FS - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_TR_RIF, 6, "tr_rif", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - rif_get_info - }); + proc_net_register(&tr_rif_proc); #endif -} diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 8b724361d692..a3e7659c6613 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -990,7 +990,7 @@ static int atalk_create(struct socket *sock, int protocol) { struct sock *sk; - sk = sk_alloc(AF_APPLETALK, GFP_KERNEL); + sk = sk_alloc(AF_APPLETALK, GFP_KERNEL, 1); if(sk == NULL) return (-ENOMEM); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3a4196b3fad1..567169980596 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -828,7 +828,7 @@ int ax25_create(struct socket *sock, int protocol) return -ESOCKTNOSUPPORT; } - if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL) return -ENOMEM; if ((ax25 = ax25_create_cb()) == NULL) { @@ -854,7 +854,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25; - if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL) return NULL; if ((ax25 = ax25_create_cb()) == NULL) { diff --git a/net/core/dev.c b/net/core/dev.c index b06d0053eaa5..36efa363b8f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -50,6 +50,7 @@ * is no device open function. 
* Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD * */ @@ -81,7 +82,7 @@ #include #include #include -#include +#include #ifdef CONFIG_NET_RADIO #include #endif /* CONFIG_NET_RADIO */ @@ -316,7 +317,7 @@ struct device *dev_alloc(const char *name, int *err) * Find and possibly load an interface. */ -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD void dev_load(const char *name) { @@ -398,20 +399,24 @@ int dev_open(struct device *dev) } #ifdef CONFIG_NET_FASTROUTE -void dev_clear_fastroute(struct device *dev) + +static __inline__ void dev_do_clear_fastroute(struct device *dev) { - int i; + if (dev->accept_fastpath) { + int i; - if (dev) { for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) dst_release(xchg(dev->fastpath+i, NULL)); + } +} + +void dev_clear_fastroute(struct device *dev) +{ + if (dev) { + dev_do_clear_fastroute(dev); } else { - for (dev = dev_base; dev; dev = dev->next) { - if (dev->accept_fastpath) { - for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) - dst_release(xchg(dev->fastpath+i, NULL)); - } - } + for (dev = dev_base; dev; dev = dev->next) + dev_do_clear_fastroute(dev); } } #endif @@ -643,7 +648,7 @@ int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) set_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - sti(); + restore_flags(flags); return bit; } @@ -659,7 +664,7 @@ void netdev_unregister_fc(int bit) clear_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - sti(); + restore_flags(flags); } static void netdev_wakeup(void) @@ -977,39 +982,6 @@ int register_gifconf(unsigned int family, gifconf_func_t * gifconf) } -/* - This ioctl is wrong by design. It really existed in some - old SYSV systems, only was named SIOCGIFNUM. - In multiprotocol environment it is just useless. - Well, SIOCGIFCONF is wrong too, but we have to preserve - it by compatibility reasons. 
- - If someone wants to achieve the same effect, please, use undocumented - feature of SIOCGIFCONF: it returns buffer length, if buffer - is not supplied. - - Let's remove it, until someone started to use it. --ANK - - In any case, if someone cannot live without it, it should - be renamed to SIOCGIFNUM. - */ - - -/* - * Count the installed interfaces (SIOCGIFCOUNT) - */ - -static int dev_ifcount(unsigned int *arg) -{ - struct device *dev; - unsigned int count = 0; - - for (dev = dev_base; dev != NULL; dev = dev->next) - count++; - - return put_user(count, arg); -} - /* * Map an interface index to its name (SIOCGIFNAME) */ @@ -1022,6 +994,11 @@ static int dev_ifcount(unsigned int *arg) * Besides that, it is pretty silly to put "drawing" facility * to kernel, it is useful only to print ifindices * in readable form, is not it? --ANK + * + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. 
--pb */ static int dev_ifname(struct ifreq *arg) @@ -1120,20 +1097,21 @@ static int sprintf_stats(char *buffer, struct device *dev) int size; if (stats) - size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", - dev->name, + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, stats->rx_fifo_errors, stats->rx_length_errors + stats->rx_over_errors + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, stats->tx_bytes, stats->tx_packets, stats->tx_errors, stats->tx_dropped, stats->tx_fifo_errors, stats->collisions, stats->tx_carrier_errors + stats->tx_aborted_errors + stats->tx_window_errors + stats->tx_heartbeat_errors, - stats->multicast); + stats->tx_compressed); else size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); @@ -1156,8 +1134,8 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer, - "Inter-| Receive | Transmit\n" - " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier multicast\n"); + "Inter-| Receive | Transmit\n" + " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); pos+=size; len+=size; @@ -1555,9 +1533,6 @@ int dev_ioctl(unsigned int cmd, void *arg) rtnl_shunlock(); return ret; } - if (cmd == SIOCGIFCOUNT) { - return dev_ifcount((unsigned int*)arg); - } if (cmd == SIOCGIFNAME) { return dev_ifname((struct ifreq *)arg); } diff --git a/net/core/iovec.c b/net/core/iovec.c index 18a9a3b5bbff..843eed65724c 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -80,6 +80,8 @@ out_free: /* * Copy kernel to iovec. + * + * Note: this modifies the original iovec. 
*/ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) @@ -107,6 +109,8 @@ out: /* * Copy iovec to kernel. + * + * Note: this modifies the original iovec. */ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) @@ -187,9 +191,8 @@ out: * call to this function will be unaligned also. */ -int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, int offset, - unsigned int len, int *csump) +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) { int partial_cnt = 0; int err = 0; @@ -246,9 +249,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, if (copy_from_user(kdata, base, copy)) goto out_fault; kdata += copy; - base += copy; + base += copy; partial_cnt += copy; - len -= copy; + len -= copy; iov++; if (len) continue; @@ -260,9 +263,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, goto out_fault; csum = csum_partial(kdata - partial_cnt, 4, csum); kdata += par_len; - base += par_len; - copy -= par_len; - len -= par_len; + base += par_len; + copy -= par_len; + len -= par_len; partial_cnt = 0; } @@ -278,16 +281,12 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, } } - /* Why do we want to break?? There may be more to copy ... */ - if (copy == 0) { -if (len > partial_cnt) -printk("csum_iovec: early break? 
len=%d, partial=%d\n", len, partial_cnt); - break; + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; } - - csum = csum_and_copy_from_user(base, kdata, copy, csum, &err); - if (err) - goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de3743e077c..9edb759b9a1b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,12 +153,14 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev) static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) { struct neighbour *n; + unsigned long now = jiffies; if (tbl->entries > tbl->gc_thresh1) { if (creat < 0) return NULL; - if (tbl->entries > tbl->gc_thresh2 || - jiffies - tbl->last_flush > 5*HZ) { + if (tbl->entries > tbl->gc_thresh3 || + (tbl->entries > tbl->gc_thresh2 && + now - tbl->last_flush > 5*HZ)) { if (neigh_forced_gc(tbl) == 0 && tbl->entries > tbl->gc_thresh3) return NULL; @@ -172,7 +174,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) memset(n, 0, tbl->entry_size); skb_queue_head_init(&n->arp_queue); - n->updated = n->used = jiffies; + n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->parms = &tbl->parms; @@ -666,8 +668,18 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int neigh_suspect(neigh); if (!(old&NUD_VALID)) { struct sk_buff *skb; - while ((skb=__skb_dequeue(&neigh->arp_queue)) != NULL) - neigh->output(skb); + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state&NUD_VALID && + (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + } + skb_queue_purge(&neigh->arp_queue); } return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 
6da5f5a0da57..f940e5a80f03 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -137,6 +137,8 @@ __u32 sysctl_wmem_default = SK_WMEM_MAX; __u32 sysctl_rmem_default = SK_RMEM_MAX; int sysctl_core_destroy_delay = SOCK_DESTROY_TIME; +/* Maximal space eaten by iovec (still not made (2.1.88)!) plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); /* * This is meant for all protocols to use and covers goings on @@ -472,11 +474,11 @@ static kmem_cache_t *sk_cachep; * usage. */ -struct sock *sk_alloc(int family, int priority) +struct sock *sk_alloc(int family, int priority, int zero_it) { struct sock *sk = kmem_cache_alloc(sk_cachep, priority); - if(sk) { + if(sk && zero_it) { memset(sk, 0, sizeof(struct sock)); sk->family = family; } @@ -561,34 +563,22 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int void *sock_kmalloc(struct sock *sk, int size, int priority) { void *mem = NULL; - /* Always use wmem.. */ - if (atomic_read(&sk->wmem_alloc)+size < sk->sndbuf) { + if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { /* First do the add, to avoid the race if kmalloc * might sleep. */ - atomic_add(size, &sk->wmem_alloc); + atomic_add(size, &sk->omem_alloc); mem = kmalloc(size, priority); - if (mem) - return mem; - atomic_sub(size, &sk->wmem_alloc); } return mem; } void sock_kfree_s(struct sock *sk, void *mem, int size) { -#if 1 /* Debug */ - if (atomic_read(&sk->wmem_alloc) < size) { - printk(KERN_DEBUG "sock_kfree_s: mem not accounted.\n"); - return; - } -#endif kfree_s(mem, size); - atomic_sub(size, &sk->wmem_alloc); - sk->write_space(sk); + atomic_sub(size, &sk->omem_alloc); } - /* FIXME: this is insane. We are trying suppose to be controlling how * how much space we have for data bytes, not packet headers. 
* This really points out that we need a better system for doing the @@ -633,6 +623,30 @@ unsigned long sock_wspace(struct sock *sk) return(0); } +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static void sock_wait_for_wmem(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} /* @@ -641,94 +655,78 @@ unsigned long sock_wspace(struct sock *sk) struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigned long fallback, int noblock, int *errcode) { + int err; struct sk_buff *skb; - do - { - if(sk->err!=0) - { - *errcode=xchg(&sk->err,0); - return NULL; - } - - if(sk->shutdown&SEND_SHUTDOWN) - { - /* - * FIXME: Check 1003.1g should we deliver - * a signal here ??? - */ - *errcode=-EPIPE; - return NULL; - } - - if(!fallback) + do { + if ((err = xchg(&sk->err,0)) != 0) + goto failure; + + /* + * FIXME: Check 1003.1g should we deliver + * a signal here ??? + * + * Alan, could we solve this question once and forever? + * + * I believe, datagram sockets should never + * generate SIGPIPE. Moreover, I DO think that + * TCP is allowed to generate it only on write() + * call, but never on send/sendto/sendmsg. + * (btw, Solaris generates it even on read() :-)) + * + * The reason is that SIGPIPE is global flag, + * so that library function using sockets (f.e. syslog()), + * must save/disable it on entry and restore on exit. + * As result, signal arriving for another thread will + * be lost. 
Generation it on write() is still necessary + * because a lot of stupid programs never check write() + * return value. + * + * Seems, SIGPIPE is very bad idea, sort of gets(). + * At least, we could have an option disabling + * this behaviour on per-socket and/or per-message base. + * BTW it is very easy - MSG_SIGPIPE flag, which + * always set by read/write and checked here. + * --ANK + */ + + err = -EPIPE; + if (sk->shutdown&SEND_SHUTDOWN) + goto failure; + + if (!fallback) skb = sock_wmalloc(sk, size, 0, sk->allocation); - else - { + else { /* The buffer get won't block, or use the atomic queue. It does produce annoying no free page messages still.... */ skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); - if(!skb) + if (!skb) skb=sock_wmalloc(sk, fallback, 0, sk->allocation); } - + /* * This means we have too many buffers for this socket already. */ - - if(skb==NULL) - { - unsigned long tmp; + /* The following code is stolen "as is" from tcp.c */ + + if (skb==NULL) { sk->socket->flags |= SO_NOSPACE; - if(noblock) - { - *errcode=-EAGAIN; - return NULL; - } - if(sk->shutdown&SEND_SHUTDOWN) - { - *errcode=-EPIPE; - return NULL; - } - tmp = atomic_read(&sk->wmem_alloc); - cli(); - if(sk->shutdown&SEND_SHUTDOWN) - { - sti(); - *errcode=-EPIPE; - return NULL; - } - -#if 1 - if( tmp <= atomic_read(&sk->wmem_alloc)) -#else - /* ANK: Line above seems either incorrect - * or useless. sk->wmem_alloc has a tiny chance to change - * between tmp = sk->w... and cli(), - * but it might(?) change earlier. In real life - * it does not (I never seen the message). 
- * In any case I'd delete this check at all, or - * change it to: - */ - if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) -#endif - { - sk->socket->flags &= ~SO_NOSPACE; - interruptible_sleep_on(sk->sleep); - if (signal_pending(current)) - { - sti(); - *errcode = -ERESTARTSYS; - return NULL; - } - } - sti(); + err = -EAGAIN; + if (noblock) + goto failure; + err = -ERESTARTSYS; + if (signal_pending(current)) + goto failure; + sock_wait_for_wmem(sk); } - } - while(skb==NULL); - + } while (skb==NULL); + return skb; + +failure: + *errcode = err; + return NULL; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1da2cc152a4b..47c85d006721 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,6 +22,7 @@ extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; ctl_table core_table[] = { {NET_CORE_WMEM_MAX, "wmem_max", @@ -53,6 +54,9 @@ ctl_table core_table[] = { {NET_CORE_MSG_BURST, "message_burst", &net_msg_burst, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_CORE_OPTMEM_MAX, "optmem_max", + &sysctl_optmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, { 0 } }; #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f0906502aa27..0ef804218d62 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.58 1997/10/29 20:27:21 kuznet Exp $ + * Version: $Id: af_inet.c,v 1.63 1998/03/08 05:56:12 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -52,6 +52,7 @@ * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -107,8 +108,8 @@ #ifdef CONFIG_BRIDGE #include #endif -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #ifdef CONFIG_NET_RADIO #include @@ -327,7 +328,7 @@ static int inet_create(struct socket *sock, int protocol) static int warned; if (net_families[AF_PACKET]==NULL) { -#if defined(CONFIG_KERNELD) && defined(CONFIG_PACKET_MODULE) +#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE) char module_name[30]; sprintf(module_name,"net-pf-%d", AF_PACKET); request_module(module_name); @@ -341,7 +342,7 @@ static int inet_create(struct socket *sock, int protocol) } sock->state = SS_UNCONNECTED; - sk = sk_alloc(AF_INET, GFP_KERNEL); + sk = sk_alloc(AF_INET, GFP_KERNEL, 1); if (sk == NULL) goto do_oom; @@ -894,7 +895,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCDRARP: case SIOCGRARP: case SIOCSRARP: -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (rarp_ioctl_hook == NULL) request_module("rarp"); #endif @@ -928,7 +929,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_DLCI_MODULE -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (dlci_ioctl_hook == NULL) request_module("dlci"); #endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 04b58943f454..c3067dc1c1a5 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.58 1997/12/13 21:52:46 kuznet Exp $ + * Version: $Id: arp.c,v 1.65 1998/03/08 20:52:34 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -189,7 +189,7 @@ struct neigh_table arp_tbl = NULL, parp_redo, { NULL, NULL, &arp_tbl, 0, NULL, NULL, - 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 1*HZ, 64 }, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ }, 30*HZ, 128, 512, 1024, }; @@ -954,6 +954,10 @@ int 
arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy struct device *dev = n->dev; int hatype = dev->type; + /* Do not confuse users "arp -a" with magic entries */ + if (!(n->nud_state&~NUD_NOARP)) + continue; + /* I'd get great pleasure deleting this ugly code. Let's output it in hexadecimal format. "arp" utility will eventually repaired --ANK diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d4e59cce3c30..025401aae108 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.15 1997/12/13 21:52:47 kuznet Exp $ + * Version: $Id: devinet.c,v 1.19 1998/03/08 20:52:35 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,6 +19,7 @@ * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. + Cyrus Durgin: updated for kmod */ #include @@ -49,8 +50,8 @@ #ifdef CONFIG_SYSCTL #include #endif -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif #include @@ -157,28 +158,32 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *ifa1 = *ifap; - struct in_ifaddr *ifa; - - /* 1. Unlink it */ - *ifap = ifa1->ifa_next; - - /* 2. Deleting primary ifaddr forces deletion all secondaries */ + /* 1. 
Deleting primary ifaddr forces deletion all secondaries */ if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) { - while ((ifa=*ifap) != NULL) { - if (ifa1->ifa_mask != ifa->ifa_mask || + struct in_ifaddr *ifa; + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa=*ifap1) != NULL) { + if (!(ifa->ifa_flags&IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { - ifap = &ifa->ifa_next; + ifap1 = &ifa->ifa_next; continue; } - *ifap = ifa->ifa_next; + *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } } + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 3. Announce address deletion */ /* Send message first, then call notifier. @@ -232,10 +237,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) ifap = last_primary; } - cli(); ifa->ifa_next = *ifap; + /* ATOMIC_SET */ *ifap = ifa; - sti(); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -413,7 +417,7 @@ int devinet_ioctl(unsigned int cmd, void *arg) *colon = 0; #endif -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD dev_load(ifr.ifr_name); #endif @@ -960,6 +964,8 @@ static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devcon t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); if (t->sysctl_header == NULL) kfree(t); + else + p->sysctl = t; } static void devinet_sysctl_unregister(struct ipv4_devconf *p) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 409db8209f8d..6350a6366006 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. 
* - * Version: $Id: fib_frontend.c,v 1.6 1997/12/13 21:52:48 kuznet Exp $ + * Version: $Id: fib_frontend.c,v 1.9 1998/03/08 20:52:36 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -151,7 +151,6 @@ struct device * ip_dev_find(u32 addr) memset(&key, 0, sizeof(key)); key.dst = addr; - key.scope = RT_SCOPE_UNIVERSE; if (!local_table || local_table->tb_lookup(local_table, &key, &res) || res.type != RTN_LOCAL) @@ -344,6 +343,10 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct fib_table *tb; + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return ip_rt_dump(skb, cb); + s_t = cb->args[0]; if (s_t == 0) s_t = cb->args[0] = RT_TABLE_MIN; @@ -423,8 +426,13 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa) u32 addr = ifa->ifa_local; u32 prefix = ifa->ifa_address&mask; - if (ifa->ifa_flags&IFA_F_SECONDARY) + if (ifa->ifa_flags&IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); @@ -435,7 +443,8 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY)) { + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); @@ -464,8 +473,13 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) if (!(ifa->ifa_flags&IFA_F_SECONDARY)) fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? 
RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim); - else + else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + } /* Deletion is more complicated than add. We should take care of not to delete too much :-) diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 33bcf0321616..4b89ab6767bc 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.1 1997/11/09 19:53:13 kuznet Exp $ + * Version: $Id: fib_hash.c,v 1.3 1998/03/08 05:56:16 davem Exp $ * * Authors: Alexey Kuznetsov, * diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 3ffb404b5cff..7ec60a5bea0a 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.2 1997/10/10 22:40:49 davem Exp $ + * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $ * * Authors: Alexey Kuznetsov, * diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3883fcba0414..d2d37e11e482 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. 
* - * Version: $Id: fib_semantics.c,v 1.6 1997/12/13 21:52:49 kuznet Exp $ + * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $ * * Authors: Alexey Kuznetsov, * diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 58cf2455b366..854c5cc5f1de 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, * - * Version: $Id: icmp.c,v 1.36 1997/12/04 03:42:03 freitag Exp $ + * Version: $Id: icmp.c,v 1.39 1998/03/08 05:56:19 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -928,10 +928,8 @@ int icmp_chkaddr(struct sk_buff *skb) struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); - if (!sk) return 0; - if (sk->saddr != iph->saddr) return 0; - if (sk->daddr != iph->daddr) return 0; - if (sk->dummy_th.dest != th->dest) return 0; + if (!sk || (sk->state == TCP_LISTEN)) + return 0; /* * This packet came from us. */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 3f082d08b4cf..74757adf8f74 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.24 1997/12/17 20:14:10 kuznet Exp $ + * Version: $Id: igmp.c,v 1.26 1998/03/08 05:56:19 davem Exp $ * * Authors: * Alan Cox diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 98650b644273..e136a16ca68d 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,7 +5,7 @@ * * The IP forwarding functionality. 
* - * Version: $Id: ip_forward.c,v 1.37 1997/12/18 17:01:11 kuznet Exp $ + * Version: $Id: ip_forward.c,v 1.40 1998/03/08 05:56:20 davem Exp $ * * Authors: see ip.c * diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dccb5324464..e6831adb8afe 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.30 1997/12/29 19:52:32 kuznet Exp $ + * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $ * * Authors: Fred N. van Kempen * Alan Cox diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 18449a7d550f..7502f6e4aa21 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -6,7 +6,7 @@ * license in recognition of the original copyright. * -- Alan Cox. * - * $Id: ip_fw.c,v 1.30 1997/12/19 12:06:27 freitag Exp $ + * $Id: ip_fw.c,v 1.32 1998/02/23 02:50:17 davem Exp $ * * Ported from BSD to Linux, * Alan Cox 22/Nov/1994. diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 5a364c08298e..28dee6bd0b9c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.25 1997/12/13 21:52:53 kuznet Exp $ + * Version: $Id: ip_input.c,v 1.27 1998/03/08 05:56:24 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index 797f9112ffd7..2265161f3315 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -12,6 +12,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * Changes: + * Cyrus Durgin: fixed kerneld stuff for kmod. 
*/ #include @@ -21,8 +23,8 @@ #include #include #include -#ifdef CONFIG_KERNELD -#include +#ifdef CONFIG_KMOD +#include #endif EXPORT_SYMBOL(register_ip_masq_mod); @@ -290,7 +292,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) { struct ip_masq_mod * mmod; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD char kmod_name[IP_MASQ_MOD_NMAX+8]; #endif /* tappo */ @@ -299,7 +301,7 @@ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) mmod = ip_masq_mod_getbyname(mctl->u.mod.name); if (mmod) return mmod->mmod_ctl(optname, mctl, optlen); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD sprintf(kmod_name,"ip_masq_%s", mctl->u.mod.name); IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name); diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index f7e28f21ab6e..377b8223e695 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -2,7 +2,7 @@ * IP_MASQ_RAUDIO - Real Audio masquerading module * * - * Version: @(#)$Id: ip_masq_raudio.c,v 1.8 1997/11/28 15:32:32 alan Exp $ + * Version: @(#)$Id: ip_masq_raudio.c,v 1.9 1998/02/23 02:50:19 davem Exp $ * * Author: Nigel Metheringham * Real Time Streaming code by Progressive Networks diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index b33f202ecc5a..3e3674ef717a 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.12 1997/10/10 22:41:08 davem Exp $ + * Version: $Id: ip_options.c,v 1.13 1998/02/12 07:43:12 davem Exp $ * * Authors: A.N.Kuznetsov * diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 07cec4c51c2e..bb0dac7f9cf2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. 
* - * Version: $Id: ip_output.c,v 1.45 1998/01/15 22:06:35 freitag Exp $ + * Version: $Id: ip_output.c,v 1.48 1998/03/08 05:56:25 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3ab8a9704289..f60c206a0ad7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.30 1997/12/29 19:52:39 kuznet Exp $ + * Version: $Id: ip_sockglue.c,v 1.32 1998/03/08 05:56:26 davem Exp $ * * Authors: see ip.c * diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 20521e643aeb..1e44ae8aa654 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.6 1998/01/09 17:19:46 mj Exp $ + * $Id: ipconfig.c,v 1.11 1998/02/12 07:43:16 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 0bc57d1e70ec..d0b3b5ff2e07 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.19 1997/11/08 17:50:21 kuznet Exp $ + * Version: $Id: ipip.c,v 1.22 1998/03/08 05:56:27 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d3c07dca316e..df8dc1896014 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.29 1997/12/13 21:52:55 kuznet Exp $ + * Version: $Id: ipmr.c,v 1.33 1998/03/08 20:52:37 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. 
@@ -1351,6 +1351,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct device *dev = vif_table[c->mfc_parent].dev; + u8 *b = skb->tail; #ifdef CONFIG_RTNL_OLD_IFINFO if (dev) { @@ -1389,10 +1390,11 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) return 1; rtattr_failure: + skb_trim(skb, b - skb->data); return -EMSGSIZE; } -int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) { struct mfc_cache *cache; struct rtable *rt = (struct rtable*)skb->dst; @@ -1400,10 +1402,16 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) start_bh_atomic(); cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { - struct device *dev = skb->dev; + struct device *dev; int vif; int err; + if (nowait) { + end_bh_atomic(); + return -EAGAIN; + } + + dev = skb->dev; if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) { end_bh_atomic(); return -ENODEV; @@ -1422,7 +1430,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) */ end_bh_atomic(); - if (rtm->rtm_flags & RTM_F_NOTIFY) + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; return ipmr_fill_mroute(skb, cache, rtm); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 7f3b5f9bbb9b..e4ca463e754a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $ + * Version: $Id: proc.c,v 1.24 1998/03/06 01:23:06 davem Exp $ * * Authors: Fred N. van Kempen, * Gerald J. 
Heim, @@ -113,8 +113,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld", i, src, srcp, dest, destp, sp->state, - format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), - format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc), + format==0?tp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), + format==0?tp->rcv_nxt-tp->copied_seq:atomic_read(&sp->rmem_alloc), timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? sp->socket->inode->i_uid:0, diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index 5a74d6e692fa..9fd174b68a29 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -3,7 +3,7 @@ * Copyright (C) 1994 by Ross Martin * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche * - * $Id: rarp.c,v 1.21 1997/10/27 09:13:16 geert Exp $ + * $Id: rarp.c,v 1.24 1998/03/08 05:56:30 davem Exp $ * * This module implements the Reverse Address Resolution Protocol * (RARP, RFC 903), which is used to convert low level addresses such diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 12b612363bbc..735d06d44640 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.33 1997/12/27 20:41:15 kuznet Exp $ + * Version: $Id: raw.c,v 1.35 1998/03/08 05:56:32 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8579ee72691b..0101caaa3fc2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.36 1997/12/17 20:14:18 kuznet Exp $ + * Version: $Id: route.c,v 1.41 1998/03/08 20:52:38 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -167,7 +167,7 @@ __u8 ip_tos2prio[16] = { static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; -static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol); +static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth); static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) { @@ -301,6 +301,8 @@ static void rt_run_flush(unsigned long dummy) int i; struct rtable * rth, * next; + rt_deadline = 0; + for (i=0; i 0 && rt_deadline) { - long tmo = (long)(rt_deadline - rt_flush_timer.expires); + long tmo = (long)(rt_deadline - now); /* If flush timer is already running and flush request is not immediate (delay > 0): - if deadline is not achieved, prolongate timer to "dealy", + if deadline is not achieved, prolongate timer to "delay", otherwise fire it at deadline time. */ + if (user_mode && (long)(rt_deadline-now) < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + if (delay > tmo) delay = tmo; } if (delay <= 0) { - rt_deadline = 0; end_bh_atomic(); - rt_run_flush(0); return; } if (rt_deadline == 0) - rt_deadline = jiffies + ip_rt_max_delay; + rt_deadline = now + ip_rt_max_delay; - rt_flush_timer.expires = jiffies + delay; + rt_flush_timer.expires = now + delay; add_timer(&rt_flush_timer); end_bh_atomic(); } @@ -400,7 +406,7 @@ out: return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size); } -static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol) +static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) { struct rtable *rth, **rthp; unsigned long now = jiffies; @@ -472,7 +478,9 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { - if (ip_fib_check_default(new_gw, dev)) + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(new_gw) != RTN_UNICAST) @@ -504,9 
+512,13 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rth->u.dst.dev != dev) break; + dst_clone(&rth->u.dst); + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); - if (rt == NULL) + if (rt == NULL) { + ip_rt_put(rth); return; + } /* * Copy all the information. @@ -531,14 +543,16 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, if (rt->u.dst.neighbour) neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rt); + ip_rt_put(rth); rt_free(rt); break; } *rthp = rth->u.rt_next; - rt_free(rth); - rt = rt_intern_hash(hash, rt, ETH_P_IP); + rt = rt_intern_hash(hash, rt); ip_rt_put(rt); + ip_rt_put(rth); + rt_free(rth); break; } } @@ -762,19 +776,45 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) u32 src; struct fib_result res; - if (rt->key.iif == 0) { - memcpy(addr, &rt->rt_src, 4); - return; - } - if (fib_lookup(&rt->key, &res) == 0) { + if (rt->key.iif == 0) + src = rt->rt_src; + else if (fib_lookup(&rt->key, &res) == 0) src = FIB_RES_PREFSRC(res); - memcpy(addr, &src, 4); - return; - } - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + else + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } +static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + + if (fi) { + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); +#ifndef CONFIG_RTNL_OLD_IFINFO + rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1]; + rt->u.dst.pmtu = fi->fib_mtu; + if (fi->fib_mtu == 0) { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.mxlock&(1<rt_gateway != rt->rt_dst && + rt->u.dst.pmtu > 576) + rt->u.dst.pmtu = 576; + } +#else + rt->u.dst.pmtu = fi->fib_mtu ? : rt->u.dst.dev->mtu; +#endif + rt->u.dst.window= fi->fib_window ? : 0; + rt->u.dst.rtt = fi->fib_rtt ? 
: TCP_TIMEOUT_INIT; + } else { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + rt->u.dst.window= 0; + rt->u.dst.rtt = TCP_TIMEOUT_INIT; + } + rt->rt_type = res->type; +} + static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct device *dev, int our) @@ -832,7 +872,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, #endif hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; } @@ -990,18 +1030,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; - rth->u.dst.window=res.fi->fib_window ? : 0; - rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; -#ifndef CONFIG_RTNL_OLD_IFINFO - rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; -#endif - - if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) - rth->rt_gateway = FIB_RES_GW(res); + rt_set_nexthop(rth, &res); rth->rt_flags = flags; - rth->rt_type = res.type; #ifdef CONFIG_NET_FASTROUTE if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { @@ -1014,7 +1045,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, } #endif - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; brd_input: @@ -1062,7 +1093,7 @@ local_input: } rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; no_route: @@ -1362,7 +1393,7 @@ make_route: rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif - rth->rt_iif = dev_out->ifindex; + rth->rt_iif = oif ? 
: dev_out->ifindex; rth->u.dst.dev = dev_out; rth->rt_gateway = key.dst; rth->rt_spec_dst= key.src; @@ -1388,24 +1419,12 @@ make_route: #endif } - if (res.fi) { - if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) - rth->rt_gateway = FIB_RES_GW(res); - rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu; - rth->u.dst.window=res.fi->fib_window ? : 0; - rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; -#ifndef CONFIG_RTNL_OLD_IFINFO - rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; -#endif - } else { - rth->u.dst.pmtu = dev_out->mtu; - rth->u.dst.window=0; - rth->u.dst.rtt = TCP_TIMEOUT_INIT; - } + rt_set_nexthop(rth, &res); + rth->rt_flags = flags; - rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); - *rp = rt_intern_hash(hash, rth, ETH_P_IP); + *rp = rt_intern_hash(hash, rth); return 0; } @@ -1444,6 +1463,113 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) #ifdef CONFIG_RTNETLINK +static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; +#ifdef CONFIG_IP_MROUTE + struct rtattr *eptr; +#endif +#ifdef CONFIG_RTNL_OLD_IFINFO + unsigned char *o; +#else + struct rtattr *mx; +#endif + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = nowait ? 
NLM_F_MULTI : 0; + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 32; + r->rtm_tos = rt->key.tos; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_nhs = 0; + + o = skb->tail; +#endif + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); +#ifdef CONFIG_RTNL_OLD_IFINFO + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); +#else + mx = (struct rtattr*)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.mxlock) + RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); + mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); +#endif + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + ci.rta_lastuse = jiffies - rt->u.dst.lastuse; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); + ci.rta_clntref = atomic_read(&rt->u.dst.use); + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; +#ifdef CONFIG_IP_MROUTE + eptr = (struct rtattr*)skb->tail; +#endif + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_optlen = skb->tail - o; +#endif + if (rt->key.iif) { +#ifdef CONFIG_IP_MROUTE + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { + int err = 
ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nlmsg_failure; + } else { + if (err == -EMSGSIZE) + goto nlmsg_failure; + ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + } + } + } else +#endif + { + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_optlen = skb->tail - o; +#endif + } + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; @@ -1454,12 +1580,6 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) int iif = 0; int err; struct sk_buff *skb; - struct rta_cacheinfo ci; -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#else - struct rtattr *mx; -#endif skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) @@ -1506,83 +1626,53 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - RTM_NEWROUTE, sizeof(*rtm)); - rtm = NLMSG_DATA(nlh); - nlh->nlmsg_flags = 0; - rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = 32; - rtm->rtm_src_len = 32; - rtm->rtm_tos = rt->key.tos; - rtm->rtm_table = RT_TABLE_MAIN; - rtm->rtm_type = rt->rt_type; - rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = RTPROT_UNSPEC; - rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; + NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; + + err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); + if (err == 0) + return 0; + if (err < 0) + return -EMSGSIZE; - o = skb->tail; -#endif - RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); - RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); - if (rt->u.dst.dev) - RTA_PUT(skb, RTA_OIF, sizeof(int), 
&rt->u.dst.dev->ifindex); - if (rt->rt_dst != rt->rt_gateway) - RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); -#ifdef CONFIG_RTNL_OLD_IFINFO - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); -#else - mx = (struct rtattr*)skb->tail; - RTA_PUT(skb, RTA_METRICS, 0, NULL); - if (rt->u.dst.mxlock) - RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); - if (rt->u.dst.pmtu) - RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - if (rt->u.dst.window) - RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); - if (rt->u.dst.rtt) - RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); - mx->rta_len = skb->tail - (u8*)mx; -#endif - RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); - ci.rta_lastuse = jiffies - rt->u.dst.lastuse; - ci.rta_used = atomic_read(&rt->u.dst.refcnt); - ci.rta_clntref = atomic_read(&rt->u.dst.use); - ci.rta_expires = 0; - ci.rta_error = rt->u.dst.error; - RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif - if (iif) { -#ifdef CONFIG_IP_MROUTE - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { - NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; - err = ipmr_get_route(skb, rtm); - if (err <= 0) - return err; - } else -#endif - { - RTA_PUT(skb, RTA_IIF, sizeof(int), &iif); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif - } - } - nlh->nlmsg_len = skb->tail - (u8*)nlh; err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err < 0) return err; return 0; +} -nlmsg_failure: -rtattr_failure: - kfree_skb(skb); - return -EMSGSIZE; + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for (h=0; h < RT_HASH_DIVISOR; h++) { + if (h < s_h) continue; 
+ if (h > s_h) + memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int)); + start_bh_atomic(); + for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + end_bh_atomic(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + end_bh_atomic(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; } #endif /* CONFIG_RTNETLINK */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 7d119716ed7f..00dd0a8efae6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $ + * $Id: syncookies.c,v 1.4 1998/03/08 05:56:34 davem Exp $ * * Missing: IPv6 support. * Some counter so that the Administrator can see when the machine diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 606b1e69f1ba..d453e55b0590 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.25 1998/01/15 22:40:57 freitag Exp $ + * $Id: sysctl_net_ipv4.c,v 1.26 1998/03/08 05:56:35 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 17ec6def9861..837d6061e0a0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.77 1998/01/15 22:40:18 freitag Exp $ + * Version: $Id: tcp.c,v 1.87 1998/03/10 05:11:14 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -425,6 +425,7 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; struct tcp_mib tcp_statistics; kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; /* * Find someone to 'accept'. Must be called with @@ -512,7 +513,7 @@ static int tcp_readable(struct sock *sk) return(0); } - counted = sk->copied_seq; /* Where we are at the moment */ + counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ amount = 0; /* Do until a push or until we are out of data. */ @@ -606,10 +607,10 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; - if ((tp->rcv_nxt != sk->copied_seq) && - (sk->urg_seq != sk->copied_seq || - tp->rcv_nxt != sk->copied_seq+1 || - sk->urginline || !sk->urg_data)) + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq+1 || + sk->urginline || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; #if 1 /* This needs benchmarking and real world tests */ @@ -621,9 +622,9 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) space = atomic_read(&sk->wmem_alloc) / 2; #endif /* Always wake the user up when an error occured */ - if (sock_wspace(sk) >= space) + if (sock_wspace(sk) >= space || sk->err) mask |= POLLOUT | POLLWRNORM; - if (sk->urg_data) + if (tp->urg_data) mask |= POLLPRI; } return mask; @@ -649,7 +650,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) } case SIOCATMARK: { - int answ = sk->urg_data && sk->urg_seq == sk->copied_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int answ = tp->urg_data && tp->urg_seq == tp->copied_seq; return put_user(answ,(int *) arg); } case TIOCOUTQ: @@ -720,14 +722,15 @@ static void wait_for_tcp_memory(struct sock * sk) lock_sock(sk); } - +/* + * Add more stuff to the end of the skb. 
+ */ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from, int tcp_size, int seglen) { int fault; int copy; - /* Add more stuff to the end of the skb. */ copy = min(sk->mss - tcp_size, skb_tailroom(skb)); copy = min(copy, seglen); @@ -740,7 +743,7 @@ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from, skb_put(skb, copy); skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0); - sk->write_seq += copy; + sk->tp_pinfo.af_tcp.write_seq += copy; skb->end_seq += copy; return copy; @@ -882,7 +885,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) tmp += min(sk->mss, tp->max_window); else tmp += copy; - skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); /* If we didn't get any memory, we need to sleep. */ @@ -933,12 +935,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from += copy; copied += copy; - sk->write_seq += copy; + tp->write_seq += copy; tcp_send_skb(sk, skb); - - release_sock(sk); - lock_sock(sk); } } @@ -980,7 +979,7 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ - if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ) + if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->err) @@ -1000,18 +999,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, } lock_sock(sk); - if (sk->urg_data & URG_VALID) { - char c = sk->urg_data; + if (tp->urg_data & URG_VALID) { + char c = tp->urg_data; if (!(flags & MSG_PEEK)) - sk->urg_data = URG_READ; - - if(len>0) - { - err = memcpy_toiovec(msg->msg_iov, &c, 1); - msg->msg_flags|=MSG_OOB; - } - else - msg->msg_flags|=MSG_TRUNC; + tp->urg_data = URG_READ; if(msg->msg_name) tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) @@ -1023,6 +1014,15 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, /* Read urgent data. 
*/ msg->msg_flags|=MSG_OOB; release_sock(sk); + + if(len>0) + { + err = memcpy_toiovec(msg->msg_iov, &c, 1); + msg->msg_flags|=MSG_OOB; + } + else + msg->msg_flags|=MSG_TRUNC; + return err ? -EFAULT : 1; } release_sock(sk); @@ -1044,45 +1044,37 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { - sk->tp_pinfo.af_tcp.delayed_acks++; - __skb_unlink(skb, &sk->receive_queue); kfree_skb(skb); } - -static void cleanup_rbuf(struct sock *sk) +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb; - struct tcp_opt *tp; /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ while ((skb=skb_peek(&sk->receive_queue)) != NULL) { - if (!skb->used || atomic_read(&skb->users)>1) + if (!skb->used || atomic_read(&skb->users) > 1) break; tcp_eat_skb(sk, skb); } SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk)); - tp = &(sk->tp_pinfo.af_tcp); - - /* We send a ACK if the sender is blocked - * else let tcp_data deal with the acking policy. + /* We send a ACK if we can now advertise a non-zero window + * which has been raised "significantly". */ - if (tp->delayed_acks) { - __u32 rcv_wnd; - - /* FIXME: double check this rule, then check against - * other use of similar rules. Abtract if possible. 
- */ - rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - - if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd)) - tcp_read_wakeup(sk); - } + if(tcp_timer_is_set(sk, TIME_DACK) && + (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp))) + tcp_read_wakeup(sk); } @@ -1113,8 +1105,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, * the multi-reader case neatly (memcpy_to/fromfs might be * inline and thus not flush cached variables otherwise). */ - peek_seq = sk->copied_seq; - seq = &sk->copied_seq; + peek_seq = tp->copied_seq; + seq = &tp->copied_seq; if (flags & MSG_PEEK) seq = &peek_seq; @@ -1129,7 +1121,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, u32 offset; /* Are we at urgent data? Stop if we have read anything. */ - if (copied && sk->urg_data && sk->urg_seq == *seq) + if (copied && tp->urg_data && tp->urg_seq == *seq) break; /* We need to check signals first, to get correct SIGURG @@ -1200,7 +1192,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, break; } - cleanup_rbuf(sk); + cleanup_rbuf(sk, copied); release_sock(sk); sk->socket->flags |= SO_WAITDATA; schedule(); @@ -1222,8 +1214,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, used = len; /* Do we have urgent data here? */ - if (sk->urg_data) { - u32 urg_offset = sk->urg_seq - *seq; + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sk->urginline) { @@ -1264,8 +1256,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, */ atomic_dec(&skb->users); - if (after(sk->copied_seq,sk->urg_seq)) - sk->urg_data = 0; + if (after(tp->copied_seq,tp->urg_seq)) + tp->urg_data = 0; if (used + offset < skb->len) continue; @@ -1303,7 +1295,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, current->state = TASK_RUNNING; /* Clean up data we have read: This will do ACK frames. 
*/ - cleanup_rbuf(sk); + cleanup_rbuf(sk, copied); release_sock(sk); return copied; } @@ -1421,7 +1413,6 @@ void tcp_close(struct sock *sk, unsigned long timeout) tcp_close_pending(sk); release_sock(sk); sk->dead = 1; - sk->prot->unhash(sk); return; } @@ -1479,9 +1470,6 @@ void tcp_close(struct sock *sk, unsigned long timeout) sk->dead = 1; release_sock(sk); - - if(sk->state == TCP_CLOSE) - sk->prot->unhash(sk); } /* @@ -1538,13 +1526,12 @@ struct sock *tcp_accept(struct sock *sk, int flags) /* If this is a non blocking socket don't sleep */ error = EAGAIN; if (flags & O_NONBLOCK) - goto out; + goto out; error = ERESTARTSYS; req = wait_for_connect(sk, &prev); if (!req) - goto out; - error = 0; + goto out; } tcp_synq_unlink(tp, req, prev); @@ -1647,9 +1634,16 @@ void tcp_set_keepalive(struct sock *sk, int val) __initfunc(void tcp_init(void)) { tcp_openreq_cachep = kmem_cache_create("tcp_open_request", - sizeof(struct open_request), + sizeof(struct open_request), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!tcp_openreq_cachep) panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8413597390a9..9f2989082ab9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $ + * Version: $Id: tcp_input.c,v 1.74 1998/03/10 05:11:15 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -78,46 +78,32 @@ int sysctl_tcp_stdurg; static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; -/* - * Called each time to estimate the delayed ack timeout. This is - * how it should be done so a fast link isnt impacted by ack delay. 
- * - * I think we need a medium deviation here also... - * The estimated value is changing to fast +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM */ - static void tcp_delack_estimator(struct tcp_opt *tp) { - int m; - - /* Delayed ACK time estimator. */ - - m = jiffies - tp->lrcvtime; - - tp->lrcvtime = jiffies; - - if (m < 0) - return; - - /* if the mesured value is bigger than - * twice the round trip time ignore it. - */ - if ((m << 2) <= tp->srtt) { - m -= (tp->iat >> 3); - tp->iat += m; - - if (m <0) - m = -m; - - m -= (tp->iat_mdev >> 2); - tp->iat_mdev += m; - - tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2); + if(tp->ato == 0) { + tp->lrcvtime = jiffies; + tp->ato = 2; /* Help sender leave slow start quickly */ + } else { + int m = jiffies - tp->lrcvtime; - if (tp->ato < HZ/50) - tp->ato = HZ/50; - } else - tp->ato = 0; + tp->lrcvtime = jiffies; + if(m <= 0) + m = 1; + if(m > tp->rto) + tp->ato = tp->rto; + else + tp->ato = (tp->ato >> 1) + m; + } } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -401,89 +387,6 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt * return 1; } -#if 0 - -/* - * This is the old fast retransmit code. It will go away eventually. -- erics - */ - -/* - * See draft-stevens-tcpca-spec-01 for documentation. 
- */ - -static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) -{ - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - - /* FIXME: if we are already retransmitting should this code - * be skipped? [Floyd high_seq check sort of does this] - * The case I'm worried about is falling into a fast - * retransmit on a link with a congestion window of 1 or 2. - * There was some evidence in 2.0.x that this was problem - * on really slow links (1200 or 2400 baud). I need to - * try this situation again and see what happens. - */ - - /* - * An ACK is a duplicate if: - * (1) it has the same sequence number as the largest number we've - * seen, - * (2) it has the same window as the last ACK, - * (3) we have outstanding data that has not been ACKed - * (4) The packet was not carrying any data. - * (5) [From Floyds paper on fast retransmit wars] - * The packet acked data after high_seq; - */ - - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { - /* 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. - */ - if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; - - if (tp->dup_acks == 3) { - tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); - tp->snd_cwnd = tp->snd_ssthresh + 3; - tcp_do_retransmit(sk, 0); - - /* Careful not to timeout just after fast - * retransmit! - */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } - - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". - */ - if (tp->dup_acks >= 3) { - tp->dup_acks++; - tp->snd_cwnd++; - } - } else { - /* 3. When the next ACK arrives that acknowledges new data, - * set cwnd to ssthresh. 
- */ - if (tp->dup_acks >= 3) { - tp->retrans_head = NULL; - tp->snd_cwnd = max(tp->snd_ssthresh, 1); - tp->retransmits = 0; - } - tp->dup_acks = 0; - - /* FIXME: This is wrong if the new ack that arrives - * is below the value for high_seq. - */ - tp->high_seq = 0; - } -} -#endif - #define FLAG_DATA 0x01 #define FLAG_WIN_UPDATE 0x02 #define FLAG_DATA_ACKED 0x04 @@ -596,9 +499,6 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) clear_fast_retransmit(sk); } } - } else { - /* Clear any aborted fast retransmit starts. */ - tp->dup_acks = 0; } } @@ -749,7 +649,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n", skb->seq, skb->end_seq); #endif - acked = FLAG_DATA_ACKED; /* FIXME: packet counting may break if we have to @@ -766,11 +665,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, kfree_skb(skb); } - if (acked) { + if (acked) tp->retrans_head = NULL; - if (!sk->dead) - sk->write_space(sk); - } + return acked; } @@ -795,6 +692,66 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) } } +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. 
(Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, + u32 seq, u32 ack, int flag) +{ + __u32 seq_rtt = (jiffies-tp->rcv_tsecr); + tcp_rtt_estimator(tp, seq_rtt); + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->backoff = 0; + tcp_set_rto(tp); + } else { + /* Still retransmitting, use backoff */ + tcp_set_rto(tp); + tp->rto = tp->rto << tp->backoff; + } + } else { + tcp_set_rto(tp); + if (flag & FLAG_DATA_ACKED) + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } + /* NOTE: safe here so long as cong_ctl doesn't use rto */ + tcp_bound_rto(tp); +} + +static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb; + long when; + + skb = skb_peek(&sk->write_queue); + when = tp->rto - (jiffies - skb->when); + + /* FIXME: This assumes that when we are retransmitting + * we should only ever respond with one packet. + * This means congestion windows should not grow + * during recovery. In 2.0.X we allow the congestion + * window to grow. It is not clear to me which + * decision is correct. The RFCs should be double + * checked as should the behavior of other stacks. + * Also note that if we do want to allow the + * congestion window to grow during retransmits + * we have to fix the call to congestion window + * updates so that it works during retransmission. + */ + if (tp->retransmits) { + tp->retrans_head = NULL; + + /* This is tricky. We are retransmiting a + * segment of a window when congestion occured. + */ + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + } +} + /* * This routine deals with incoming acks, but not outgoing ones. 
*/ @@ -806,7 +763,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, int flag = 0; u32 seq = 0; u32 seq_rtt = 0; - struct sk_buff *skb; if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ @@ -869,28 +825,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { - /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) - */ - seq_rtt = (jiffies-tp->rcv_tsecr); - tcp_rtt_estimator(tp, seq_rtt); - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->backoff = 0; - tcp_set_rto(tp); - } else { - /* Still retransmitting, use backoff */ - tcp_set_rto(tp); - tp->rto = tp->rto << tp->backoff; - } - } else { - tcp_set_rto(tp); - if (flag & FLAG_DATA_ACKED) - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); - } - /* NOTE: safe here so long as cong_ctl doesn't use rto */ - tcp_bound_rto(tp); + tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); } else { /* If we were retransmiting don't count rtt estimate. */ if (tp->retransmits) { @@ -916,47 +851,25 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } if (tp->packets_out) { - if (flag & FLAG_DATA_ACKED) { - long when; - - skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - /* FIXME: This assumes that when we are retransmitting - * we should only ever respond with one packet. - * This means congestion windows should not grow - * during recovery. In 2.0.X we allow the congestion - * window to grow. It is not clear to me which - * decision is correct. The RFCs should be double - * checked as should the behavior of other stacks. - * Also note that if we do want to allow the - * congestion window to grow during retransmits - * we have to fix the call to congestion window - * updates so that it works during retransmission. - */ - if (tp->retransmits) { - tp->retrans_head = NULL; - - /* This is tricky. 
We are retransmiting a - * segment of a window when congestion occured. - */ - tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); - } - } else + if (flag & FLAG_DATA_ACKED) + tcp_ack_packets_out(sk, tp); + } else { tcp_clear_xmit_timer(sk, TIME_RETRANS); + } - tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); - + flag &= (FLAG_DATA | FLAG_WIN_UPDATE); + if ((ack == tp->snd_una && tp->packets_out && flag == 0) || + (tp->high_seq != 0)) { + tcp_fast_retrans(sk, ack, flag); + } else { + /* Clear any aborted fast retransmit starts. */ + tp->dup_acks = 0; + } /* Remember the highest ack received. */ tp->snd_una = ack; - return 1; uninteresting_ack: - SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } @@ -978,15 +891,12 @@ uninteresting_ack: static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if(sk->state == TCP_SYN_SENT) { /* RFC793 says to drop the segment and return. */ return 1; } - /* XXX This fin_seq thing should disappear... 
-DaveM */ - tp->fin_seq = skb->end_seq; + sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq; tcp_send_ack(sk); @@ -1060,7 +970,7 @@ static void tcp_ofo_queue(struct sock *sk) struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - while ((skb = skb_peek(&sk->out_of_order_queue))) { + while ((skb = skb_peek(&tp->out_of_order_queue))) { if (after(skb->seq, tp->rcv_nxt)) break; @@ -1095,7 +1005,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; tcp_ofo_queue(sk); - if (skb_queue_len(&sk->out_of_order_queue) == 0) + if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); return; } @@ -1127,10 +1037,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); - if (skb_peek(&sk->out_of_order_queue) == NULL) { - skb_queue_head(&sk->out_of_order_queue,skb); + if (skb_peek(&tp->out_of_order_queue) == NULL) { + skb_queue_head(&tp->out_of_order_queue,skb); } else { - for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { + for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. */ if (skb->seq == skb1->seq && skb->len >= skb1->len) { skb_append(skb1, skb); @@ -1145,8 +1055,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* See if we've hit the start. If so insert. */ - if (skb1 == skb_peek(&sk->out_of_order_queue)) { - skb_queue_head(&sk->out_of_order_queue,skb); + if (skb1 == skb_peek(&tp->out_of_order_queue)) { + skb_queue_head(&tp->out_of_order_queue,skb); break; } } @@ -1172,18 +1082,14 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) if (skb->len == 0 && !th->fin) return(0); - /* FIXME: don't accept data after the received fin. - * - * Would checking snd_seq against fin_seq be enough? - * If so, how do we handle that case exactly? 
-DaveM - */ + /* FIXME: don't accept data after the received fin. */ /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); - if (before(tp->rcv_nxt, sk->copied_seq)) { + if (before(tp->rcv_nxt, tp->copied_seq)) { printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); - tp->rcv_nxt = sk->copied_seq; + tp->rcv_nxt = tp->copied_seq; } tp->delayed_acks++; @@ -1213,11 +1119,8 @@ static void tcp_data_snd_check(struct sock *sk) * each packet it fires onto the wire. -DaveM */ tcp_write_xmit(sk); - if(!sk->dead) - sk->write_space(sk); } else if (tp->packets_out == 0 && !tp->pending) { /* Data to queue but no room. */ - /* FIXME: Is it right to do a zero window probe into * a congestion window limited window??? -- erics */ @@ -1242,10 +1145,16 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) * - must send at least every 2 full sized packets */ - if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) + /* Two full frames received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || + /* We will update the window "significantly" */ + tcp_raise_window(sk)) { + /* Then ack it now */ tcp_send_ack(sk); - else - tcp_send_delayed_ack(sk, HZ/2); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(tp, HZ/2); + } } static __inline__ void tcp_ack_snd_check(struct sock *sk) @@ -1279,11 +1188,11 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ - if (after(sk->copied_seq, ptr)) + if (after(tp->copied_seq, ptr)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ - if (sk->urg_data && !after(ptr, sk->urg_seq)) + if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. */ @@ -1296,14 +1205,14 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) /* We may be adding urgent data when the last byte read was * urgent. 
To do this requires some care. We cannot just ignore - * sk->copied_seq since we would read the last urgent byte again + * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ - if (sk->urg_seq == sk->copied_seq) - sk->copied_seq++; /* Move the copied sequence on correctly */ - sk->urg_data = URG_NOTYET; - sk->urg_seq = ptr; + if (tp->urg_seq == tp->copied_seq) + tp->copied_seq++; /* Move the copied sequence on correctly */ + tp->urg_data = URG_NOTYET; + tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; @@ -1312,17 +1221,19 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) /* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (sk->urg_data == URG_NOTYET) { - u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4); + if (tp->urg_data == URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -1335,33 +1246,33 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len */ static void prune_queue(struct sock *sk) { - struct tcp_opt *tp; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; - SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq); + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); /* First Clean the out_of_order queue. 
*/ /* Start with the end because there are probably the least * useful packets (crossing fingers). */ - while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) { + while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) return; } - tp = &sk->tp_pinfo.af_tcp; - /* Now continue with the receive queue if it wasn't enough */ while ((skb = skb_peek_tail(&sk->receive_queue))) { /* Never remove packets that have been already acked */ if (before(skb->end_seq, tp->last_ack_sent+1)) { printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", - sk->copied_seq, skb->end_seq, tp->last_ack_sent); + tp->copied_seq, skb->end_seq, tp->last_ack_sent); break; } skb_unlink(skb); tp->rcv_nxt = skb->seq; + SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", + skb->seq, skb->end_seq, tp->copied_seq); kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; @@ -1429,7 +1340,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else if (skb->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; @@ -1444,15 +1354,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, sk->data_ready(sk, 0); tcp_delack_estimator(tp); -#if 1 /* This checks for required window updates too. 
*/ tp->delayed_acks++; __tcp_ack_snd_check(sk); -#else - if (tp->delayed_acks++ == 0) - tcp_send_delayed_ack(sk, HZ/2); - else - tcp_send_ack(sk); -#endif return 0; } } @@ -1469,7 +1372,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - if(th->syn && skb->seq != sk->syn_seq) { + if(th->syn && skb->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); @@ -1657,7 +1560,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); @@ -1687,7 +1589,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr); sk->dummy_th.dest = th->source; - sk->copied_seq = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { sk->state_change(sk); @@ -1819,7 +1721,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * original syn. 
*/ - if (th->syn && skb->seq!=sk->syn_seq) { + if (th->syn && skb->seq!=tp->syn_seq) { tcp_reset(sk, skb); return 1; } @@ -1833,7 +1735,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dummy_th.dest=th->source; - sk->copied_seq = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; if(!sk->dead) sk->state_change(sk); @@ -1850,7 +1752,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_FIN_WAIT1: - if (tp->snd_una == sk->write_seq) { + if (tp->snd_una == tp->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) @@ -1861,12 +1763,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_CLOSING: - if (tp->snd_una == sk->write_seq) + if (tp->snd_una == tp->write_seq) tcp_time_wait(sk); break; case TCP_LAST_ACK: - if (tp->snd_una == sk->write_seq) { + if (tp->snd_una == tp->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e4f8981ac7c2..9354f946310a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $ + * Version: $Id: tcp_ipv4.c,v 1.99 1998/03/10 05:11:18 davem Exp $ * * IPv4 specific functions * @@ -89,16 +89,19 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, */ struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; +/* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ +struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; + /* All sockets in TCP_LISTEN state will be in here. This is the only table * where wildcard'd TCP sockets can exist. Hash function here is just local * port number. 
*/ struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; -/* Ok, let's try this, I give up, we do need a local binding - * TCP hash as well as the others for fast bind/connect. - */ -struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; +/* Register cache. */ +struct sock *tcp_regs[TCP_NUM_REGS]; /* * This array holds the first and last local port number. @@ -106,6 +109,7 @@ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; * 32768-61000 */ int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = (1024 - 1); static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) @@ -123,155 +127,133 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk) return tcp_hashfn(laddr, lport, faddr, fport); } -static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) +/* Invariant, sk->num is non-zero. */ +void tcp_bucket_unlock(struct sock *sk) { - struct sock *sk2; - int retval = 0, sk_reuse = sk->reuse; + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; SOCKHASH_LOCK(); - sk2 = tcp_bound_hash[tcp_bhashfn(snum)]; - for(; sk2 != NULL; sk2 = sk2->bind_next) { - if((sk2->num == snum) && (sk2 != sk)) { - unsigned char state = sk2->state; - int sk2_reuse = sk2->reuse; - - /* Two sockets can be bound to the same port if they're - * bound to different interfaces. 
- */ - - if(sk->bound_dev_if != sk2->bound_dev_if) - continue; - - if(!sk2->rcv_saddr || !sk->rcv_saddr) { - if((!sk2_reuse) || - (!sk_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } - } else if(sk2->rcv_saddr == sk->rcv_saddr) { - if((!sk_reuse) || - (!sk2_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) { + if(tb->port == snum) { + if(tb->owners == NULL && + (tb->flags & TCPB_FLAG_LOCKED)) { + tb->flags &= ~TCPB_FLAG_LOCKED; + tcp_inc_slow_timer(TCP_SLT_BUCKETGC); } + break; } } SOCKHASH_UNLOCK(); +} - return retval; +struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) +{ + struct tcp_bind_bucket *tb; + + tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); + if(tb != NULL) { + struct tcp_bind_bucket **head = + &tcp_bound_hash[tcp_bhashfn(snum)]; + tb->port = (snum | 0x10000); + tb->owners = NULL; + if((tb->next = *head) != NULL) + tb->next->pprev = &tb->next; + *head = tb; + tb->pprev = head; + } + return tb; } -static __inline__ int tcp_lport_inuse(int num) +static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) { - struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)]; + struct tcp_bind_bucket *tb; + int result = 0; - for(; sk != NULL; sk = sk->bind_next) { - if(sk->num == num) - return 1; + SOCKHASH_LOCK(); + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + (tb && (tb->port != snum)); + tb = tb->next) + ; + if(tb && tb->owners) { + /* Fast path for reuse ports, see include/net/tcp.h for a very + * detailed description of why this works, and why it is worth + * the effort at all. -DaveM + */ + if((tb->flags & TCPB_FLAG_FASTREUSE) && + (sk->reuse != 0)) { + goto go_like_smoke; + } else { + struct sock *sk2; + int sk_reuse = sk->reuse; + + /* We must walk the whole port owner list in this case. 
-DaveM */ + for(sk2 = tb->owners; sk2; sk2 = sk2->tp_pinfo.af_tcp.bind_next) { + if(sk->bound_dev_if == sk2->bound_dev_if) { + if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { + if(!sk2->rcv_saddr || + !sk->rcv_saddr || + (sk2->rcv_saddr == sk->rcv_saddr)) + break; + } + } + } + if(sk2 != NULL) + result = 1; + } } - return 0; + if((result == 0) && + (tb == NULL) && + (tcp_bucket_create(snum) == NULL)) + result = 1; +go_like_smoke: + SOCKHASH_UNLOCK(); + return result; } -/* Find a "good" local port, this is family independent. - * There are several strategies working in unison here to - * get the best possible performance. The current socket - * load is kept track of, if it is zero there is a strong - * likely hood that there is a zero length chain we will - * find with a small amount of searching, else the load is - * what we shoot for for when the chains all have at least - * one entry. The base helps us walk the chains in an - * order such that a good chain is found as quickly as possible. -DaveM - */ unsigned short tcp_good_socknum(void) { - static int start = 0; - static int binding_contour = 0; - int best = 0; - int size = 32767; /* a big num. */ - int retval = 0, i, end, bc; + struct tcp_bind_bucket *tb; + int remaining = sysctl_local_port_range[1] - sysctl_local_port_range[0]; + int rover; SOCKHASH_LOCK(); - if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) - start = sysctl_local_port_range[0]; - i = tcp_bhashfn(start); - end = i + TCP_BHTABLE_SIZE; - bc = binding_contour; - do { - struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)]; - if(!sk) { - /* find the smallest value no smaller than start - * that has this hash value. - */ - retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1)); - - /* Check for decreasing load. 
*/ - if (bc != 0) - binding_contour = 0; - goto done; - } else { - int j = 0; - do { sk = sk->bind_next; } while (++j < size && sk); - if (j < size) { - best = i&(TCP_BHTABLE_SIZE-1); - size = j; - if (bc && size <= bc) - goto verify; - } - } - } while(++i != end); - i = best; - - /* Socket load is increasing, adjust our load average. */ - binding_contour = size; -verify: - if (size < binding_contour) - binding_contour = size; - - retval = tcp_bhashnext(start-1,i); - - best = retval; /* mark the starting point to avoid infinite loops */ - while(tcp_lport_inuse(retval)) { - retval = tcp_bhashnext(retval,i); - if (retval > sysctl_local_port_range[1]) /* Upper bound */ - retval = tcp_bhashnext(sysctl_local_port_range[0],i); - if (retval == best) { - /* This hash chain is full. No answer. */ - retval = 0; - break; + rover = tcp_port_rover; + do { + rover += 1; + if(rover < sysctl_local_port_range[0] || + rover > sysctl_local_port_range[1]) + rover = sysctl_local_port_range[0]; + tb = tcp_bound_hash[tcp_bhashfn(rover)]; + for( ; tb; tb = tb->next) { + if(tb->port == rover) + goto next; } - } - -done: - start = (retval + 1); + break; + next: + } while(--remaining > 0); + tcp_port_rover = rover; + if((remaining <= 0) || (tcp_bucket_create(rover) == NULL)) + rover = 0; SOCKHASH_UNLOCK(); - return retval; + return rover; } static void tcp_v4_hash(struct sock *sk) { - unsigned char state; - - SOCKHASH_LOCK(); - state = sk->state; - if(state != TCP_CLOSE || !sk->dead) { + if (sk->state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) - skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - else - skp = &tcp_established_hash[tcp_sk_hashfn(sk)]; - + SOCKHASH_LOCK(); + skp = &tcp_established_hash[tcp_sk_hashfn(sk)]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); + SOCKHASH_UNLOCK(); } - SOCKHASH_UNLOCK(); } static void tcp_v4_unhash(struct sock *sk) @@ -282,6 +264,7 @@ static void tcp_v4_unhash(struct sock 
*sk) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + tcp_reg_zap(sk); tcp_sk_unbindify(sk); } SOCKHASH_UNLOCK(); @@ -293,20 +276,20 @@ static void tcp_v4_rehash(struct sock *sk) SOCKHASH_LOCK(); state = sk->state; - if(sk->pprev) { + if(sk->pprev != NULL) { if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; - tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } - if(state != TCP_CLOSE || !sk->dead) { + if(state != TCP_CLOSE) { struct sock **skp; if(state == TCP_LISTEN) { skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; } else { - int hash= tcp_sk_hashfn(sk); + int hash = tcp_sk_hashfn(sk); if(state == TCP_TIME_WAIT) hash += (TCP_HTABLE_SIZE/2); skp = &tcp_established_hash[hash]; @@ -316,7 +299,8 @@ static void tcp_v4_rehash(struct sock *sk) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; - tcp_sk_bindify(sk); + if(state == TCP_LISTEN) + tcp_sk_bindify(sk); } SOCKHASH_UNLOCK(); } @@ -360,37 +344,64 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d return result; } +/* Until this is verified... -DaveM */ +/* #define USE_QUICKSYNS */ + /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM */ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) + u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) { unsigned short hnum = ntohs(dport); struct sock *sk; - int hash = tcp_hashfn(daddr, hnum, saddr, sport); + int hash; + +#ifdef USE_QUICKSYNS + /* Incomming connection short-cut. */ + if (th && th->syn == 1 && th->ack == 0) + goto listener_shortcut; +#endif + + /* Check TCP register quick cache first. 
*/ + sk = TCP_RHASH(sport); + if(sk && + sk->daddr == saddr && /* remote address */ + sk->dummy_th.dest == sport && /* remote port */ + sk->num == hnum && /* local port */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) + goto hit; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. It is assumed that this code only * gets called from within NET_BH. */ - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) + hash = tcp_hashfn(daddr, hnum, saddr, sport); + for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ - (!sk->bound_dev_if || sk->bound_dev_if == dif)) + (!sk->bound_dev_if || sk->bound_dev_if == dif)) { + if (sk->state == TCP_ESTABLISHED) + TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ - + } + } /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; - + } +#ifdef USE_QUICKSYNS +listener_shortcut: +#endif sk = tcp_v4_lookup_listener(daddr, hnum, dif); hit: return sk; @@ -402,20 +413,11 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport } #ifdef CONFIG_IP_TRANSPARENT_PROXY -#define secondlist(hpnum, sk, fpass) \ -({ struct sock *s1; if(!(sk) && (fpass)--) \ - s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \ - else \ - s1 = (sk); \ - s1; \ -}) - -#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \ - secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass)) - -#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ - secondlist((hpnum),(sk)->bind_next,(fpass)) - +/* Cleaned up a little and adapted to new bind bucket scheme. + * Oddly, this should increase performance here for + * transparent proxy, as tests within the inner loop have + * been eliminated. -DaveM + */ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, unsigned short rnum, unsigned long laddr, struct device *dev, unsigned short pnum, @@ -436,51 +438,60 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, } /* This code must run only from NET_BH. 
*/ - for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); - s != NULL; - s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) { - if(s->num == hnum || s->num == hpnum) { - int score = 0; - if(s->dead && (s->state == TCP_CLOSE)) + { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)]; + for( ; (tb && tb->port != hnum); tb = tb->next) + ; + if(tb == NULL) + goto next; + s = tb->owners; + } +pass2: + for(; s; s = s->tp_pinfo.af_tcp.bind_next) { + int score = 0; + if(s->rcv_saddr) { + if((s->num != hpnum || s->rcv_saddr != paddr) && + (s->num != hnum || s->rcv_saddr != laddr)) continue; - if(s->rcv_saddr) { - if((s->num != hpnum || s->rcv_saddr != paddr) && - (s->num != hnum || s->rcv_saddr != laddr)) - continue; - score++; - } - if(s->daddr) { - if(s->daddr != raddr) - continue; - score++; - } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) - continue; - score++; - } - if(s->bound_dev_if) { - if(s->bound_dev_if != dif) - continue; - score++; - } - if(score == 4 && s->num == hnum) { - result = s; - break; - } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { - result = s; - badness = score; - } + score++; + } + if(s->daddr) { + if(s->daddr != raddr) + continue; + score++; + } + if(s->dummy_th.dest) { + if(s->dummy_th.dest != rnum) + continue; + score++; + } + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { + result = s; + goto gotit; + } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { + result = s; + badness = score; + } + } +next: + if(firstpass--) { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)]; + for( ; (tb && tb->port != hpnum); tb = tb->next) + ; + if(tb) { + s = tb->owners; + goto pass2; } } +gotit: return result; } - -#undef secondlist -#undef tcp_v4_proxy_loop_init -#undef tcp_v4_proxy_loop_next - -#endif +#endif /* CONFIG_IP_TRANSPARENT_PROXY */ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff 
*skb) { @@ -495,41 +506,35 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) /* * Check that a TCP address is unique, don't allow multiple - * connects to/from the same address + * connects to/from the same address. Actually we can optimize + * quite a bit, since the socket about to connect is still + * in TCP_CLOSE, a tcp_bind_bucket for the local port he will + * use will exist, with a NULL owners list. So check for that. + * The good_socknum and verify_bind scheme we use makes this + * work. */ -static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum) +static int tcp_unique_address(struct sock *sk) { - int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum); - struct sock * sk; + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + int retval = 1; - /* Make sure we are allowed to connect here. - * But freeze the hash while we snoop around. - */ + /* Freeze the hash while we snoop around. */ SOCKHASH_LOCK(); - sk = tcp_established_hash[hashent]; - for (; sk != NULL; sk = sk->next) { - if(sk->daddr == daddr && /* remote address */ - sk->dummy_th.dest == dnum && /* remote port */ - sk->num == snum && /* local port */ - sk->saddr == saddr) { /* local address */ - retval = 0; - goto out; - } - } - - /* Must check TIME_WAIT'ers too. */ - sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)]; - for (; sk != NULL; sk = sk->next) { - if(sk->daddr == daddr && /* remote address */ - sk->dummy_th.dest == dnum && /* remote port */ - sk->num == snum && /* local port */ - sk->saddr == saddr) { /* local address */ - retval = 0; - goto out; + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for(; tb; tb = tb->next) { + if(tb->port == snum && tb->owners != NULL) { + /* Almost certainly the re-use port case, search the real hashes + * so it actually scales. 
+ */ + sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest, + sk->rcv_saddr, snum, sk->bound_dev_if); + if((sk != NULL) && (sk->state != TCP_LISTEN)) + retval = 0; + break; } } -out: SOCKHASH_UNLOCK(); return retval; } @@ -578,8 +583,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst, - usin->sin_port)) { + if (!tcp_unique_address(sk)) { ip_rt_put(rt); return -EADDRNOTAVAIL; } @@ -605,14 +609,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->dummy_th.dest = usin->sin_port; - sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, sk->dummy_th.source, usin->sin_port); tp->snd_wnd = 0; tp->snd_wl1 = 0; - tp->snd_wl2 = sk->write_seq; - tp->snd_una = sk->write_seq; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; tp->rcv_nxt = 0; @@ -639,10 +643,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) buff->h.th = th; memcpy(th,(void *)&(sk->dummy_th), sizeof(*th)); - buff->seq = sk->write_seq++; + buff->seq = tp->write_seq++; th->seq = htonl(buff->seq); - tp->snd_nxt = sk->write_seq; - buff->end_seq = sk->write_seq; + tp->snd_nxt = tp->write_seq; + buff->end_seq = tp->write_seq; th->ack = 0; th->syn = 1; @@ -686,9 +690,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_set_state(sk,TCP_SYN_SENT); /* Socket identity change complete, no longer - * in TCP_CLOSE, so rehash. + * in TCP_CLOSE, so enter ourselves into the + * hash tables. 
*/ - tcp_v4_rehash(sk); + tcp_v4_hash(sk); tp->rto = rt->u.dst.rtt; @@ -1081,9 +1086,8 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - th->source = #ifdef CONFIG_IP_TRANSPARENT_PROXY - req->lcl_port; /* LVE */ + th->source = req->lcl_port; /* LVE */ #else th->source = sk->dummy_th.source; #endif @@ -1289,6 +1293,114 @@ error: return 0; } +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0); + + if(newsk != NULL) { + struct tcp_opt *newtp; + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->sklist_next = NULL; + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->num = ntohs(skb->h.th->dest); +#endif + newsk->state = TCP_SYN_RECV; + + /* Clone the TCP header template */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->dummy_th.source = req->lcl_port; +#endif + newsk->dummy_th.dest = req->rmt_port; + newsk->dummy_th.ack = 1; + newsk->dummy_th.doff = sizeof(struct tcphdr)>>2; + + newsk->sock_readers = 0; + atomic_set(&newsk->rmem_alloc, 0); + skb_queue_head_init(&newsk->receive_queue); + atomic_set(&newsk->wmem_alloc, 0); + skb_queue_head_init(&newsk->write_queue); + newsk->saddr = req->af.v4_req.loc_addr; + + newsk->done = 0; + newsk->proc = 0; + newsk->pair = NULL; + skb_queue_head_init(&newsk->back_log); + skb_queue_head_init(&newsk->error_queue); + + /* Now setup tcp_opt */ + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->srtt = 0; + newtp->ato = 0; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = 
req->snt_isn; + newtp->snd_wnd = ntohs(skb->h.th->window); + newtp->max_window = newtp->snd_wnd; + newtp->pending = 0; + newtp->retransmits = 0; + newtp->last_ack_sent = req->rcv_isn + 1; + newtp->backoff = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newtp->snd_cwnd = 1; + newtp->rto = TCP_TIMEOUT_INIT; + newtp->packets_out = 0; + newtp->high_seq = 0; + newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_cwnd_cnt = 0; + newtp->dup_acks = 0; + newtp->delayed_acks = 0; + init_timer(&newtp->retransmit_timer); + newtp->retransmit_timer.function = &tcp_retransmit_timer; + newtp->retransmit_timer.data = (unsigned long) newsk; + init_timer(&newtp->delack_timer); + newtp->delack_timer.function = &tcp_delack_timer; + newtp->delack_timer.data = (unsigned long) newsk; + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->send_head = newtp->retrans_head = NULL; + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->saw_tstamp = 0; + newtp->in_mss = 536; + newtp->sacks = 0; + + init_timer(&newtp->probe_timer); + newtp->probe_timer.function = &tcp_probe_timer; + newtp->probe_timer.data = (unsigned long) newsk; + newtp->probes_out = 0; + newtp->syn_seq = req->rcv_isn; + newtp->fin_seq = req->rcv_isn; + newtp->urg_data = 0; + tcp_synq_init(newtp); + newtp->syn_backlog = 0; + + /* Back to base struct sock members. 
*/ + newsk->err = 0; + newsk->ack_backlog = 0; + newsk->max_ack_backlog = SOMAXCONN; + newsk->priority = 1; + + /* IP layer stuff */ + newsk->opt = req->af.v4_req.opt; + newsk->timeout = 0; + init_timer(&newsk->timer); + newsk->timer.function = &net_timer; + newsk->timer.data = (unsigned long) newsk; + newsk->socket = NULL; + } + return newsk; +} + struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) @@ -1301,98 +1413,14 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ #endif - newsk = sk_alloc(AF_INET, GFP_ATOMIC); + newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; #ifdef NEW_LISTEN sk->ack_backlog++; #endif - memcpy(newsk, sk, sizeof(*newsk)); - - /* Or else we die! -DaveM */ - newsk->sklist_next = NULL; - - newsk->opt = req->af.v4_req.opt; - - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - skb_queue_head_init(&newsk->out_of_order_queue); - skb_queue_head_init(&newsk->error_queue); - /* Unused */ newtp = &(newsk->tp_pinfo.af_tcp); - newtp->send_head = NULL; - newtp->retrans_head = NULL; - - newtp->pending = 0; - - skb_queue_head_init(&newsk->back_log); - - newsk->prot->init(newsk); - - newtp->snd_cwnd_cnt = 0; - newtp->backoff = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->pair = NULL; - atomic_set(&newsk->wmem_alloc, 0); - atomic_set(&newsk->rmem_alloc, 0); - newsk->localroute = sk->localroute; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - - newtp->fin_seq = req->rcv_isn; - newsk->syn_seq = req->rcv_isn; - newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - - newsk->write_seq = req->snt_isn; - - newtp->snd_wnd = ntohs(skb->h.th->window); - newtp->max_window = newtp->snd_wnd; - newtp->snd_wl1 = req->rcv_isn; - newtp->snd_wl2 = newsk->write_seq; - newtp->snd_una = newsk->write_seq++; - newtp->snd_nxt = 
newsk->write_seq; - - newsk->urg_data = 0; - newtp->packets_out = 0; - newtp->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long) newsk; - newsk->timer.function = &net_timer; - - tcp_init_xmit_timers(newsk); - - newsk->dummy_th.source = -#ifdef CONFIG_IP_TRANSPARENT_PROXY - req->lcl_port; /* LVE */ -#else - sk->dummy_th.source; -#endif - newsk->dummy_th.dest = req->rmt_port; - newsk->sock_readers=0; - - newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1; - newtp->rcv_wup = req->rcv_isn + 1; - newsk->copied_seq = req->rcv_isn + 1; - - newsk->socket = NULL; - -#ifdef CONFIG_IP_TRANSPARENT_PROXY - /* - * Deal with possibly redirected traffic by setting num to - * the intended destination port of the received packet. - */ - newsk->num = ntohs(skb->h.th->dest); -#endif - newsk->daddr = req->af.v4_req.rmt_addr; - newsk->saddr = req->af.v4_req.loc_addr; - newsk->rcv_saddr = req->af.v4_req.loc_addr; /* options / mss / route_cache */ if (dst == NULL) { @@ -1446,13 +1474,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Make sure our mtu is adjusted for headers. */ newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len; - tcp_v4_hash(newsk); + /* Must use the af_specific ops here for the case of IPv6 mapped. */ + newsk->prot->hash(newsk); add_to_prot_sklist(newsk); return newsk; exit: - if (dst) - dst_release(dst); + dst_release(dst); return NULL; } @@ -1774,7 +1802,7 @@ static int tcp_v4_init_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - skb_queue_head_init(&sk->out_of_order_queue); + skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tp->srtt = 0; @@ -1782,7 +1810,6 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mdev = TCP_TIMEOUT_INIT; tp->ato = 0; - tp->iat = (HZ/5) << 3; /* FIXME: tie this to sk->rcvbuf? 
(May be unnecessary) */ /* tp->rcv_wnd = 8192; */ @@ -1824,6 +1851,7 @@ static int tcp_v4_init_sock(struct sock *sk) static int tcp_v4_destroy_sock(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; tcp_clear_xmit_timers(sk); @@ -1836,9 +1864,17 @@ static int tcp_v4_destroy_sock(struct sock *sk) kfree_skb(skb); /* Cleans up our, hopefuly empty, out_of_order_queue. */ - while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) + while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); + /* Clean up a locked TCP bind bucket, this only happens if a + * port is allocated for a socket, but it never fully connects. + * In which case we will find num to be non-zero and daddr to + * be zero. + */ + if(sk->daddr == 0 && sk->num != 0) + tcp_bucket_unlock(sk); + return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbae5cfa6d6c..d343c1644c4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.51 1998/01/15 22:40:39 freitag Exp $ + * Version: $Id: tcp_output.c,v 1.56 1998/03/10 05:11:16 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -45,7 +45,6 @@ static __inline__ void clear_delayed_acks(struct sock * sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tp->delayed_acks = 0; - sk->ack_backlog = 0; tcp_clear_xmit_timer(sk, TIME_DACK); } @@ -437,128 +436,44 @@ void tcp_write_xmit(struct sock *sk) * taken by headers, and the remaining space will be available for TCP data. * This should be accounted for correctly instead. 
*/ -unsigned short tcp_select_window(struct sock *sk) +u32 __tcp_select_window(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - int mss = sk->mss; - long free_space = sock_rspace(sk) / 2; - long window, cur_win; + unsigned int mss = sk->mss; + unsigned int free_space; + u32 window, cur_win; + free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2; if (tp->window_clamp) { free_space = min(tp->window_clamp, free_space); mss = min(tp->window_clamp, mss); - } -#ifdef NO_ANK_FIX - /* I am tired of this message */ - else - printk(KERN_DEBUG "Clamp failure. Water leaking.\n"); -#endif + } else { + printk("tcp_select_window: tp->window_clamp == 0.\n"); + } if (mss < 1) { mss = 1; - printk(KERN_DEBUG "tcp_select_window: mss fell to 0.\n"); + printk("tcp_select_window: sk->mss fell to 0.\n"); } - /* compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ - cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - window = tp->rcv_wnd; - - if (cur_win < 0) { - cur_win = 0; -#ifdef NO_ANK_FIX - /* And this too. */ - printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", - tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); -#endif - } - - if (free_space < sk->rcvbuf/4 && free_space < mss/2) + cur_win = tcp_receive_window(tp); + if (free_space < sk->rcvbuf/4 && free_space < mss/2) { window = 0; - - /* Get the largest window that is a nice multiple of mss. - * Window clamp already applied above. - * If our current window offering is within 1 mss of the - * free space we just keep it. This prevents the divide - * and multiply from happening most of the time. - * We also don't do any window rounding when the free space - * is too small. 
- */ - if (window < free_space - mss && free_space > mss) - window = (free_space/mss)*mss; - - /* Never shrink the offered window */ - if (window < cur_win) - window = cur_win; - - tp->rcv_wnd = window; - tp->rcv_wup = tp->rcv_nxt; - return window >> tp->rcv_wscale; /* RFC1323 scaling applied */ -} - -#if 0 -/* Old algorithm for window selection */ -unsigned short tcp_select_window(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - int mss = sk->mss; - long free_space = sock_rspace(sk); - long window, cur_win, usable; - - if (tp->window_clamp) { - free_space = min(tp->window_clamp, free_space); - mss = min(tp->window_clamp, mss); - } - - /* compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ - cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - window = tp->rcv_wnd; - - if (cur_win < 0) { - cur_win = 0; - printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", - tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); - } - - /* RFC 1122: - * "the suggested [SWS] avoidance algoritm for the receiver is to keep - * RECV.NEXT + RCV.WIN fixed until: - * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" - * - * i.e. don't raise the right edge of the window until you can raise - * it at least MSS bytes. - */ - - usable = free_space - cur_win; - if (usable < 0) - usable = 0; - - if (window < usable) { - /* Window is not blocking the sender - * and we have enough free space for it - */ - if (cur_win > (sk->mss << 1)) - goto out; - } - - if (window >= usable) { - /* We are offering too much, cut it down... - * but don't shrink the window - */ - window = max(usable, cur_win); } else { - while ((usable - window) >= mss) - window += mss; + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. 
+ * We also don't do any window rounding when the free space + * is too small. + */ + window = tp->rcv_wnd; + if ((window <= (free_space - mss)) || (window > free_space)) + window = (free_space/mss)*mss; } -out: - tp->rcv_wnd = window; - tp->rcv_wup = tp->rcv_nxt; return window; } -#endif static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) { @@ -760,7 +675,7 @@ void tcp_send_fin(struct sock *sk) * put a FIN into the queue, otherwise it never gets queued. */ kfree_skb(buff); - sk->write_seq++; + tp->write_seq++; t = del_timer(&sk->timer); if (t) add_timer(&sk->timer); @@ -777,9 +692,9 @@ void tcp_send_fin(struct sock *sk) tcp_build_options((__u32 *)(t1+1),tp); memcpy(t1, th, sizeof(*t1)); - buff->seq = sk->write_seq; - sk->write_seq++; - buff->end_seq = sk->write_seq; + buff->seq = tp->write_seq; + tp->write_seq++; + buff->end_seq = tp->write_seq; t1->seq = htonl(buff->seq); t1->ack_seq = htonl(tp->rcv_nxt); t1->window = htons(tcp_select_window(sk)); @@ -796,7 +711,7 @@ void tcp_send_fin(struct sock *sk) struct sk_buff *skb1; tp->packets_out++; - tp->snd_nxt = sk->write_seq; + tp->snd_nxt = tp->write_seq; buff->when = jiffies; skb1 = skb_clone(buff, GFP_KERNEL); @@ -880,28 +795,20 @@ int tcp_send_synack(struct sock *sk) } /* - * Set up the timers for sending a delayed ack.. - * - * rules for delaying an ack: - * - delay time <= 0.5 HZ - * - must send at least every 2 full sized packets - * - we don't have a window update to send + * Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. */ -void tcp_send_delayed_ack(struct sock * sk, int max_timeout) +void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - unsigned long timeout, now; + unsigned long timeout; - /* Calculate new timeout. 
*/ - now = jiffies; + /* Stay within the limit we were given */ timeout = tp->ato; - - if (timeout > max_timeout || - ((tp->rcv_nxt - tp->rcv_wup) > (sk->mss << 2))) - timeout = now; - else - timeout += now; + if (timeout > max_timeout) + timeout = max_timeout; + timeout += jiffies; /* Use new timeout only if there wasn't a older one earlier. */ if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires) @@ -938,7 +845,7 @@ void tcp_send_ack(struct sock *sk) * bandwidth on slow links to send a spare ack than * resend packets. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(tp, HZ/2); return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8ebf35bc83f8..d67e04da7409 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.33 1997/12/13 21:53:01 kuznet Exp $ + * Version: $Id: tcp_timer.c,v 1.38 1998/03/10 05:11:17 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -31,6 +31,7 @@ int sysctl_tcp_retries2 = TCP_RETR2; static void tcp_sltimer_handler(unsigned long); static void tcp_syn_recv_timer(unsigned long); static void tcp_keepalive(unsigned long data); +static void tcp_bucketgc(unsigned long); struct timer_list tcp_slow_timer = { NULL, NULL, @@ -41,7 +42,8 @@ struct timer_list tcp_slow_timer = { struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ - {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive} /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */ }; const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; @@ -118,9 +120,12 @@ void tcp_clear_xmit_timers(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - del_timer(&tp->retransmit_timer); - del_timer(&tp->delack_timer); - del_timer(&tp->probe_timer); + if(tp->retransmit_timer.prev) + del_timer(&tp->retransmit_timer); + if(tp->delack_timer.prev) + del_timer(&tp->delack_timer); + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); } static int tcp_write_err(struct sock *sk, int force) @@ -131,7 +136,7 @@ static int tcp_write_err(struct sock *sk, int force) tcp_clear_xmit_timers(sk); /* Time wait the socket. 
*/ - if (!force && (1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { + if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) { tcp_set_state(sk,TCP_TIME_WAIT); tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { @@ -173,9 +178,8 @@ static int tcp_write_timeout(struct sock *sk) return 1; } - -void tcp_delack_timer(unsigned long data) { - +void tcp_delack_timer(unsigned long data) +{ struct sock *sk = (struct sock*)data; if(sk->zapped) @@ -185,8 +189,8 @@ void tcp_delack_timer(unsigned long data) { tcp_read_wakeup(sk); } -void tcp_probe_timer(unsigned long data) { - +void tcp_probe_timer(unsigned long data) +{ struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -252,6 +256,35 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) return res; } +/* Garbage collect TCP bind buckets. */ +static void tcp_bucketgc(unsigned long __unused) +{ + int i; + + for(i = 0; i < TCP_BHTABLE_SIZE; i++) { + struct tcp_bind_bucket *tb = tcp_bound_hash[i]; + + while(tb) { + struct tcp_bind_bucket *next = tb->next; + + if((tb->owners == NULL) && + !(tb->flags & TCPB_FLAG_LOCKED)) { + /* Eat timer reference. */ + tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + + /* Unlink bucket. */ + if(tb->next) + tb->next->pprev = tb->pprev; + *tb->pprev = tb->next; + + /* Finally, free it up. */ + kmem_cache_free(tcp_bucket_cachep, tb); + } + tb = next; + } + } +} + /* * Check all sockets for keepalive timer * Called every 75 seconds diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index fe02b3f4c2bc..718c8666b11a 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $ + * Version: $Id: timer.c,v 1.8 1998/03/06 00:09:24 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -58,7 +58,8 @@ void net_delete_timer (struct sock *t) cli(); t->timeout = 0; - del_timer (&t->timer); + if(t->timer.prev) + del_timer (&t->timer); restore_flags (flags); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 62a7fcd46210..39a98c5ec747 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.47 1997/12/27 20:41:16 kuznet Exp $ + * Version: $Id: udp.c,v 1.51 1998/03/08 05:56:40 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -1033,17 +1033,18 @@ static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) /* * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. */ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, u32 saddr, u32 daddr) { struct sock *sk; - int given = 0; - SOCKHASH_LOCK(); sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr); - if(sk) { + if (sk) { struct sock *sknext = NULL; do { @@ -1058,10 +1059,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, udp_deliver(sk, skb1); sk = sknext; } while(sknext); - given = 1; - } - SOCKHASH_UNLOCK(); - if(!given) + } else kfree_skb(skb); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c4faba4b7a75..4a40606012b8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: addrconf.c,v 1.32 1997/12/27 20:41:18 kuznet Exp $ + * $Id: addrconf.c,v 1.37 1998/03/08 20:52:46 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -1753,6 +1753,8 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); if (t->sysctl_header == NULL) kfree(t); + 
else + p->sysctl = t; } static void addrconf_sysctl_unregister(struct ipv6_devconf *p) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b0a0eb702f75..bc5ba892a4f9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.24 1997/12/13 21:53:08 kuznet Exp $ + * $Id: af_inet6.c,v 1.28 1998/03/08 05:56:49 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -71,7 +71,7 @@ static int inet6_create(struct socket *sock, int protocol) struct sock *sk; struct proto *prot; - sk = sk_alloc(AF_INET6, GFP_KERNEL); + sk = sk_alloc(AF_INET6, GFP_KERNEL, 1); if (sk == NULL) goto do_oom; @@ -139,8 +139,7 @@ static int inet6_create(struct socket *sock, int protocol) * creation time automatically shares. */ sk->dummy_th.source = ntohs(sk->num); - if(sk->prot->hash) - sk->prot->hash(sk); + sk->prot->hash(sk); add_to_prot_sklist(sk); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6b7508666f9a..af29057ecb6f 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: exthdrs.c,v 1.4 1997/03/18 18:24:29 davem Exp $ + * $Id: exthdrs.c,v 1.5 1998/02/12 07:43:39 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b84dc9268ba8..96867403ba13 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: icmp.c,v 1.12 1997/12/13 21:53:10 kuznet Exp $ + * $Id: icmp.c,v 1.13 1998/02/12 07:43:41 davem Exp $ * * Based on net/ipv4/icmp.c * diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 15ce420acd8f..9fce1accad64 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fib.c,v 1.10 1997/12/13 21:53:10 kuznet 
Exp $ + * $Id: ip6_fib.c,v 1.11 1998/03/08 05:56:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 7316a30f102a..3c3a0cfc5e17 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fw.c,v 1.8 1997/12/13 21:53:11 kuznet Exp $ + * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index ead32047afd6..71ad7e1a0977 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque * Ian P. Morris * - * $Id: ip6_input.c,v 1.7 1997/09/20 20:48:27 davem Exp $ + * $Id: ip6_input.c,v 1.8 1998/02/12 07:43:43 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 67b81d04107a..13029e17598e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_output.c,v 1.7 1997/12/29 19:52:46 kuznet Exp $ + * $Id: ip6_output.c,v 1.9 1998/03/08 05:56:50 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index f2ef3fd76e42..c6714eea38fb 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.16 1997/12/13 21:53:13 kuznet Exp $ + * $Id: ipv6_sockglue.c,v 1.17 1998/03/08 05:56:51 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3fb0680bc51b..ce37117a382b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -132,7 +132,7 @@ struct neigh_table nd_tbl 
= pndisc_destructor, pndisc_redo, { NULL, NULL, &nd_tbl, 0, NULL, NULL, - 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 0, 64 }, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 0 }, 30*HZ, 128, 512, 1024, }; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b9b811e35a4f..b03b6ea2332c 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.4 1997/04/20 22:50:44 schenk Exp $ + * Version: $Id: proc.c,v 1.5 1998/03/06 01:23:22 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -83,8 +83,8 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->state, - format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), - format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc), + format==0?tp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), + format==0?tp->rcv_nxt-tp->copied_seq:atomic_read(&sp->rmem_alloc), timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? 
sp->socket->inode->i_uid:0, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4ee1b13addb6..5b182b7ef859 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.16 1997/12/29 19:52:48 kuznet Exp $ + * $Id: raw.c,v 1.18 1998/03/08 05:56:54 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index aa027da14235..55fecc6766ac 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: reassembly.c,v 1.8 1997/12/29 19:52:50 kuznet Exp $ + * $Id: reassembly.c,v 1.9 1998/02/12 07:43:48 davem Exp $ * * Based on: net/ipv4/ip_fragment.c * diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 28ee43e783a7..498d4d5b4fab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: route.c,v 1.19 1997/12/13 21:53:16 kuznet Exp $ + * $Id: route.c,v 1.24 1998/03/08 20:52:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -85,18 +85,18 @@ struct dst_ops ip6_dst_ops = { }; struct rt6_info ip6_null_entry = { - {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL, + {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, -1, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, - 0, 255, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} + 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; struct fib6_node ip6_routing_table = { NULL, NULL, NULL, NULL, &ip6_null_entry, - 0, RTN_ROOT|RTN_TL_ROOT, 0 + 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0 }; #ifdef CONFIG_RT6_POLICY @@ -716,7 +716,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_expires = rtmsg->rtmsg_info; addr_type = 
ipv6_addr_type(&rtmsg->rtmsg_dst); - + if (addr_type & IPV6_ADDR_MULTICAST) { RDBG(("MCAST, ")); rt->u.dst.input = ip6_mc_input; @@ -743,6 +743,21 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_src.plen = rtmsg->rtmsg_src_len; ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((rtmsg->rtmsg_flags&RTF_REJECT) || + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + dev = dev_get("lo"); + rt->u.dst.output = ip6_pkt_discard; + rt->u.dst.input = ip6_pkt_discard; + rt->u.dst.error = -EHOSTUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + rt->rt6i_metric = rtmsg->rtmsg_metric; + rt->rt6i_dev = dev; + goto install_route; + } + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { struct in6_addr *gw_addr; int gwa_type; @@ -805,6 +820,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_hoplimit = ipv6_get_hoplimit(dev); rt->rt6i_flags = rtmsg->rtmsg_flags; +install_route: RDBG(("rt6ins(%p) ", rt)); rt6_lock(); @@ -1421,6 +1437,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) int ip6_pkt_discard(struct sk_buff *skb) { ipv6_statistics.Ip6OutNoRoutes++; + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); kfree_skb(skb); return 0; } @@ -1754,7 +1771,12 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, rtm->rtm_src_len = rt->rt6i_src.plen; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; - rtm->rtm_type = RTN_UNICAST; + if (rt->rt6i_flags&RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; #ifdef CONFIG_RTNL_OLD_IFINFO @@ -1795,6 +1817,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (rt->u.dst.rtt) RTA_PUT(skb, 
RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); #endif if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index f029942df022..577b85d0fb31 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque * Alexey Kuznetsov * - * $Id: sit.c,v 1.24 1997/12/13 21:53:17 kuznet Exp $ + * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f7a080a0d043..a7933eb33722 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: tcp_ipv6.c,v 1.44 1997/12/13 21:53:18 kuznet Exp $ + * $Id: tcp_ipv6.c,v 1.55 1998/03/10 05:11:21 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -86,62 +86,69 @@ static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) /* Grrr, addr_type already calculated by caller, but I don't want * to add some silly "cookie" argument to this method just for that. + * But it doesn't matter, the recalculation is in the rarest path + * this function ever takes. 
*/ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum) { - struct sock *sk2; - int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr); - int retval = 0, sk_reuse = sk->reuse; + struct tcp_bind_bucket *tb; + int result = 0; SOCKHASH_LOCK(); - sk2 = tcp_bound_hash[tcp_sk_bhashfn(sk)]; - for(; sk2 != NULL; sk2 = sk2->bind_next) { - if((sk2->num == snum) && (sk2 != sk)) { - unsigned char state = sk2->state; - int sk2_reuse = sk2->reuse; - if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) { - if((!sk2_reuse) || - (!sk_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } - } else if(!ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, - &sk2->net_pinfo.af_inet6.rcv_saddr)) { - if((!sk_reuse) || - (!sk2_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + (tb && (tb->port != snum)); + tb = tb->next) + ; + if(tb && tb->owners) { + /* Fast path for reuse ports, see include/net/tcp.h for a very + * detailed description of why this works, and why it is worth + * the effort at all. -DaveM + */ + if((tb->flags & TCPB_FLAG_FASTREUSE) && + (sk->reuse != 0)) { + goto go_like_smoke; + } else { + struct sock *sk2; + int sk_reuse = sk->reuse; + int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr); + + /* We must walk the whole port owner list in this case. 
-DaveM */ + for(sk2 = tb->owners; sk2; sk2 = sk2->tp_pinfo.af_tcp.bind_next) { + if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { + if(addr_type == IPV6_ADDR_ANY || + !sk2->rcv_saddr || + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, + &sk2->net_pinfo.af_inet6.rcv_saddr)) + break; } } + if(sk2 != NULL) + result = 1; } } + if((result == 0) && + (tb == NULL) && + (tcp_bucket_create(snum) == NULL)) + result = 1; +go_like_smoke: SOCKHASH_UNLOCK(); - - return retval; + return result; } static void tcp_v6_hash(struct sock *sk) { - unsigned char state; - - SOCKHASH_LOCK(); - state = sk->state; - if(state != TCP_CLOSE) { + if(sk->state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) - skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - else - skp = &tcp_established_hash[tcp_v6_sk_hashfn(sk)]; + SOCKHASH_LOCK(); + skp = &tcp_established_hash[tcp_v6_sk_hashfn(sk)]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); + SOCKHASH_UNLOCK(); } - SOCKHASH_UNLOCK(); } static void tcp_v6_unhash(struct sock *sk) @@ -153,6 +160,7 @@ static void tcp_v6_unhash(struct sock *sk) *sk->pprev = sk->next; sk->pprev = NULL; tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } SOCKHASH_UNLOCK(); } @@ -163,12 +171,12 @@ static void tcp_v6_rehash(struct sock *sk) SOCKHASH_LOCK(); state = sk->state; - if(sk->pprev) { + if(sk->pprev != NULL) { if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; - tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } if(state != TCP_CLOSE) { struct sock **skp; @@ -181,11 +189,13 @@ static void tcp_v6_rehash(struct sock *sk) hash += (TCP_HTABLE_SIZE/2); skp = &tcp_established_hash[hash]; } + if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; - tcp_sk_bindify(sk); + if(state == TCP_LISTEN) + tcp_sk_bindify(sk); } SOCKHASH_UNLOCK(); } @@ -209,6 +219,9 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor return 
result; } +/* Until this is verified... -DaveM */ +/* #define USE_QUICKSYNS */ + /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM */ @@ -218,21 +231,41 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, { unsigned short hnum = ntohs(dport); struct sock *sk; - int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + int hash; + +#ifdef USE_QUICKSYNS + /* Incomming connection short-cut. */ + if (th && th->syn == 1 && th->ack == 0) + goto listener_shortcut; +#endif + + /* Check TCP register quick cache first. */ + sk = TCP_RHASH(sport); + if(sk && + sk->num == hnum && /* local port */ + sk->family == AF_INET6 && /* address family */ + sk->dummy_th.dest == sport && /* remote port */ + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) + goto hit; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. It is assumed that this code only * gets called from within NET_BH. */ - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) + hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { /* For IPV6 do the cheaper port and family tests first. */ if(sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ sk->dummy_th.dest == sport && /* remote port */ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) { + if (sk->state == TCP_ESTABLISHED) + TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ - + } + } /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) if(sk->num == hnum && /* local port */ @@ -241,7 +274,9 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) goto hit; - +#ifdef USE_QUICKSYNS +listener_shortcut: +#endif sk = tcp_v6_lookup_listener(daddr, hnum); hit: return sk; @@ -275,6 +310,33 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } +static int tcp_v6_unique_address(struct sock *sk) +{ + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + int retval = 1; + + /* Freeze the hash while we snoop around. */ + SOCKHASH_LOCK(); + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for(; tb; tb = tb->next) { + if(tb->port == snum && tb->owners != NULL) { + /* Almost certainly the re-use port case, search the real hashes + * so it actually scales. (we hope that all ipv6 ftp servers will + * use passive ftp, I just cover this case for completeness) + */ + sk = __tcp_v6_lookup(NULL, &sk->net_pinfo.af_inet6.daddr, + sk->dummy_th.dest, + &sk->net_pinfo.af_inet6.rcv_saddr, snum); + if((sk != NULL) && (sk->state != TCP_LISTEN)) + retval = 0; + break; + } + } + SOCKHASH_UNLOCK(); + return retval; +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -390,7 +452,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); } - /* FIXME: Need to do tcp_v6_unique_address() here! 
-DaveM */ + sk->dummy_th.dest = usin->sin6_port; + if (!tcp_v6_unique_address(sk)) + return -EADDRNOTAVAIL; /* * Init variables @@ -398,16 +462,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, lock_sock(sk); - sk->dummy_th.dest = usin->sin6_port; - sk->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], + tp->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], np->daddr.s6_addr32[3], sk->dummy_th.source, sk->dummy_th.dest); tp->snd_wnd = 0; tp->snd_wl1 = 0; - tp->snd_wl2 = sk->write_seq; - tp->snd_una = sk->write_seq; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; tp->rcv_nxt = 0; @@ -429,10 +492,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, buff->h.th = th; memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); - buff->seq = sk->write_seq++; + buff->seq = tp->write_seq++; th->seq = htonl(buff->seq); - tp->snd_nxt = sk->write_seq; - buff->end_seq = sk->write_seq; + tp->snd_nxt = tp->write_seq; + buff->end_seq = tp->write_seq; th->ack = 0; th->syn = 1; @@ -467,9 +530,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_set_state(sk, TCP_SYN_SENT); /* Socket identity change complete, no longer - * in TCP_CLOSE, so rehash. + * in TCP_CLOSE, so enter ourselves into the + * hash tables. */ - sk->prot->rehash(sk); + sk->prot->hash(sk); /* FIXME: should use dcache->rtt if availiable */ tp->rto = TCP_TIMEOUT_INIT; @@ -740,9 +804,13 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, csum_partial((char *)th, sizeof(*th)+tmp, skb->csum)); + /* Actually we should not attach dst to socket in state LISTEN, + it results in stale destination per listen socket and + overflow of routing cache. + (IPv4 has the same flaw with more unpleasant consequences.) 
+ */ ip6_dst_store(sk, dst); ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); - dst_release(dst); tcp_statistics.TcpOutSegs++; } @@ -879,92 +947,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } - newsk = sk_alloc(AF_INET6, GFP_ATOMIC); + newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) { - if (dst) - dst_release(dst); + dst_release(dst); return NULL; } - memcpy(newsk, sk, sizeof(*newsk)); - - /* Or else we die! -DaveM */ - newsk->sklist_next = NULL; - - newsk->opt = NULL; newsk->dst_cache = NULL; - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - skb_queue_head_init(&newsk->out_of_order_queue); - skb_queue_head_init(&newsk->error_queue); - - /* - * Unused - */ newtp = &(newsk->tp_pinfo.af_tcp); - np = &newsk->net_pinfo.af_inet6; - - newtp->send_head = NULL; - newtp->retrans_head = NULL; - - newtp->pending = 0; - - skb_queue_head_init(&newsk->back_log); - - newsk->prot->init(newsk); - - newtp->snd_cwnd_cnt = 0; -#if 0 /* Don't mess up the initialization we did in the init routine! 
*/ - newtp->snd_ssthresh = 0; -#endif - newtp->backoff = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->pair = NULL; - atomic_set(&newsk->wmem_alloc, 0); - atomic_set(&newsk->rmem_alloc, 0); - newsk->localroute = sk->localroute; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - - newtp->fin_seq = req->rcv_isn; - newsk->syn_seq = req->rcv_isn; - newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - - newsk->write_seq = req->snt_isn; - - newtp->snd_wnd = ntohs(skb->h.th->window); - newtp->max_window = newtp->snd_wnd; - newtp->snd_wl1 = req->rcv_isn; - newtp->snd_wl2 = newsk->write_seq; - newtp->snd_una = newsk->write_seq++; - newtp->snd_nxt = newsk->write_seq; - - newsk->urg_data = 0; - newtp->packets_out = 0; - newtp->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long) newsk; - newsk->timer.function = &net_timer; - - tcp_init_xmit_timers(newsk); - - newsk->dummy_th.source = sk->dummy_th.source; - newsk->dummy_th.dest = req->rmt_port; - newsk->sock_readers=0; - - newtp->rcv_nxt = req->rcv_isn + 1; - newtp->rcv_wup = req->rcv_isn + 1; - newsk->copied_seq = req->rcv_isn + 1; - - newsk->socket = NULL; + np = &newsk->net_pinfo.af_inet6; ipv6_addr_copy(&np->daddr, &req->af.v6_req.rmt_addr); ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr); ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr); @@ -989,12 +982,21 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->sack_ok = req->sack_ok; newtp->tstamp_ok = req->tstamp_ok; - newtp->snd_wscale = req->snd_wscale; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; newtp->wscale_ok = req->wscale_ok; - newtp->ts_recent = req->ts_recent; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } if 
(newtp->tstamp_ok) { - newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define the contant. */ - newsk->dummy_th.doff += 3; + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = jiffies; + newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */ + newsk->dummy_th.doff += 3; } else { newtp->tcp_header_len = sizeof(struct tcphdr); } @@ -1006,7 +1008,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->mss = min(req->mss+sizeof(struct tcphdr)-newtp->tcp_header_len, (newsk->mtu - sizeof(struct ipv6hdr) - newtp->tcp_header_len)); - /* XXX tp->window_clamp??? -DaveM */ newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; @@ -1388,7 +1389,7 @@ static int tcp_v6_init_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - skb_queue_head_init(&sk->out_of_order_queue); + skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tp->srtt = 0; @@ -1396,7 +1397,6 @@ static int tcp_v6_init_sock(struct sock *sk) tp->mdev = TCP_TIMEOUT_INIT; tp->ato = 0; - tp->iat = (HZ/5) << 3; /* FIXME: right thing? */ tp->rcv_wnd = 0; @@ -1442,6 +1442,7 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; tcp_clear_xmit_timers(sk); @@ -1460,15 +1461,22 @@ static int tcp_v6_destroy_sock(struct sock *sk) * Cleans up our, hopefuly empty, out_of_order_queue */ - while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) + while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); /* * Release destination entry */ - dst_release(sk->dst_cache); - sk->dst_cache = NULL; + dst_release(xchg(&sk->dst_cache,NULL)); + + /* Clean up a locked TCP bind bucket, this only happens if a + * port is allocated for a socket, but it never fully connects. + * In which case we will find num to be non-zero and daddr to + * be zero. 
+ */ + if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0) + tcp_bucket_unlock(sk); return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b99dc19e37e1..5324fdc73822 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.21 1997/12/29 19:52:52 kuznet Exp $ + * $Id: udp.c,v 1.23 1998/03/08 05:56:59 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -448,32 +448,43 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, return NULL; } +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ static void udpv6_mcast_deliver(struct udphdr *uh, struct in6_addr *saddr, struct in6_addr *daddr, struct sk_buff *skb) { struct sock *sk, *sk2; + struct sk_buff *buff; - SOCKHASH_LOCK(); sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr); - if(sk) { - sk2 = sk; - while((sk2 = udp_v6_mcast_next(sk2->next, - uh->dest, saddr, - uh->source, daddr))) { - struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); - if (buff && sock_queue_rcv_skb(sk2, buff) < 0) { - buff->sk = NULL; - kfree_skb(buff); - } + if (!sk) + goto free_skb; + + buff = NULL; + sk2 = sk; + while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr, + uh->source, daddr))) { + if (!buff) { + buff = skb_clone(skb, GFP_ATOMIC); + if (!buff) + continue; } + if (sock_queue_rcv_skb(sk2, buff) >= 0) + buff = NULL; + } + if (buff) { + buff->sk = NULL; + kfree_skb(buff); } - if(!sk || sock_queue_rcv_skb(sk, skb) < 0) { + if (sock_queue_rcv_skb(sk, skb) < 0) { + free_skb: skb->sk = NULL; kfree_skb(skb); } - SOCKHASH_UNLOCK(); } int udpv6_rcv(struct sk_buff *skb, struct device *dev, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index cf56df492ff7..904fa1174952 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1713,7 +1713,7 @@ static int 
ipx_getsockopt(struct socket *sock, int level, int optname, static int ipx_create(struct socket *sock, int protocol) { struct sock *sk; - sk=sk_alloc(AF_IPX, GFP_KERNEL); + sk=sk_alloc(AF_IPX, GFP_KERNEL, 1); if(sk==NULL) return(-ENOMEM); switch(sock->type) diff --git a/net/netbeui/af_netbeui.c b/net/netbeui/af_netbeui.c index 85bd8f4d1df5..6769edde564c 100644 --- a/net/netbeui/af_netbeui.c +++ b/net/netbeui/af_netbeui.c @@ -150,7 +150,7 @@ static int netbeui_listen(struct socket *sock, int backlog) static int netbeui_create(struct socket *sock, int protocol) { netbeui_socket *sk; - sk=(netbeui_socket *)sk_alloc(GFP_KERNEL); + sk=(netbeui_socket *)sk_alloc(GFP_KERNEL, 1); if(sk==NULL) return(-ENOBUFS); switch(sock->type) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3f02f4c3cd49..8b8e5a4b85c4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -147,7 +147,7 @@ static int netlink_create(struct socket *sock, int protocol) sock->ops = &netlink_ops; - sk = sk_alloc(AF_NETLINK, GFP_KERNEL); + sk = sk_alloc(AF_NETLINK, GFP_KERNEL, 1); if (!sk) return -ENOMEM; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index a84d1fd53ffe..d800cf9d6b6e 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -98,7 +98,7 @@ static struct sock *nr_alloc_sock(void) struct sock *sk; nr_cb *nr; - if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC, 1)) == NULL) return NULL; if ((nr = kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) { diff --git a/net/netsyms.c b/net/netsyms.c index b7809863bcbf..8831de3f3b8f 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -277,6 +277,9 @@ EXPORT_SYMBOL(tcp_v4_build_header); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_SYMBOL(tcp_create_openreq_child); +EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(tcp_bucket_unlock); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 
EXPORT_SYMBOL(tcp_v4_do_rcv); EXPORT_SYMBOL(tcp_v4_connect); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a098f59b9c3b..f3893c9d461f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -35,6 +35,7 @@ * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. + * Cyrus Durgin : Fixed kerneld for kmod. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -54,7 +55,7 @@ #include #include #include -#include +#include #include #include #include @@ -710,7 +711,7 @@ static int packet_create(struct socket *sock, int protocol) sock->state = SS_UNCONNECTED; MOD_INC_USE_COUNT; - sk = sk_alloc(AF_PACKET, GFP_KERNEL); + sk = sk_alloc(AF_PACKET, GFP_KERNEL, 1); if (sk == NULL) { MOD_DEC_USE_COUNT; return -ENOBUFS; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index eeb3963507a5..1056c7d73578 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -149,7 +149,7 @@ static struct sock *rose_alloc_sock(void) struct sock *sk; rose_cb *rose; - if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC, 1)) == NULL) return NULL; if ((rose = kmalloc(sizeof(*rose), GFP_ATOMIC)) == NULL) { diff --git a/net/socket.c b/net/socket.c index 5c953403122d..dc77ef3e8b96 100644 --- a/net/socket.c +++ b/net/socket.c @@ -76,8 +76,8 @@ #include #include -#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) -#include +#if defined(CONFIG_KMOD) && defined(CONFIG_NET) +#include #endif #include @@ -577,7 +577,7 @@ int sock_create(int family, int type, int protocol, struct socket **res) if(family<0||family>=NPROTO) return -EINVAL; -#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) +#if defined(CONFIG_KMOD) && defined(CONFIG_NET) /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! 
this makes REALLY only sense, if the user @@ -814,7 +814,7 @@ restart: newsock = socki_lookup(inode); if ((err = get_fd(inode)) < 0) - goto out_inval; + goto out_release; newsock->file = current->files->fd[err]; if (upeer_sockaddr) @@ -835,8 +835,6 @@ out: unlock_kernel(); return err; -out_inval: - err = -EINVAL; out_release: sock_release(newsock); goto out_put; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2fbce16fe8f0..c74a1997c9f2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -364,7 +364,7 @@ static int unix_create(struct socket *sock, int protocol) default: return -ESOCKTNOSUPPORT; } - sk = sk_alloc(AF_UNIX, GFP_KERNEL); + sk = sk_alloc(AF_UNIX, GFP_KERNEL, 1); if (!sk) return -ENOMEM; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 7e3c9cae2a26..a85aeea5f73b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -423,7 +423,7 @@ static struct sock *x25_alloc_socket(void) struct sock *sk; x25_cb *x25; - if ((sk = sk_alloc(AF_X25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_X25, GFP_ATOMIC, 1)) == NULL) return NULL; if ((x25 = kmalloc(sizeof(*x25), GFP_ATOMIC)) == NULL) { -- 2.39.5