From c0f75cadd14f3cec8b770526f2f3cc3027606d19 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Nov 2007 15:30:23 -0500 Subject: [PATCH] Import 2.3.41pre2 --- CREDITS | 6 +- Documentation/networking/ip-sysctl.txt | 121 +- Documentation/networking/wan-router.txt | 65 +- Documentation/networking/wanpipe.txt | 330 +++--- MAINTAINERS | 2 +- arch/sparc/kernel/process.c | 5 +- arch/sparc/kernel/signal.c | 88 +- arch/sparc/kernel/smp.c | 3 +- arch/sparc/kernel/sparc_ksyms.c | 3 +- arch/sparc/kernel/sun4d_smp.c | 50 +- arch/sparc/kernel/sun4m_smp.c | 65 +- arch/sparc/kernel/sys_sparc.c | 12 +- arch/sparc/kernel/sys_sunos.c | 23 +- arch/sparc/kernel/systbls.S | 8 +- arch/sparc/kernel/time.c | 4 +- arch/sparc/kernel/traps.c | 130 ++- arch/sparc/kernel/unaligned.c | 17 +- arch/sparc/mm/asyncd.c | 12 +- arch/sparc/mm/fault.c | 106 +- arch/sparc/mm/init.c | 23 +- arch/sparc/mm/srmmu.c | 10 +- arch/sparc64/config.in | 2 +- arch/sparc64/kernel/ioctl32.c | 29 +- arch/sparc64/kernel/irq.c | 4 +- arch/sparc64/kernel/process.c | 5 +- arch/sparc64/kernel/signal32.c | 173 +-- arch/sparc64/kernel/smp.c | 10 +- arch/sparc64/kernel/sys_sparc.c | 11 +- arch/sparc64/kernel/sys_sparc32.c | 42 +- arch/sparc64/kernel/sys_sunos32.c | 23 +- arch/sparc64/kernel/systbls.S | 8 +- arch/sparc64/kernel/traps.c | 172 +-- arch/sparc64/lib/Makefile | 6 +- arch/sparc64/lib/VIScsumcopy.S | 4 +- arch/sparc64/lib/VIScsumcopyusr.S | 914 ++++++++++++++++ arch/sparc64/lib/checksum.S | 238 +++- arch/sparc64/mm/asyncd.c | 13 +- arch/sparc64/mm/fault.c | 53 +- drivers/char/Config.in | 1 + drivers/char/Makefile | 8 + drivers/char/console.c | 1 + drivers/char/mixcomwd.c | 250 +++++ drivers/char/saa5249.c | 290 +++-- drivers/char/sx.c | 14 +- drivers/net/setup.c | 3 +- drivers/net/wan/cycx_main.c | 14 +- drivers/net/wan/cycx_x25.c | 9 +- drivers/net/wan/sdla.c | 4 +- drivers/sbus/audio/dbri.c | 6 +- drivers/scsi/megaraid.c | 54 +- drivers/scsi/qlogicfc.c | 7 +- drivers/sound/ad1848.c | 87 +- 
drivers/sound/msnd_pinnacle.c | 1 + drivers/video/bwtwofb.c | 4 +- fs/ext2/namei.c | 20 +- fs/partitions/sgi.c | 8 +- fs/partitions/sun.c | 6 +- fs/udf/directory.c | 5 +- fs/udf/super.c | 4 +- include/asm-sparc/asm_offsets.h | 136 ++- include/asm-sparc/hdreg.h | 13 + include/asm-sparc/ide.h | 289 +++++ include/asm-sparc/processor.h | 10 +- include/asm-sparc/sembuf.h | 2 +- include/asm-sparc/siginfo.h | 9 +- include/asm-sparc/smp.h | 6 +- include/asm-sparc/stat.h | 36 +- include/asm-sparc/unistd.h | 2 +- include/asm-sparc64/asm_offsets.h | 162 ++- include/asm-sparc64/checksum.h | 24 +- include/asm-sparc64/processor.h | 15 +- include/asm-sparc64/siginfo.h | 7 +- include/asm-sparc64/smp.h | 6 +- include/asm-sparc64/unistd.h | 2 +- include/linux/cyclomx.h | 6 +- include/linux/etherdevice.h | 2 +- include/linux/skbuff.h | 13 +- include/linux/sockios.h | 4 + include/linux/sysctl.h | 7 +- include/linux/tcp.h | 1 + include/net/dst.h | 6 +- include/net/ip.h | 2 +- include/net/route.h | 3 +- include/net/snmp.h | 19 +- include/net/sock.h | 167 ++- include/net/tcp.h | 727 ++++++++---- net/core/datagram.c | 19 +- net/core/iovec.c | 12 +- net/core/skbuff.c | 16 +- net/core/sock.c | 116 +- net/ethernet/eth.c | 7 +- net/ipv4/af_inet.c | 261 +++-- net/ipv4/arp.c | 10 +- net/ipv4/ip_input.c | 13 +- net/ipv4/ip_output.c | 12 +- net/ipv4/ip_sockglue.c | 4 +- net/ipv4/proc.c | 16 +- net/ipv4/raw.c | 8 +- net/ipv4/route.c | 49 +- net/ipv4/syncookies.c | 34 +- net/ipv4/sysctl_net_ipv4.c | 34 +- net/ipv4/tcp.c | 1017 ++++++++++------- net/ipv4/tcp_input.c | 1336 +++++++++++++++-------- net/ipv4/tcp_ipv4.c | 951 ++++++++-------- net/ipv4/tcp_output.c | 495 +++++---- net/ipv4/tcp_timer.c | 648 ++++++----- net/ipv4/udp.c | 182 ++- net/ipv6/addrconf.c | 24 +- net/ipv6/af_inet6.c | 19 +- net/ipv6/icmp.c | 18 +- net/ipv6/ip6_fib.c | 6 +- net/ipv6/ipv6_sockglue.c | 6 +- net/ipv6/mcast.c | 11 +- net/ipv6/raw.c | 8 +- net/ipv6/route.c | 14 +- net/ipv6/tcp_ipv6.c | 1058 +++++++++--------- 
net/ipv6/udp.c | 100 +- net/khttpd/accept.c | 2 +- net/khttpd/datasending.c | 2 +- net/khttpd/sockets.c | 5 +- net/khttpd/userspace.c | 21 +- net/khttpd/waitheaders.c | 2 +- net/netlink/af_netlink.c | 35 +- net/netsyms.c | 21 +- net/packet/af_packet.c | 9 +- net/socket.c | 42 +- net/unix/af_unix.c | 114 +- net/wanrouter/wanmain.c | 213 ++-- net/wanrouter/wanproc.c | 142 +-- 129 files changed, 7933 insertions(+), 4441 deletions(-) create mode 100644 arch/sparc64/lib/VIScsumcopyusr.S create mode 100644 drivers/char/mixcomwd.c create mode 100644 include/asm-sparc/hdreg.h create mode 100644 include/asm-sparc/ide.h diff --git a/CREDITS b/CREDITS index a5ab5e9cd689..fe7958cd8dca 100644 --- a/CREDITS +++ b/CREDITS @@ -854,12 +854,12 @@ S: Oakland, CA S: USA N: Jochen Hein -E: jochen.hein@delphi.central.de +E: jochen@jochen.org P: 1024/4A27F015 25 72 FB E3 85 9F DE 3B CB 0A DA DA 40 77 05 6C D: National Language Support D: Linux Internationalization Project D: German Localization for Linux and GNU software -S: Frankenstraße +S: Frankenstraße 33 S: 34131 Kassel S: Germany @@ -937,7 +937,7 @@ S: CV5 8BZ S: United Kingdom N: Ron Holt -E: ron@holt.org +E: ron@sovereign.org W: http://www.holt.org/ W: http://www.ronholt.com/ D: Kernel development diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e432afc6473c..482fbecb0ec6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -64,8 +64,14 @@ inet_peer_gc_maxtime - INTEGER TCP variables: tcp_syn_retries - INTEGER - Number of times initial SYNs for an TCP connection attempt will - be retransmitted. Should not be higher than 255. + Number of times initial SYNs for an active TCP connection attempt + will be retransmitted. Should not be higher than 255. Default value + is 5, which corresponds to ~180seconds. + +tcp_synack_retries - INTEGER + Number of times SYNACKs for a passive TCP connection attempt will + be retransmitted. 
Should not be higher than 255. Default value + is 5, which corresponds to ~180seconds. tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. @@ -73,15 +79,76 @@ tcp_keepalive_time - INTEGER tcp_keepalive_probes - INTEGER How many keepalive probes TCP sends out, until it decides that the - connection is broken. + connection is broken. Default value: 9. + +tcp_keepalive_interval - INTEGER + How frequently the probes are sent out. Multiplied by + tcp_keepalive_probes it is time to kill not responding connection, + after probes started. Default value: 75sec i.e. connection + will be aborted after ~11 minutes of retries. tcp_retries1 - INTEGER + How many times to retry before deciding that something is wrong + and it is necessary to report this suspicion to network layer. + Minimal RFC value is 3, it is default, which corresponds + to ~3sec-8min depending on RTO. + tcp_retries2 - INTEGER -tcp_max_delay_acks - INTEGER + How many times to retry before killing alive TCP connection. + RFC1122 says that the limit should be longer than 100 sec. + It is too small number. Default value 15 corresponds to ~13-30min + depending on RTO. + +tcp_orphan_retries - INTEGER + How many times to retry before killing TCP connection, closed + by our side. Default value 7 corresponds to ~50sec-16min + depending on RTO. If your machine is a loaded WEB server, + you should think about lowering this value, such sockets + may consume significant resources. Cf. tcp_max_orphans. + tcp_fin_timeout - INTEGER -tcp_max_ka_probes - INTEGER -tcp_hoe_retransmits - INTEGER - Undocumented for now. + Time to hold socket in state FIN-WAIT-2, if it was closed + by our side. Peer can be broken and never close its side, + or even died unexpectedly. Default value is 60sec.
+ Usual value used in 2.2 was 180 seconds, you may restore + it, but remember that if your machine is even underloaded WEB server, + you risk to overflow memory with kilotons of dead sockets, + FIN-WAIT-2 sockets are less dangerous than FIN-WAIT-1, + because they eat maximum 1.5K of memory, but they tend + to live longer. Cf. tcp_max_orphans. + +tcp_max_tw_buckets - INTEGER + Maximal number of timewait sockets held by system simultaneously. + If this number is exceeded time-wait socket is immediately destroyed + and warning is printed. This limit exists only to prevent + simple DoS attacks, you _must_ not lower the limit artificially, + but rather increase it (probably, after increasing installed memory), + if network conditions require more than default value. + +tcp_tw_recycle - BOOLEAN + Enable fast recycling TIME-WAIT sockets. Default value is 1. + It should not be changed without advice/request of technical + experts. + +tcp_max_orphans - INTEGER + Maximal number of TCP sockets not attached to any user file handle, + held by system. If this number is exceeded orphaned connections are + reset immediately and warning is printed. This limit exists + only to prevent simple DoS attacks, you _must_ not rely on this + or lower the limit artificially, but rather increase it + (probably, after increasing installed memory), + if network conditions require more than default value, + and tune network services to linger and kill such states + more aggressively. Let me remind you again: each orphan eats + up to ~64K of unswappable memory. + +tcp_abort_on_overflow - BOOLEAN + If listening service is too slow to accept new connections, + reset them. Default state is FALSE. It means that if overflow + occurred due to a burst, connection will recover. Enable this + option _only_ if you are really sure that listening daemon + cannot be tuned to accept connections faster. Enabling this + option can harm clients of your server.
tcp_syncookies - BOOLEAN Only valid when the kernel was compiled with CONFIG_SYNCOOKIES @@ -89,15 +156,36 @@ tcp_syncookies - BOOLEAN overflows. This is to prevent against the common 'syn flood attack' Default: FALSE + Note, that syncookies is fallback facility. + It MUST NOT be used to help highly loaded servers to stand + against legal connection rate. If you see synflood warnings + in your logs, but investigation shows that they occur + because of overload with legal connections, you should tune + other parameters until this warning disappears. + See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. + + syncookies seriously violate TCP protocol, do not allow + to use TCP extensions, can result in serious degradation + of some services (f.e. SMTP relaying), visible not by you, + but your clients and relays, contacting you. While you see + synflood warnings in logs not being really flooded, your server + is seriously misconfigured. + tcp_stdurg - BOOLEAN Use the Host requirements interpretation of the TCP urg pointer field. Most hosts use the older BSD interpretation, so if you turn this on Linux might not communicate correctly with them. Default: FALSE -tcp_syn_taildrop - BOOLEAN tcp_max_syn_backlog - INTEGER - Undocumented (work in progress) + Maximal number of remembered connection requests, which + still have not received an acknowledgement from connecting client. + Default value is 1024 for systems with more than 128Mb of memory, + and 128 for low memory machines. If server suffers from overload, + try to increase this number. Warning! If you make it greater + than 1024, it would be better to change TCP_SYNQ_HSIZE in + include/net/tcp.h to keep TCP_SYNQ_HSIZE*16<=tcp_max_syn_backlog + and to recompile kernel. tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323.
@@ -116,8 +204,15 @@ tcp_retrans_collapse - BOOLEAN ip_local_port_range - 2 INTEGERS Defines the local port range that is used by TCP and UDP to choose the local port. The first number is the first, the - second the last local port number. For high-usage systems - change this to 32768-61000. + second the last local port number. Default value depends on + amount of memory available on the system: + > 128Mb 32768-61000 + < 128Mb 1024-4999 or even less. + This number defines number of active connections, which this + system can issue simultaneously to systems not supporting + TCP extensions (timestamps). With tcp_tw_recycle enabled + (i.e. by default) range 1024-4999 is enough to issue up to + 2000 connections per second to systems supporting timestamps. icmp_echo_ignore_all - BOOLEAN icmp_echo_ignore_broadcasts - BOOLEAN @@ -201,7 +296,7 @@ rp_filter - BOOLEAN 0 - No source validation. - Default value is 0. Note that some distribution enable it + Default value is 0. Note that some distributions enable it in startip scripts. Alexey Kuznetsov. 
@@ -210,4 +305,4 @@ kuznet@ms2.inr.ac.ru Updated by: Andi Kleen ak@muc.de -$Id: ip-sysctl.txt,v 1.11 2000/01/08 20:32:41 davem Exp $ +$Id: ip-sysctl.txt,v 1.13 2000/01/18 08:24:09 davem Exp $ diff --git a/Documentation/networking/wan-router.txt b/Documentation/networking/wan-router.txt index 5fc1afffccde..f82ceb548f63 100644 --- a/Documentation/networking/wan-router.txt +++ b/Documentation/networking/wan-router.txt @@ -1,19 +1,26 @@ ------------------------------------------------------------------------------ WAN Router for Linux Operating System ------------------------------------------------------------------------------ +Version 2.1.1 - Nov 08, 1999 +Version 2.0.8 - Nov 02, 1999 +Version 2.0.7 - Aug 26, 1999 +Version 2.0.6 - Aug 17, 1999 +Version 2.0.5 - Aug 12, 1999 +Version 2.0.4 - Nov 26, 1998 +Version 2.0.3 - Aug 25, 1998 +Version 2.0.2 - Dec 09, 1997 Version 2.0.1 - Nov 28, 1997 Version 2.0.0 - Nov 06, 1997 Version 1.0.3 - June 3, 1997 Version 1.0.1 - January 30, 1997 -Author: Jaspreet Singh - Gene Kozin -Copyright (c) 1995-1997 Sangoma Technologies Inc. +Author: Nenad Corbic +Copyright (c) 1995-1999 Sangoma Technologies Inc. ------------------------------------------------------------------------------ WARNING: This Version of WANPIPE supports only the S508 and S508/FT1 cards. IF YOU OWN A S502E OR A S508 CARD THEN PLEASE CONTACT SANGOMA TECHNOLOGIES FOR -AN UPGRADE. +AN UPGRADE. ONLY THE BiSYNC STREAMING CODE IS SUPPORTED ON S502E/S503 cards. INTRODUCTION @@ -129,11 +136,55 @@ product. REVISION HISTORY +2.1.1 Nov 09, 1999 - New code for S514PCI card + - Completely redesigned drivers + fully tested and optimized. + +2.0.8 Nov 02, 1999 - Fixed up the X25API code. + - Clear call bug fixed. + - Enabled driver for multi-card + operation. + +2.0.7 Aug 26, 1999 - Merged X25API code into WANPIPE. + - Fixed a memory leak for X25API + - Updated the X25API code for 2.2.X kernels. + - Improved NEM handling.
+ +2.0.6 Aug 17, 1999 - Kernel patch works for both 2.2.10 and 2.2.11 kernels + - Fixed up 2.0.5 installation bugs + - No functional difference between 2.0.6 and 2.0.5 + +2.0.5 Aug 12, 1999 - NEW PPP, interrupt driven code + - NEW X25 Xpipmon debugger + - Comments added to setup scripts + - Numerous bug fixes + +2.0.4 Nov 26, 1998 - NEW Cisco Dual Port support. + - NEW support for BiSync Streaming API. + - NEW support for HDLC (LAPB) API. + - WANPIPE provides an API for application + development using the BSD socket interface. + +2.0.3 Aug 25, 1998 - NEW support for Cisco HDLC, with cpipemon + utility for monitoring + - CIR support for Frame-relay + - Support for PAP and CHAP for ppp has been + implemented + - Dynamic IP assignment for PPP + - Multiple channel IPX support for Frame-relay + and X25 + - Inverse Arp support for Frame-relay + - FT1 Configuration utility for linux + - Man Pages for router.conf, router, sdladump, + cfgft1, fpipemon, ppipemon and cpipemon + +2.0.2 Dec 09, 1997 - Implemented PAP and CHAP for ppp. + 2.0.1 Nov 28, 1997 - Protection of "enable_irq()" while "disable_irq()" has been enabled from any other routine (for Frame Relay, PPP and X25). - - Added additional Stats for Fpipemon and Ppipemon - Improved Load Sharing for multiple boards. - + - Added additional Stats for Fpipemon and Ppipemon + - Improved Load Sharing for multiple boards. 2.0.0 Nov 07, 1997 - Implemented protection of RACE conditions by critical flags for FRAME RELAY and PPP.
@@ -173,7 +224,7 @@ REVISION HISTORY 1.0.1 January 30, 1997 - Implemented user-readable status and statistics - via /proc file system + via /proc filesystem 1.0.0 December 31, 1996 diff --git a/Documentation/networking/wanpipe.txt b/Documentation/networking/wanpipe.txt index 0be0c5dc12fd..7cb28178e908 100644 --- a/Documentation/networking/wanpipe.txt +++ b/Documentation/networking/wanpipe.txt @@ -1,36 +1,23 @@ ------------------------------------------------------------------------------ -WANPIPE(tm) Multiprotocol WAN Driver for Linux WAN Router +Linux WAN Router Utilities Package ------------------------------------------------------------------------------ -Release 4.1 -November 17, 1997 -Author: Jaspreet Singh -Copyright (c) 1995-1997 Sangoma Technologies Inc. +Version 2.1.1 +Nov 08, 1999 +Author: Nenad Corbic +Copyright (c) 1995-1999 Sangoma Technologies Inc. ------------------------------------------------------------------------------ INTRODUCTION -WANPIPE(tm) is a family of intelligent multiprotocol WAN communication adapters -for personal computers (ISA bus) designed to provide PC connectivity to -various communication links, such as leased lines and public data networks, at -speeds up to T1/E1 using a variety of synchronous communications protocols, -including frame relay, PPP, X.25, SDLC, etc. +This is a set of utilities and shell scripts you need in order to be able to +use Linux kernel-level WAN Router. Please read WAN Router User's manual +(router.txt) and WANPIPE driver documentation found in /usr/lib/router/doc +directory for installation and configuration instructions. -WANPIPE driver together with Linux WAN Router module allows you to build a -relatively inexpensive, yet high-performance multiprotocol WAN router. For -more information about the Linux WAN Router please read the file -Documentation/networking/wan-router.txt. You must also obtain the WAN Tools -package to be able to use the Linux WAN Router and WANPIPE driver. 
The package -is available via the Internet from Sangoma Technologies' anonymous FTP server: +You can find the latest version of this software in /pub/linux directory on +Sangoma Technologies' anonymous FTP server (ftp.sangoma.com). - ftp.sangoma.com/pub/linux/wantools-X.Y.Z.tgz - or - ftp.sangoma.com/pub/linux/wanpipe-X.Y.Z.tgz - -The names of the packages differ only due to naming convention. The -functionality of wantools and wanpipe packages are the same. The latest -version of the WAN Drivers is wanpipe-2.0.0. - -For technical questions and/or comments please e-mail to jaspreet@sangoma.com. +For technical questions and/or comments please e-mail to ncorbic@sangoma.com. For general inquiries please contact Sangoma Technologies Inc. by Hotline: 1-800-388-2475 (USA and Canada, toll free) @@ -57,117 +44,214 @@ Ave, Cambridge, MA 02139, USA. -NEW IN THIS RELEASE - - o This Version of WANPIPE supports only the S508 and S508/FT1 cards. IF YOU - OWN A S502E OR A S508 CARD THEN PLEASE CONTACT SANGOMA TECHNOLOGIES FOR AN - UPGRADE. - o Protection of "enable_irq()" while "disable_irq()" has been enabled from - any other routine (for Frame Relay, PPP and X25). - o Added additional Stats for Fpipemon and Ppipemon. 
- o Improved Load Sharing for multiple boards - - -FILE LIST - -drivers/net: - README.wanpipe This file - sdladrv.c SDLA support module source code - sdla_fr.c SDLA Frame Relay source code - sdla_ppp.c SDLA PPP source code - sdla_x25.c SDLA X.25 source code - sdlamain.c SDLA support source code - -include/linux: - sdla_x25.h SDLA X.25 firmware API definitions - sdla_fr.h SDLA frame relay firmware API definitions - sdla_ppp.h SDLA PPP firmware API definitions - wanpipe.h WANPIPE API definitions - sdladrv.h SDLA support module API definitions - sdlasfm.h SDLA firmware module definitions - router.h - - -REVISION HISTORY - -4.1 November 28, 1997 - o Protection of "enable_irq()" while "disable_irq()" has been enabled - from any other routine (for Frame Relay, PPP and X25). - o Added additional Stats for Fpipemon and Ppipemon - o Improved Load Sharing for multiple boards - - -4.0 November 06, 1997 - o Implemented better protection of RACE conditions by critical flags for - FRAME RELAY, PPP and X25. - o DLCI List interrupt mode implemented for DLCI specific CIR. - o IPX support for FRAME RELAY, PPP and X25. - o IPX Server Support (MARS) for FRAME RELAY, PPP and X25. - o More driver specific stats included. - o MULTICAST for FRAME RELAY and PPP. - -3.1.0 January 30, 1997 - - o Implemented IOCTL for executing adapter commands. - o Fixed a bug in frame relay code causing driver configured as a FR - switch to be stuck in WAN_DISCONNECTED mode. - -3.0.0 December 31, 1996 - - o Uses Linux WAN Router interface - o Added support for X.25 routing - o Miscellaneous bug fixes and performance improvements - -2.4.1 December 18, 1996 +ACKNOWLEDGEMENTS - o Added support for LMI and Q.933 frame relay link management +This product is based on the WANPIPE(tm) Multiprotocol WAN Router developed +by Sangoma Technologies Inc. for Linux 2.2.x. 
Success of the WANPIPE +together with the next major release of Linux kernel in summer 1996 commanded +adequate changes to the WANPIPE code to take full advantage of new Linux +features. -2.3.0 October 17, 1996 +Instead of continuing developing proprietary interface tied to Sangoma WAN +cards, we decided to separate all hardware-independent code into a separate +module and defined two levels of interfaces - one for user-level applications +and another for kernel-level WAN drivers. WANPIPE is now implemented as a +WAN driver compliant with the WAN Link Driver interface. Also a general +purpose WAN configuration utility and a set of shell scripts was developed to +support WAN router at the user level. - o All shell scripts use meta-configuration file - o Miscellaneous bug fixes +Many useful ideas concerning hardware-independent interface implementation +were given by Mike McLagan and his implementation +of the Frame Relay router and drivers for Sangoma cards (dlci/sdla). -2.2.0 July 16, 1996 +With the new implementation of the APIs being incorporated into the WANPIPE, +a special thank goes to Alan Cox in providing insight into BSD sockets. - o Compatible with Linux 2.0 - o Added uninstall script - o User's Manual is available in HTML format +Special thanks to all the WANPIPE users who performed field-testing, reported +bugs and made valuable comments and suggestions that help us to improve this +product. 
-2.1.0 June 20, 1996 - o Added support for synchronous PPP - o Added support for S503 adapter - o Added API for executing adapter commands - o Fixed a re-entrancy problem in frame relay driver - o Changed interface between SDLA driver and protocol support modules - o Updated frame relay firmware -2.0.0 May 1, 1996 - - o Added interactive installation and configuration scripts - o Added System V-style start-up script - o Added dynamic memory window address selection in SDLA driver - o Miscellaneous bug fixes in SDLA driver - o Updated S508 frame relay firmware - o Changed SFM file format +NEW IN THIS RELEASE -1.0.0 February 12, 1996 +o Renamed startup script to wanrouter +o Option to turn off/on each router + separately +o New source directory /usr/lib/wanrouter +o New PPP driver +o X25 is not supported in this release + + +PRODUCT COMPONENTS AND RELATED FILES + +/etc: + wanpipe1.conf default router configuration file + wanrouter.rc meta-configuration file (used by the Setup script) + +/lib/modules/X.Y.Z/misc: + wanrouter.o router kernel loadable module + +/lib/modules/X.Y.Z/net: + sdladrv.o Sangoma SDLA support module + wanpipe.o Sangoma WANPIPE(tm) driver module + +/proc/net/wanrouter + Config reads current router configuration + Status reads current router status + {name} reads WAN driver statistics + +/usr/sbin: + wanrouter router start-up script + wanconfig router configuration utility + sdladump WANPIPE adapter memory dump utility + fpipemon Monitor for Frame Relay + cpipemon Monitor for Cisco HDLC + +/usr/lib/wanrouter: + README this file + COPYING GNU General Public License + Setup installation script + Configure configuration script + Filelist distribution definition file + +/usr/lib/wanrouter/doc: + WANPIPE_USER_MANUAL.txt WAN Router User's Manual + WANPIPE_CONFIG.txt WAN Configuration Manual + +/usr/lib/wanrouter/interfaces: + * interface configuration files (TCP/IP configuration) + +/usr/lib/wanrouter/patches: + wanrouter-22.gz patch for Linux kernel 
2.2.10 and 2.2.11 + (compatible for all 2.2.X kernels) + wanrouter-20.gz patch for Linux kernel 2.0.36 + + Fix_2.2.11.gz patch to fix the 2.2.11 kernel so other patches + can be applied properly. + +/usr/lib/wanrouter/samples: + interface sample interface configuration file + wanpipe1.cpri CHDLC primary port + wanpipe2.csec CHDLC secondary port + wanpipe1.fr Frame Relay protocol + wanpipe1.ppp PPP protocol ) + wanrouter.rc sample meta-configuration file + +/usr/lib/wanrouter/src: + * wan-tools source code + +/usr/include/linux: + wanrouter.h router API definitions + wanpipe.h WANPIPE API definitions + sdladrv.h SDLA support module API definitions + sdlasfm.h SDLA firmware module definitions - o Final release - o Added support for Linux 1.3 - o Updated S508 frame relay firmware +/usr/src/linux/net/router: + * router source code -0.9.0 December 21, 1995 +/var/log: + wanrouter router start-up log (created by the Setup script) - o Added SNAP encapsulation for routed frames - o Added support for the frame relay switch emulation mode - o Added support for S508 adapter - o Added capability to autodetect adapter type - o Miscellaneous bug fixes in SDLA and frame relay drivers +/var/lock: + wanrouter router lock file (created by the Setup script) -0.1.0 October 12, 1995 +/usr/lib/wanrouter/wanpipe: + fr514.sfm Frame relay firmware for Sangoma S508/S514 card + cdual514.sfm Dual Port Cisco HDLC firmware for Sangoma S508/S514 card + ppp514.sfm PPP Firmware for Sangoma S508 and S514 cards - o Initial version ->>>>>>> END OF README <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +REVISION HISTORY +1.0.0 December 31, 1996 Initial version + +1.0.1 January 30, 1997 Status and statistics can be read via /proc + filesystem entries. + +1.0.2 April 30, 1997 Added UDP management via monitors. 
+ +1.0.3 June 3, 1997 UDP management for multiple boards using Frame + Relay and PPP + Enabled continuous transmission of Configure + Request Packet for PPP (for 508 only) + Connection Timeout for PPP changed from 900 to 0 + Flow Control Problem fixed for Frame Relay + +1.0.4 July 10, 1997 S508/FT1 monitoring capability in fpipemon and + ppipemon utilities. + Configurable TTL for UDP packets. + Multicast and Broadcast IP source addresses are + silently discarded. + +1.0.5 July 28, 1997 Configurable T391,T392,N391,N392,N393 for Frame + Relay in router.conf. + Configurable Memory Address through router.conf + for Frame Relay, PPP and X.25. (commenting this + out enables auto-detection). + Fixed freeing up received buffers using kfree() + for Frame Relay and X.25. + Protect sdla_peek() by calling save_flags(), + cli() and restore_flags(). + Changed number of Trace elements from 32 to 20 + Added DLCI specific data monitoring in FPIPEMON. +2.0.0 Nov 07, 1997 Implemented protection of RACE conditions by + critical flags for FRAME RELAY and PPP. + DLCI List interrupt mode implemented. + IPX support in FRAME RELAY and PPP. + IPX Server Support (MARS) + More driver specific stats included in FPIPEMON + and PIPEMON. + +2.0.1 Nov 28, 1997 Bug Fixes for version 2.0.0. + Protection of "enable_irq()" while + "disable_irq()" has been enabled from any other + routine (for Frame Relay, PPP and X25). + Added additional Stats for Fpipemon and Ppipemon + Improved Load Sharing for multiple boards + +2.0.2 Dec 09, 1997 Support for PAP and CHAP for ppp has been + implemented. + +2.0.3 Aug 15, 1998 New release supporting Cisco HDLC, CIR for Frame + relay, Dynamic IP assignment for PPP and Inverse + Arp support for Frame-relay. Man Pages are + included for better support and a new utility + for configuring FT1 cards. + +2.0.4 Dec 09, 1998 Dual Port support for Cisco HDLC. + Support for HDLC (LAPB) API. + Supports BiSync Streaming code for S502E + and S503 cards. 
+ Support for Streaming HDLC API. + Provides a BSD socket interface for + creating applications using BiSync + streaming. + +2.0.5 Aug 04, 1999 CHDLC initializatin bug fix. + PPP interrupt driven driver: + Fix to the PPP line hangup problem. + New PPP firmware + Added comments to the startup SYSTEM ERROR messages + Xpipemon debugging application for the X25 protocol + New USER_MANUAL.txt + Fixed the odd boundary 4byte writes to the board. + BiSync Streaming code has been taken out. + Available as a patch. + Streaming HDLC API has been taken out. + Available as a patch. + +2.0.6 Aug 17, 1999 Increased debugging in statup scripts + Fixed insallation bugs from 2.0.5 + Kernel patch works for both 2.2.10 and 2.2.11 kernels. + There is no functional difference between the two packages + +2.0.7 Aug 26, 1999 o Merged X25API code into WANPIPE. + o Fixed a memeory leak for X25API + o Updated the X25API code for 2.2.X kernels. + o Improved NEM handling. + +2.1.0 Oct 25, 1999 o New code for S514 PCI Card + o New CHDLC and Frame Relay drivers + o PPP and X25 are not supported in this release +>>>>>> END OF README <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/MAINTAINERS b/MAINTAINERS index 6135fc893e3c..48468fde61d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -631,7 +631,7 @@ M: pavel@atrey.karlin.mff.cuni.cz S: Maintained NETWORKING [GENERAL] -P: Networking Teak +P: Networking Team M: netdev@oss.sgi.com L: linux-net@vger.rutgers.edu W: http://www.uk.linux.org/NetNews.html (2.0 only) diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 1be5a02e3990..420efe1e16af 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -1,4 +1,4 @@ -/* $Id: process.c,v 1.143 2000/01/09 09:13:28 anton Exp $ +/* $Id: process.c,v 1.144 2000/01/21 11:38:39 jj Exp $ * linux/arch/sparc/kernel/process.c * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) @@ -293,7 +293,6 @@ void show_thread(struct thread_struct *thread) printk("uwinmask: 0x%08lx kregs: 0x%08lx\n", thread->uwinmask, (unsigned long)thread->kregs); show_regs(thread->kregs); - printk("sig_address: 0x%08lx sig_desc: 0x%08lx\n", thread->sig_address, thread->sig_desc); printk("ksp: 0x%08lx kpc: 0x%08lx\n", thread->ksp, thread->kpc); printk("kpsr: 0x%08lx kwim: 0x%08lx\n", thread->kpsr, thread->kwim); printk("fork_kpsr: 0x%08lx fork_kwim: 0x%08lx\n", thread->fork_kpsr, thread->fork_kwim); @@ -595,7 +594,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->fpu.fpstatus.fpq_count = current->thread.fpqdepth; memcpy(&dump->fpu.fpstatus.fpq[0], &current->thread.fpqueue[0], ((sizeof(unsigned long) * 2) * 16)); - dump->sigcode = current->thread.sig_desc; + dump->sigcode = 0; } /* diff --git a/arch/sparc/kernel/signal.c b/arch/sparc/kernel/signal.c index ea0c7e1ffe39..98b54240252c 100644 --- a/arch/sparc/kernel/signal.c +++ b/arch/sparc/kernel/signal.c @@ -1,4 +1,4 @@ -/* $Id: signal.c,v 1.99 1999/12/27 06:08:32 anton Exp $ +/* $Id: signal.c,v 1.101 2000/01/21 11:38:38 jj Exp $ * linux/arch/sparc/kernel/signal.c * * Copyright (C) 1991, 1992 Linus Torvalds @@ -424,12 +424,15 @@ static inline void *get_sigframe(struct sigaction *sa, struct pt_regs *regs, uns } static inline void -setup_frame(struct sigaction *sa, unsigned long pc, unsigned long npc, - struct pt_regs *regs, int signr, sigset_t *oldset) +setup_frame(struct sigaction *sa, struct pt_regs *regs, int signr, sigset_t *oldset, siginfo_t *info) { struct signal_sframe *sframep; struct sigcontext *sc; int window = 0, err; + unsigned long pc = regs->pc; + unsigned long npc = regs->npc; + void *sig_address; + int sig_code; synchronize_user_stack(); sframep = (struct signal_sframe *)get_sigframe(sa, regs, SF_ALIGNEDSZ); @@ -474,18 +477,63 @@ setup_frame(struct sigaction *sa, unsigned long pc, unsigned long npc, sizeof(struct reg_window)); current->thread.w_saved = 0; /*
So process is allowed to execute. */ + err |= __put_user(signr, &sframep->sig_num); - if(signr == SIGSEGV || - signr == SIGILL || - signr == SIGFPE || - signr == SIGBUS || - signr == SIGEMT) { - err |= __put_user(current->thread.sig_desc, &sframep->sig_code); - err |= __put_user(current->thread.sig_address, &sframep->sig_address); - } else { - err |= __put_user(0, &sframep->sig_code); - err |= __put_user(0, &sframep->sig_address); + sig_address = NULL; + sig_code = 0; + if (SI_FROMKERNEL (info) && (info->si_code & __SI_MASK) == __SI_FAULT) { + sig_address = info->si_addr; + switch (signr) { + case SIGSEGV: + switch (info->si_code) { + case SEGV_MAPERR: sig_code = SUBSIG_NOMAPPING; break; + default: sig_code = SUBSIG_PROTECTION; break; + } + break; + case SIGILL: + switch (info->si_code) { + case ILL_ILLOPC: sig_code = SUBSIG_ILLINST; break; + case ILL_PRVOPC: sig_code = SUBSIG_PRIVINST; break; + case ILL_ILLTRP: sig_code = SUBSIG_BADTRAP (info->si_trapno); break; + default: sig_code = SUBSIG_STACK; break; + } + break; + case SIGFPE: + switch (info->si_code) { + case FPE_INTDIV: sig_code = SUBSIG_IDIVZERO; break; + case FPE_INTOVF: sig_code = SUBSIG_FPINTOVFL; break; + case FPE_FLTDIV: sig_code = SUBSIG_FPDIVZERO; break; + case FPE_FLTOVF: sig_code = SUBSIG_FPOVFLOW; break; + case FPE_FLTUND: sig_code = SUBSIG_FPUNFLOW; break; + case FPE_FLTRES: sig_code = SUBSIG_FPINEXACT; break; + case FPE_FLTINV: sig_code = SUBSIG_FPOPERROR; break; + default: sig_code = SUBSIG_FPERROR; break; + } + break; + case SIGBUS: + switch (info->si_code) { + case BUS_ADRALN: sig_code = SUBSIG_ALIGNMENT; break; + case BUS_ADRERR: sig_code = SUBSIG_MISCERROR; break; + default: sig_code = SUBSIG_BUSTIMEOUT; break; + } + break; + case SIGEMT: + switch (info->si_code) { + case EMT_TAGOVF: sig_code = SUBSIG_TAG; break; + } + break; + case SIGSYS: + if (info->si_code == (__SI_FAULT|0x100)) { + /* See sys_sunos.c */ + sig_code = info->si_trapno; + break; + } + default: + sig_address = NULL; + } } 
+ err |= __put_user((long)sig_address, &sframep->sig_address); + err |= __put_user(sig_code, &sframep->sig_code); err |= __put_user(sc, &sframep->sig_scptr); if (err) goto sigsegv; @@ -791,8 +839,7 @@ setup_svr4_frame(struct sigaction *sa, unsigned long pc, unsigned long npc, /* Setup the signal information. Solaris expects a bunch of * information to be passed to the signal handler, we don't provide - * that much currently, should use those that David already - * is providing with thread.sig_desc + * that much currently, should use siginfo. */ err |= __put_user(signr, &si->siginfo.signo); err |= __put_user(SVR4_SINOINFO, &si->siginfo.code); @@ -977,7 +1024,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, else if (current->thread.new_signal) new_setup_frame (ka, regs, signr, oldset); else - setup_frame(&ka->sa, regs->pc, regs->npc, regs, signr, oldset); + setup_frame(&ka->sa, regs, signr, oldset, info); } if(ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -1074,7 +1121,16 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs * regs, struct k_sigaction *ka; siginfo_t info; + /* + * XXX Disable svr4 signal handling until solaris emulation works. 
+ * It is buggy - Anton + */ +#define SVR4_SIGNAL_BROKEN 1 +#ifdef SVR4_SIGNAL_BROKEN + int svr4_signal = 0; +#else int svr4_signal = current->personality == PER_SVR4; +#endif if (!oldset) oldset = ¤t->blocked; diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c index 58f1be4e80e5..6153eb99e3bb 100644 --- a/arch/sparc/kernel/smp.c +++ b/arch/sparc/kernel/smp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -51,7 +52,7 @@ unsigned long cpu_offset[NR_CPUS]; unsigned char boot_cpu_id = 0; unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */ int smp_activated = 0; -volatile int cpu_number_map[NR_CPUS]; +volatile int __cpu_number_map[NR_CPUS]; volatile int __cpu_logical_map[NR_CPUS]; cycles_t cacheflush_time = 0; /* XXX */ diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index 6943aa4e6f6d..3f4cebb33ce2 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -1,4 +1,4 @@ -/* $Id: sparc_ksyms.c,v 1.86 2000/01/09 10:46:49 anton Exp $ +/* $Id: sparc_ksyms.c,v 1.87 2000/01/21 17:41:14 anton Exp $ * arch/sparc/kernel/ksyms.c: Sparc specific ksyms support. * * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index ec105ec18e7c..b589712aaee1 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -57,13 +58,15 @@ extern struct cpuinfo_sparc cpu_data[NR_CPUS]; extern unsigned long cpu_offset[NR_CPUS]; extern unsigned char boot_cpu_id; extern int smp_activated; -extern volatile int cpu_number_map[NR_CPUS]; +extern volatile int __cpu_number_map[NR_CPUS]; extern volatile int __cpu_logical_map[NR_CPUS]; extern volatile unsigned long ipi_count; extern volatile int smp_process_available; extern volatile int smp_commenced; extern int __smp4d_processor_id(void); +extern unsigned long totalram_pages; + /* #define SMP_DEBUG */ #ifdef SMP_DEBUG @@ -138,10 +141,10 @@ void __init smp4d_callin(void) cpu_leds[cpuid] = 0x9; show_leds(cpuid); - current->mm->mmap->vm_page_prot = PAGE_SHARED; - current->mm->mmap->vm_start = PAGE_OFFSET; - current->mm->mmap->vm_end = init_mm.mmap->vm_end; - + /* Attach to the address space of init_task. */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + local_flush_cache_all(); local_flush_tlb_all(); @@ -189,12 +192,12 @@ void __init smp4d_boot_cpus(void) cpu_present_map |= (1<processor = boot_cpu_id; smp_store_cpu_info(boot_cpu_id); @@ -218,12 +221,19 @@ void __init smp4d_boot_cpus(void) /* Cook up an idler for this guy. 
*/ kernel_thread(start_secondary, NULL, CLONE_PID); - p = task[++cpucount]; + cpucount++; + + p = init_task.prev_task; + init_tasks[i] = p; p->processor = i; p->has_cpu = 1; /* we schedule the first task manually */ + current_set[i] = p; - + + del_from_runqueue(p); + unhash_process(p); + for (no = 0; no < linux_num_cpus; no++) if (linux_cpus[no].mid == i) break; @@ -254,7 +264,7 @@ void __init smp4d_boot_cpus(void) if(cpu_callin_map[i]) { /* Another "Red Snapper". */ - cpu_number_map[i] = cpucount; + __cpu_number_map[i] = cpucount; __cpu_logical_map[cpucount] = i; } else { cpucount--; @@ -263,7 +273,7 @@ void __init smp4d_boot_cpus(void) } if(!(cpu_callin_map[i])) { cpu_present_map &= ~(1 << i); - cpu_number_map[i] = -1; + __cpu_number_map[i] = -1; } } local_flush_cache_all(); @@ -289,13 +299,23 @@ void __init smp4d_boot_cpus(void) } /* Free unneeded trap tables */ - - mem_map[MAP_NR((unsigned long)trapbase_cpu1)].flags &= ~(1 << PG_reserved); + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu1)); + set_page_count(mem_map + MAP_NR(trapbase_cpu1), 1); free_page((unsigned long)trapbase_cpu1); - mem_map[MAP_NR((unsigned long)trapbase_cpu2)].flags &= ~(1 << PG_reserved); + totalram_pages++; + num_physpages++; + + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu2)); + set_page_count(mem_map + MAP_NR(trapbase_cpu2), 1); free_page((unsigned long)trapbase_cpu2); - mem_map[MAP_NR((unsigned long)trapbase_cpu3)].flags &= ~(1 << PG_reserved); + totalram_pages++; + num_physpages++; + + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu3)); + set_page_count(mem_map + MAP_NR(trapbase_cpu3), 1); free_page((unsigned long)trapbase_cpu3); + totalram_pages++; + num_physpages++; /* Ok, they are spinning and ready to go. 
*/ smp_processors_ready = 1; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index 2d2d97810372..36c3d3c259d9 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -52,13 +53,15 @@ extern struct cpuinfo_sparc cpu_data[NR_CPUS]; extern unsigned long cpu_offset[NR_CPUS]; extern unsigned char boot_cpu_id; extern int smp_activated; -extern volatile int cpu_number_map[NR_CPUS]; +extern volatile int __cpu_number_map[NR_CPUS]; extern volatile int __cpu_logical_map[NR_CPUS]; extern volatile unsigned long ipi_count; extern volatile int smp_process_available; extern volatile int smp_commenced; extern int __smp4m_processor_id(void); +extern unsigned long totalram_pages; + /*#define SMP_DEBUG*/ #ifdef SMP_DEBUG @@ -84,6 +87,7 @@ void __init smp4m_callin(void) local_flush_cache_all(); local_flush_tlb_all(); + set_irq_udt(mid_xlate[boot_cpu_id]); /* Get our local ticker going. */ @@ -91,6 +95,7 @@ void __init smp4m_callin(void) calibrate_delay(); smp_store_cpu_info(cpuid); + local_flush_cache_all(); local_flush_tlb_all(); @@ -104,22 +109,21 @@ void __init smp4m_callin(void) /* Allow master to continue. */ swap((unsigned long *)&cpu_callin_map[cpuid], 1); + local_flush_cache_all(); local_flush_tlb_all(); cpu_probe(); - while(!task[cpuid] || current_set[cpuid] != task[cpuid]) - barrier(); - /* Fix idle thread fields. */ __asm__ __volatile__("ld [%0], %%g6\n\t" : : "r" (¤t_set[cpuid]) : "memory" /* paranoid */); - current->mm->mmap->vm_page_prot = PAGE_SHARED; - current->mm->mmap->vm_start = PAGE_OFFSET; - current->mm->mmap->vm_end = init_mm.mmap->vm_end; - + + /* Attach to the address space of init_task. 
*/ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + while(!smp_commenced) barrier(); @@ -152,21 +156,23 @@ void __init smp4m_boot_cpus(void) printk("Entering SMP Mode...\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_offset[i] = (char *)&cpu_data[i] - (char *)&cpu_data; - __sti(); cpu_present_map = 0; + for(i=0; i < linux_num_cpus; i++) cpu_present_map |= (1<processor = boot_cpu_id; + smp_store_cpu_info(boot_cpu_id); set_irq_udt(mid_xlate[boot_cpu_id]); smp_setup_percpu_timer(); @@ -187,12 +193,19 @@ void __init smp4m_boot_cpus(void) /* Cook up an idler for this guy. */ kernel_thread(start_secondary, NULL, CLONE_PID); - p = task[++cpucount]; + cpucount++; + + p = init_task.prev_task; + init_tasks[i] = p; p->processor = i; p->has_cpu = 1; /* we schedule the first task manually */ + current_set[i] = p; + del_from_runqueue(p); + unhash_process(p); + /* See trampoline.S for details... */ entry += ((i-1) * 3); @@ -220,7 +233,7 @@ void __init smp4m_boot_cpus(void) } if(cpu_callin_map[i]) { /* Another "Red Snapper". 
*/ - cpu_number_map[i] = i; + __cpu_number_map[i] = i; __cpu_logical_map[i] = i; } else { cpucount--; @@ -229,7 +242,7 @@ void __init smp4m_boot_cpus(void) } if(!(cpu_callin_map[i])) { cpu_present_map &= ~(1 << i); - cpu_number_map[i] = -1; + __cpu_number_map[i] = -1; } } local_flush_cache_all(); @@ -265,18 +278,26 @@ void __init smp4m_boot_cpus(void) cpu_data[prev].next = first; /* Free unneeded trap tables */ - if (!(cpu_present_map & (1 << 1))) { - mem_map[MAP_NR((unsigned long)trapbase_cpu1)].flags &= ~(1 << PG_reserved); + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu1)); + set_page_count(mem_map + MAP_NR(trapbase_cpu1), 1); free_page((unsigned long)trapbase_cpu1); + totalram_pages++; + num_physpages++; } if (!(cpu_present_map & (1 << 2))) { - mem_map[MAP_NR((unsigned long)trapbase_cpu2)].flags &= ~(1 << PG_reserved); + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu2)); + set_page_count(mem_map + MAP_NR(trapbase_cpu2), 1); free_page((unsigned long)trapbase_cpu2); + totalram_pages++; + num_physpages++; } if (!(cpu_present_map & (1 << 3))) { - mem_map[MAP_NR((unsigned long)trapbase_cpu3)].flags &= ~(1 << PG_reserved); + ClearPageReserved(mem_map + MAP_NR(trapbase_cpu3)); + set_page_count(mem_map + MAP_NR(trapbase_cpu3), 1); free_page((unsigned long)trapbase_cpu3); + totalram_pages++; + num_physpages++; } /* Ok, they are spinning and ready to go. 
*/ diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index 69379de4a17d..ad449a435cf7 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -1,4 +1,4 @@ -/* $Id: sys_sparc.c,v 1.56 2000/01/04 11:01:26 jj Exp $ +/* $Id: sys_sparc.c,v 1.57 2000/01/21 11:38:42 jj Exp $ * linux/arch/sparc/kernel/sys_sparc.c * * This file contains various random system calls that @@ -260,11 +260,19 @@ c_sys_nis_syscall (struct pt_regs *regs) asmlinkage void sparc_breakpoint (struct pt_regs *regs) { + siginfo_t info; + lock_kernel(); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc); #endif - force_sig(SIGTRAP, current); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_BRKPT; + info.si_addr = (void *)regs->pc; + info.si_trapno = 0; + force_sig_info(SIGTRAP, &info, current); + #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc); #endif diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index ddac348fe76a..c0155e65b47a 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -1,4 +1,4 @@ -/* $Id: sys_sunos.c,v 1.108 2000/01/06 23:51:46 davem Exp $ +/* $Id: sys_sunos.c,v 1.110 2000/01/21 11:38:40 jj Exp $ * sys_sunos.c: SunOS specific syscall compatibility support. * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) @@ -366,6 +366,7 @@ asmlinkage unsigned long sunos_sigblock(unsigned long blk_mask) spin_lock_irq(¤t->sigmask_lock); old = current->blocked.sig[0]; current->blocked.sig[0] |= (blk_mask & _BLOCKABLE); + recalc_sigpending(current); spin_unlock_irq(¤t->sigmask_lock); return old; } @@ -377,6 +378,7 @@ asmlinkage unsigned long sunos_sigsetmask(unsigned long newmask) spin_lock_irq(¤t->sigmask_lock); retval = current->blocked.sig[0]; current->blocked.sig[0] = (newmask & _BLOCKABLE); + recalc_sigpending(current); spin_unlock_irq(¤t->sigmask_lock); return retval; } @@ -595,15 +597,22 @@ asmlinkage int sunos_uname(struct sunos_utsname *name) asmlinkage int sunos_nosys(void) { struct pt_regs *regs; + siginfo_t info; + static int cnt; lock_kernel(); regs = current->thread.kregs; - current->thread.sig_address = regs->pc; - current->thread.sig_desc = regs->u_regs[UREG_G1]; - send_sig(SIGSYS, current, 1); - printk("Process makes ni_syscall number %d, register dump:\n", - (int) regs->u_regs[UREG_G1]); - show_regs(regs); + info.si_signo = SIGSYS; + info.si_errno = 0; + info.si_code = __SI_FAULT|0x100; + info.si_addr = (void *)regs->tpc; + info.si_trapno = regs->u_regs[UREG_G1]; + send_sig_info(SIGSYS, &info, current); + if (cnt++ < 4) { + printk("Process makes ni_syscall number %d, register dump:\n", + (int) regs->u_regs[UREG_G1]); + show_regs(regs); + } unlock_kernel(); return -ENOSYS; } diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S index 6da3c3f2125c..47aea979b466 100644 --- a/arch/sparc/kernel/systbls.S +++ b/arch/sparc/kernel/systbls.S @@ -1,4 +1,4 @@ -/* $Id: systbls.S,v 1.90 2000/01/11 17:33:20 jj Exp $ +/* $Id: systbls.S,v 1.91 2000/01/16 06:20:44 davem Exp $ * systbls.S: System call entry point tables for OS compatibility. * The native Linux system call table lives here also. 
* @@ -33,15 +33,15 @@ sys_call_table: /*60*/ .long sys_umask, sys_chroot, sys_newfstat, sys_fstat64, sys_getpagesize /*65*/ .long sys_msync, sys_vfork, sys_pread, sys_pwrite, sys_geteuid /*70*/ .long sys_getegid, sys_mmap, sys_setreuid, sys_munmap, sys_mprotect -/*75*/ .long sys_setregid, sys_vhangup, sys_truncate64, sys_getgroups, sys_getgroups16 +/*75*/ .long sys_nis_syscall, sys_vhangup, sys_truncate64, sys_nis_syscall, sys_getgroups16 /*80*/ .long sys_setgroups16, sys_getpgrp, sys_setgroups, sys_setitimer, sys_ftruncate64 /*85*/ .long sys_swapon, sys_getitimer, sys_setuid, sys_sethostname, sys_setgid /*90*/ .long sys_dup2, sys_setfsuid, sys_fcntl, sys_select, sys_setfsgid /*95*/ .long sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall /*100*/ .long sys_getpriority, sys_rt_sigreturn, sys_rt_sigaction, sys_rt_sigprocmask, sys_rt_sigpending /*105*/ .long sys_rt_sigtimedwait, sys_rt_sigqueueinfo, sys_rt_sigsuspend, sys_setresuid, sys_getresuid -/*110*/ .long sys_setresgid, sys_getresgid, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall -/*115*/ .long sys_nis_syscall, sys_gettimeofday, sys_getrusage, sys_nis_syscall, sys_getcwd +/*110*/ .long sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall +/*115*/ .long sys_getgroups, sys_gettimeofday, sys_getrusage, sys_nis_syscall, sys_getcwd /*120*/ .long sys_readv, sys_writev, sys_settimeofday, sys_fchown16, sys_fchmod /*125*/ .long sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate /*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 008567aba50b..295988f5838e 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -1,4 +1,4 @@ -/* $Id: time.c,v 1.49 1999/11/17 07:34:07 zaitcev Exp $ +/* $Id: time.c,v 1.50 2000/01/21 04:35:53 anton Exp $ * linux/arch/sparc/kernel/time.c * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) @@ -229,7 +229,7 @@ static __inline__ void sun4_clock_probe(void) sp_clock_typ = INTERSIL; r.start = sun4_clock_physaddr; intersil_clock = (struct intersil *) - sparc_ioremap(&r, 0, sizeof(*intersil_clock), "intersil"); + sbus_ioremap(&r, 0, sizeof(*intersil_clock), "intersil"); mstk48t02_regs = 0; /* just be sure */ mstk48t08_regs = 0; /* ditto */ /* initialise the clock */ diff --git a/arch/sparc/kernel/traps.c b/arch/sparc/kernel/traps.c index 31b64b0cd66a..d7159f9ff0e4 100644 --- a/arch/sparc/kernel/traps.c +++ b/arch/sparc/kernel/traps.c @@ -1,7 +1,8 @@ -/* $Id: traps.c,v 1.60 1999/08/14 03:51:31 anton Exp $ +/* $Id: traps.c,v 1.61 2000/01/21 11:38:41 jj Exp $ * arch/sparc/kernel/traps.c * * Copyright 1995 David S. Miller (davem@caip.rutgers.edu) + * Copyright 2000 Jakub Jelinek (jakub@redhat.com) */ /* @@ -128,6 +129,8 @@ void die_if_kernel(char *str, struct pt_regs *regs) void do_hw_interrupt(unsigned long type, unsigned long psr, unsigned long pc) { + siginfo_t info; + lock_kernel(); if(type < 0x80) { /* Sun OS's puke from bad traps, Linux survives! */ @@ -135,22 +138,23 @@ void do_hw_interrupt(unsigned long type, unsigned long psr, unsigned long pc) die_if_kernel("Whee... Hello Mr. 
Penguin", current->thread.kregs); } - if(type == SP_TRAP_SBPT) { - send_sig(SIGTRAP, current, 1); - } else { - if(psr & PSR_PS) - die_if_kernel("Kernel bad trap", current->thread.kregs); - - current->thread.sig_desc = SUBSIG_BADTRAP(type - 0x80); - current->thread.sig_address = pc; - send_sig(SIGILL, current, 1); - } + if(psr & PSR_PS) + die_if_kernel("Kernel bad trap", current->thread.kregs); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_ILLTRP; + info.si_addr = (void *)pc; + info.si_trapno = type - 0x80; + force_sig_info(SIGILL, &info, current); unlock_kernel(); } void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); if(psr & PSR_PS) die_if_kernel("Kernel illegal instruction", regs); @@ -163,9 +167,12 @@ void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned lon if (!do_user_muldiv (regs, pc)) goto out; } - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_ILLINST; - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_ILLOPC; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); out: unlock_kernel(); } @@ -173,12 +180,17 @@ out: void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); if(psr & PSR_PS) die_if_kernel("Penguin instruction from Penguin mode??!?!", regs); - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_PRVOPC; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); unlock_kernel(); } @@ -187,6 +199,8 @@ void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long n void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned long npc, 
unsigned long psr) { + siginfo_t info; + lock_kernel(); if(regs->psr & PSR_PS) { printk("KERNEL MNA at pc %08lx npc %08lx called by %08lx\n", pc, npc, @@ -194,14 +208,17 @@ void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned lon die_if_kernel("BOGUS", regs); /* die_if_kernel("Kernel MNA access", regs); */ } - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_PRIVINST; #if 0 show_regs (regs); instruction_dump ((unsigned long *) regs->pc); printk ("do_MNA!\n"); #endif - send_sig(SIGBUS, current, 1); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRALN; + info.si_addr = /* FIXME: Should dig out mna address */ (void *)0; + info.si_trapno = 0; + send_sig_info(SIGBUS, &info, current); unlock_kernel(); } @@ -269,6 +286,8 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { static int calls = 0; + siginfo_t info; + unsigned long fsr; int ret = 0; #ifndef __SMP__ struct task_struct *fpt = last_task_used_math; @@ -326,8 +345,6 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc, } /* nope, better SIGFPE the offending process... 
*/ - fpt->thread.sig_address = pc; - fpt->thread.sig_desc = SUBSIG_FPERROR; /* as good as any */ #ifdef __SMP__ fpt->flags &= ~PF_USEDFPU; #endif @@ -345,7 +362,26 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc, regs); goto out; } - send_sig(SIGFPE, fpt, 1); + + fsr = fpt->thread.fsr; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void *)pc; + info.si_trapno = 0; + info.si_code = __SI_FAULT; + if ((fsr & 0x1c000) == (1 << 14)) { + if (fsr & 0x10) + info.si_code = FPE_FLTINV; + else if (fsr & 0x08) + info.si_code = FPE_FLTOVF; + else if (fsr & 0x04) + info.si_code = FPE_FLTUND; + else if (fsr & 0x02) + info.si_code = FPE_FLTDIV; + else if (fsr & 0x01) + info.si_code = FPE_FLTRES; + } + send_sig_info(SIGFPE, &info, fpt); #ifndef __SMP__ last_task_used_math = NULL; #endif @@ -359,12 +395,17 @@ out: void handle_tag_overflow(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); if(psr & PSR_PS) die_if_kernel("Penguin overflow trap from kernel mode", regs); - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_TAG; /* as good as any */ - send_sig(SIGEMT, current, 1); + info.si_signo = SIGEMT; + info.si_errno = 0; + info.si_code = EMT_TAGOVF; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGEMT, &info, current); unlock_kernel(); } @@ -385,40 +426,69 @@ void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc void handle_reg_access(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); #ifdef TRAP_DEBUG printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n", pc, npc, psr); #endif - send_sig(SIGILL, current, 1); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_OBJERR; + info.si_addr = (void *)pc; + info.si_trapno = 0; + force_sig_info(SIGBUS, &info, current); unlock_kernel(); } void handle_cp_disabled(struct 
pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_COPROC; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); unlock_kernel(); } void handle_cp_exception(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); #ifdef TRAP_DEBUG printk("Co-Processor Exception at PC %08lx NPC %08lx PSR %08lx\n", pc, npc, psr); #endif - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_COPROC; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); unlock_kernel(); } void handle_hw_divzero(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { + siginfo_t info; + lock_kernel(); - send_sig(SIGILL, current, 1); + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = FPE_INTDIV; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGFPE, &info, current); + unlock_kernel(); } diff --git a/arch/sparc/kernel/unaligned.c b/arch/sparc/kernel/unaligned.c index ff9295e9d601..592a2a4c0096 100644 --- a/arch/sparc/kernel/unaligned.c +++ b/arch/sparc/kernel/unaligned.c @@ -1,4 +1,4 @@ -/* $Id: unaligned.c,v 1.19 1999/08/14 03:51:33 anton Exp $ +/* $Id: unaligned.c,v 1.20 2000/01/21 11:38:42 jj Exp $ * unaligned.c: Unaligned load/store trap handling with special * cases for the kernel to do them more quickly. 
* @@ -422,9 +422,14 @@ void user_mna_trap_fault(struct pt_regs *regs, unsigned int insn) __asm__ ("user void user_mna_trap_fault(struct pt_regs *regs, unsigned int insn) { - current->thread.sig_address = regs->pc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGBUS, current, 1); + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRALN; + info.si_addr = (void *)compute_effective_address(regs, insn); + info.si_trapno = 0; + send_sig_info(SIGBUS, &info, current); } asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) @@ -487,9 +492,7 @@ asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) } kill_user: - current->thread.sig_address = regs->pc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGBUS, current, 1); + user_mna_trap_fault(regs, insn); out: unlock_kernel(); } diff --git a/arch/sparc/mm/asyncd.c b/arch/sparc/mm/asyncd.c index 569940417c42..0034cb60b085 100644 --- a/arch/sparc/mm/asyncd.c +++ b/arch/sparc/mm/asyncd.c @@ -1,4 +1,4 @@ -/* $Id: asyncd.c,v 1.19 2000/01/08 20:22:16 davem Exp $ +/* $Id: asyncd.c,v 1.20 2000/01/21 11:38:47 jj Exp $ * The asyncd kernel daemon. This handles paging on behalf of * processes that receive page faults due to remote (async) memory * accesses. 
@@ -116,6 +116,7 @@ static int fault_in_page(int taskid, pgd_t *pgd; pmd_t *pmd; pte_t *pte; + siginfo_t info; if (!tsk || !tsk->mm) return 1; @@ -179,9 +180,12 @@ no_memory: bad_area: stats.failure++; - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_NOMAPPING; - send_sig(SIGSEGV, tsk, 1); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void *)address; + info.si_trapno = 0; + send_sig_info(SIGSEGV, &info, tsk); return 1; } diff --git a/arch/sparc/mm/fault.c b/arch/sparc/mm/fault.c index ba75681b1d5b..d9981e68b6a5 100644 --- a/arch/sparc/mm/fault.c +++ b/arch/sparc/mm/fault.c @@ -1,4 +1,4 @@ -/* $Id: fault.c,v 1.111 1999/10/24 13:45:59 anton Exp $ +/* $Id: fault.c,v 1.113 2000/01/21 11:38:47 jj Exp $ * fault.c: Page fault handlers for the Sparc. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) @@ -197,8 +197,10 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, struct mm_struct *mm = tsk->mm; unsigned int fixup; unsigned long g2; + siginfo_t info; int from_user = !(regs->psr & PSR_PS); + info.si_code = SEGV_MAPERR; if(text_fault) address = regs->pc; @@ -207,10 +209,12 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * context, we must not take the fault.. */ if (in_interrupt() || !mm) - goto do_kernel_fault; + goto no_context; down(&mm->mmap_sem); - /* The kernel referencing a bad kernel pointer can lock up + + /* + * The kernel referencing a bad kernel pointer can lock up * a sun4c machine completely, so we must attempt recovery. */ if(!from_user && address >= PAGE_OFFSET) @@ -230,6 +234,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * we can handle it.. 
*/ good_area: + info.si_code = SEGV_ACCERR; if(write) { if(!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -238,18 +243,47 @@ good_area: if(!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - if (!handle_mm_fault(current, vma, address, write)) - goto do_sigbus; + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + { + int fault = handle_mm_fault(tsk, vma, address, write); + if (fault < 0) + goto out_of_memory; + if (!fault) + goto do_sigbus; + } up(&mm->mmap_sem); return; + /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up(&mm->mmap_sem); + + /* User mode accesses just cause a SIGSEGV */ + if(from_user) { +#if 0 + printk("Fault whee %s [%d]: segfaults at %08lx pc=%08lx\n", + tsk->comm, tsk->pid, address, regs->pc); +#endif + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code set above to make clear whether + this was a SEGV_MAPERR or SEGV_ACCERR fault. */ + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGSEGV, &info, tsk); + return; + } + /* Is this in ex_table? */ -do_kernel_fault: +no_context: g2 = regs->u_regs[UREG_G2]; if (!from_user && (fixup = search_exception_table (regs->pc, &g2))) { if (fixup > 10) { /* Values below are reserved for other things */ @@ -276,26 +310,31 @@ do_kernel_fault: return; } } - if(from_user) { -#if 0 - printk("Fault whee %s [%d]: segfaults at %08lx pc=%08lx\n", - tsk->comm, tsk->pid, address, regs->pc); -#endif - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_NOMAPPING; - force_sig(SIGSEGV, tsk); - return; - } + unhandled_fault (address, tsk, regs); - return; + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. 
+ */ +out_of_memory: + up(&mm->mmap_sem); + printk("VM: killing process %s\n", tsk->comm); + if (from_user) + do_exit(SIGKILL); + goto no_context; do_sigbus: up(&mm->mmap_sem); - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_MISCERROR; - force_sig(SIGBUS, tsk); - if (! from_user) - goto do_kernel_fault; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGBUS, &info, tsk); + if (!from_user) + goto no_context; } asmlinkage void do_sun4c_fault(struct pt_regs *regs, int text_fault, int write, @@ -385,6 +424,9 @@ inline void force_user_fault(unsigned long address, int write) struct vm_area_struct *vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + siginfo_t info; + + info.si_code = SEGV_MAPERR; #if 0 printk("wf\n", @@ -401,6 +443,7 @@ inline void force_user_fault(unsigned long address, int write) if(expand_stack(vma, address)) goto bad_area; good_area: + info.si_code = SEGV_ACCERR; if(write) { if(!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -418,16 +461,23 @@ bad_area: printk("Window whee %s [%d]: segfaults at %08lx\n", tsk->comm, tsk->pid, address); #endif - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_NOMAPPING; - send_sig(SIGSEGV, tsk, 1); + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code set above to make clear whether + this was a SEGV_MAPERR or SEGV_ACCERR fault. 
*/ + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGSEGV, &info, tsk); return; do_sigbus: up(&mm->mmap_sem); - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_MISCERROR; - force_sig(SIGBUS, tsk); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGBUS, &info, tsk); } void window_overflow_fault(void) diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index 817861a19459..bb16064ec0b0 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -1,4 +1,4 @@ -/* $Id: init.c,v 1.73 2000/01/15 00:51:26 anton Exp $ +/* $Id: init.c,v 1.76 2000/01/21 18:16:55 anton Exp $ * linux/arch/sparc/mm/init.c * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) @@ -45,9 +45,9 @@ unsigned long sparc_unmapped_base; struct pgtable_cache_struct pgt_quicklists; /* References to section boundaries */ -extern char __init_begin, __init_end, _start, _end, etext , edata; +extern char __init_begin, __init_end, _start, end, etext , edata; -static unsigned long totalram_pages = 0; +unsigned long totalram_pages = 0; /* * BAD_PAGE is the page that is used for page faults when linux @@ -165,7 +165,7 @@ unsigned long __init bootmem_init(void) /* Start with page aligned address of last symbol in kernel * image. */ - start_pfn = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &_end)); + start_pfn = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &end)); /* Adjust up to the physical address where the kernel begins. 
*/ start_pfn += phys_base; @@ -199,11 +199,11 @@ unsigned long __init bootmem_init(void) #ifdef DEBUG_BOOTMEM prom_printf("reserve_bootmem: base[%lx] size[%lx]\n", phys_base, - (((start_pfn << PAGE_SHIFT) + - bootmap_size) - phys_base)); + (start_pfn << PAGE_SHIFT) + + bootmap_size + PAGE_SIZE-1 - phys_base); #endif - reserve_bootmem(phys_base, (((start_pfn << PAGE_SHIFT) + - bootmap_size) - phys_base)); + reserve_bootmem(phys_base, (start_pfn << PAGE_SHIFT) + + bootmap_size + PAGE_SIZE-1 - phys_base); #ifdef DEBUG_BOOTMEM prom_printf("init_bootmem: return end_pfn[%lx]\n", end_pfn); @@ -366,10 +366,12 @@ void __init mem_init(void) int datapages = 0; int initpages = 0; int i; +#ifdef CONFIG_BLK_DEV_INITRD unsigned long addr, last; +#endif /* Saves us work later. */ - memset((void *) ZERO_PAGE(0), 0, PAGE_SIZE); + memset((void *)&empty_zero_page, 0, PAGE_SIZE); i = last_valid_pfn >> (8 + 5); i += 1; @@ -386,7 +388,7 @@ void __init mem_init(void) /* fix this */ #ifdef CONFIG_BLK_DEV_INITRD addr = __va(phys_base); - last = PAGE_ALIGN((unsigned long)&_end) + phys_base; + last = PAGE_ALIGN((unsigned long)&end) + phys_base; while(addr < last) { if (initrd_below_start_ok && addr >= initrd_start && addr < initrd_end) mem_map[MAP_NR(addr)].flags &= ~(1<> 10); } void si_meminfo(struct sysinfo *val) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 77e9aa841166..10990f3ead6a 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1,4 +1,4 @@ -/* $Id: srmmu.c,v 1.203 2000/01/15 00:51:28 anton Exp $ +/* $Id: srmmu.c,v 1.205 2000/01/21 17:59:46 anton Exp $ * srmmu.c: SRMMU specific routines for memory management. * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) @@ -1204,8 +1204,12 @@ static inline void map_kernel(void) { int i; + if (phys_base > 0) { + do_large_mapping(PAGE_OFFSET, phys_base); + } + for (i = 0; sp_banks[i].num_bytes != 0; i++) { - map_spbank(__va(sp_banks[i].base_addr), i); + map_spbank((unsigned long)__va(sp_banks[i].base_addr), i); } init_mm.mmap->vm_start = PAGE_OFFSET; @@ -1255,7 +1259,7 @@ void __init srmmu_paging_init(void) last_valid_pfn = end_pfn = bootmem_init(); - srmmu_allocate_ptable_skeleton(KERNBASE, __va(end_of_phys_memory)); + srmmu_allocate_ptable_skeleton(KERNBASE, (unsigned long)__va(end_of_phys_memory)); #if CONFIG_SUN_IO srmmu_allocate_ptable_skeleton(sparc_iomap.start, IOBASE_END); srmmu_allocate_ptable_skeleton(DVMA_VADDR, DVMA_END); diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in index 448dad773de4..ace1366bae52 100644 --- a/arch/sparc64/config.in +++ b/arch/sparc64/config.in @@ -1,4 +1,4 @@ -# $Id: config.in,v 1.86 1999/12/23 01:46:09 davem Exp $ +# $Id: config.in,v 1.87 2000/01/16 06:18:53 davem Exp $ # For a description of the syntax of this configuration file, # see the Configure script. # diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c index cba5cfac3bd2..7a7315003fff 100644 --- a/arch/sparc64/kernel/ioctl32.c +++ b/arch/sparc64/kernel/ioctl32.c @@ -1,4 +1,4 @@ -/* $Id: ioctl32.c,v 1.73 2000/01/11 01:06:47 davem Exp $ +/* $Id: ioctl32.c,v 1.74 2000/01/15 04:47:48 davem Exp $ * ioctl32.c: Conversion between 32bit and 64bit native ioctls. * * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) @@ -39,6 +39,7 @@ #include #include #include +#include #include /* Ugly hack. 
*/ @@ -1734,6 +1735,24 @@ static int do_unimap_ioctl(struct file *file, int cmd, struct unimapdesc32 *user return 0; } +static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + __kernel_uid_t kuid; + int err; + + cmd = SMB_IOC_GETMOUNTUID; + + set_fs(KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long)&kuid); + set_fs(old_fs); + + if (err >= 0) + err = put_user(kuid, (__kernel_uid_t32 *)arg); + + return err; +} + asmlinkage int sys32_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct file * filp; @@ -1921,6 +1940,11 @@ asmlinkage int sys32_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) error = do_video_ioctl(fd, cmd, arg); goto out; + /* One SMB ioctl needs translations. */ + case _IOR('u', 1, __kernel_uid_t32): /* SMB_IOC_GETMOUNTUID */ + error = do_smb_getmountuid(fd, cmd, arg); + goto out; + /* List here exlicitly which ioctl's are known to have * compatable types passed or none at all... */ @@ -2427,6 +2451,9 @@ asmlinkage int sys32_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) case RAW_SETBIND: case RAW_GETBIND: + /* SMB ioctls which do not need any translations */ + case SMB_IOC_NEWCONN: + error = sys_ioctl (fd, cmd, arg); goto out; diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index b70d936c35f4..820b93bb4c7b 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -1,4 +1,4 @@ -/* $Id: irq.c,v 1.80 1999/12/06 03:14:48 davem Exp $ +/* $Id: irq.c,v 1.81 2000/01/21 06:33:59 davem Exp $ * irq.c: UltraSparc IRQ handling/init/registry. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) @@ -756,7 +756,7 @@ void handler_irq(int irq, struct pt_regs *regs) * of our buddy. 
*/ if(should_forward != 0) { - buddy = cpu_number_map[cpu] + 1; + buddy = cpu_number_map(cpu) + 1; if (buddy >= NR_CPUS || (buddy = cpu_logical_map(buddy)) == -1) buddy = cpu_logical_map(0); diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 922e74d2e69c..503f5b875a8f 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -1,4 +1,4 @@ -/* $Id: process.c,v 1.102 1999/12/15 22:24:49 davem Exp $ +/* $Id: process.c,v 1.103 2000/01/21 11:38:53 jj Exp $ * arch/sparc64/kernel/process.c * * Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu) @@ -364,8 +364,6 @@ void show_thread(struct thread_struct *thread) printk("kregs: 0x%016lx\n", (unsigned long)thread->kregs); show_regs(thread->kregs); #endif - printk("sig_address: 0x%016lx\n", thread->sig_address); - printk("sig_desc: 0x%016lx\n", thread->sig_desc); printk("ksp: 0x%016lx\n", thread->ksp); if (thread->w_saved) { @@ -701,7 +699,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump) memcpy(&dump->fpu.fpstatus.fregs.regs[0], ¤t->thread.float_regs[0], (sizeof(unsigned long) * 32)); dump->fpu.fpstatus.fsr = current->thread.fsr; dump->fpu.fpstatus.flags = dump->fpu.fpstatus.extra = 0; - dump->sigcode = current->thread.sig_desc; #endif } diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index f0e512666532..f226a8ae5d84 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -1,4 +1,4 @@ -/* $Id: signal32.c,v 1.58 2000/01/14 09:40:08 jj Exp $ +/* $Id: signal32.c,v 1.59 2000/01/21 11:38:52 jj Exp $ * arch/sparc64/kernel/signal32.c * * Copyright (C) 1991, 1992 Linus Torvalds @@ -101,6 +101,44 @@ struct rt_signal_frame32 { #define NF_ALIGNEDSZ (((sizeof(struct new_signal_frame32) + 7) & (~7))) #define RT_ALIGNEDSZ (((sizeof(struct rt_signal_frame32) + 7) & (~7))) +int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) +{ + int err; + + if (!access_ok (VERIFY_WRITE, to, 
sizeof(siginfo_t32))) + return -EFAULT; + + err = __put_user(from->si_signo, &to->si_signo); + err |= __put_user(from->si_errno, &to->si_errno); + err |= __put_user(from->si_code, &to->si_code); + if (from->si_code < 0) + err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); + else { + int signo = from->si_signo; + if (from->si_code == SI_USER || from->si_code == SI_KERNEL) + signo = SIGRTMIN; + switch (signo) { + case SIGCHLD: + err |= __put_user(from->si_utime, &to->si_utime); + err |= __put_user(from->si_stime, &to->si_stime); + err |= __put_user(from->si_status, &to->si_status); + default: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + break; + case SIGSEGV: + case SIGILL: + case SIGFPE: + case SIGBUS: + case SIGEMT: + err |= __put_user(from->si_trapno, &to->si_trapno); + err |= __put_user((long)from->si_addr, &to->si_addr); + break; + } + } + return err; +} + /* * atomically swap in the new signal mask, and wait for a signal. * This is really tricky on the Sparc, watch out... @@ -436,13 +474,16 @@ static inline void *get_sigframe(struct sigaction *sa, struct pt_regs *regs, uns } static void -setup_frame32(struct sigaction *sa, unsigned long pc, unsigned long npc, - struct pt_regs *regs, int signr, sigset_t *oldset) +setup_frame32(struct sigaction *sa, struct pt_regs *regs, int signr, sigset_t *oldset, siginfo_t *info) { struct signal_sframe32 *sframep; struct sigcontext32 *sc; unsigned seta[_NSIG_WORDS32]; int err = 0; + void *sig_address; + int sig_code; + unsigned long pc = regs->tpc; + unsigned long npc = regs->tnpc; #if 0 int window = 0; @@ -513,17 +554,61 @@ setup_frame32(struct sigaction *sa, unsigned long pc, unsigned long npc, current->thread.w_saved = 0; /* So process is allowed to execute. 
*/ err |= __put_user(signr, &sframep->sig_num); - if(signr == SIGSEGV || - signr == SIGILL || - signr == SIGFPE || - signr == SIGBUS || - signr == SIGEMT) { - err |= __put_user(current->thread.sig_desc, &sframep->sig_code); - err |= __put_user(current->thread.sig_address, &sframep->sig_address); - } else { - err |= __put_user(0, &sframep->sig_code); - err |= __put_user(0, &sframep->sig_address); + sig_address = NULL; + sig_code = 0; + if (SI_FROMKERNEL (info) && (info->si_code & __SI_MASK) == __SI_FAULT) { + sig_address = info->si_addr; + switch (signr) { + case SIGSEGV: + switch (info->si_code) { + case SEGV_MAPERR: sig_code = SUBSIG_NOMAPPING; break; + default: sig_code = SUBSIG_PROTECTION; break; + } + break; + case SIGILL: + switch (info->si_code) { + case ILL_ILLOPC: sig_code = SUBSIG_ILLINST; break; + case ILL_PRVOPC: sig_code = SUBSIG_PRIVINST; break; + case ILL_ILLTRP: sig_code = SUBSIG_BADTRAP (info->si_trapno); break; + default: sig_code = SUBSIG_STACK; break; + } + break; + case SIGFPE: + switch (info->si_code) { + case FPE_INTDIV: sig_code = SUBSIG_IDIVZERO; break; + case FPE_INTOVF: sig_code = SUBSIG_FPINTOVFL; break; + case FPE_FLTDIV: sig_code = SUBSIG_FPDIVZERO; break; + case FPE_FLTOVF: sig_code = SUBSIG_FPOVFLOW; break; + case FPE_FLTUND: sig_code = SUBSIG_FPUNFLOW; break; + case FPE_FLTRES: sig_code = SUBSIG_FPINEXACT; break; + case FPE_FLTINV: sig_code = SUBSIG_FPOPERROR; break; + default: sig_code = SUBSIG_FPERROR; break; + } + break; + case SIGBUS: + switch (info->si_code) { + case BUS_ADRALN: sig_code = SUBSIG_ALIGNMENT; break; + case BUS_ADRERR: sig_code = SUBSIG_MISCERROR; break; + default: sig_code = SUBSIG_BUSTIMEOUT; break; + } + break; + case SIGEMT: + switch (info->si_code) { + case EMT_TAGOVF: sig_code = SUBSIG_TAG; break; + } + break; + case SIGSYS: + if (info->si_code == (__SI_FAULT|0x100)) { + /* See sys_sunos32.c */ + sig_code = info->si_trapno; + break; + } + default: + sig_address = NULL; + } } + err |= 
__put_user((long)sig_address, &sframep->sig_address); + err |= __put_user(sig_code, &sframep->sig_code); err |= __put_user((u64)sc, &sframep->sig_scptr); if (err) goto sigsegv; @@ -790,8 +875,7 @@ setup_svr4_frame32(struct sigaction *sa, unsigned long pc, unsigned long npc, /* Setup the signal information. Solaris expects a bunch of * information to be passed to the signal handler, we don't provide - * that much currently, should use those that David already - * is providing with thread.sig_desc + * that much currently, should use siginfo. */ err |= __put_user(signr, &si->siginfo.signo); err |= __put_user(SVR4_SINOINFO, &si->siginfo.code); @@ -1034,61 +1118,8 @@ static inline void setup_rt_frame32(struct k_sigaction *ka, struct pt_regs *regs err |= __put_user(0, &sf->fpu_save); } - /* Update the siginfo structure. Is this good? */ - if (info->si_code == 0) { - info->si_signo = signr; - info->si_errno = 0; - - switch (signr) { - case SIGSEGV: - case SIGILL: - case SIGFPE: - case SIGBUS: - case SIGEMT: - info->si_code = current->thread.sig_desc; - info->si_addr = (void *)current->thread.sig_address; - info->si_trapno = 0; - break; - default: - break; - } - } - - err = __put_user (info->si_signo, &sf->info.si_signo); - err |= __put_user (info->si_errno, &sf->info.si_errno); - err |= __put_user (info->si_code, &sf->info.si_code); - if (info->si_code < 0) - err |= __copy_to_user (sf->info._sifields._pad, info->_sifields._pad, SI_PAD_SIZE); - else { - i = info->si_signo; - if (info->si_code == SI_USER) - i = SIGRTMIN; - switch (i) { - case SIGPOLL: - err |= __put_user (info->si_band, &sf->info.si_band); - err |= __put_user (info->si_fd, &sf->info.si_fd); - break; - case SIGCHLD: - err |= __put_user (info->si_pid, &sf->info.si_pid); - err |= __put_user (info->si_uid, &sf->info.si_uid); - err |= __put_user (info->si_status, &sf->info.si_status); - err |= __put_user (info->si_utime, &sf->info.si_utime); - err |= __put_user (info->si_stime, &sf->info.si_stime); - break; - 
case SIGSEGV: - case SIGILL: - case SIGFPE: - case SIGBUS: - case SIGEMT: - err |= __put_user ((long)info->si_addr, &sf->info.si_addr); - err |= __put_user (info->si_trapno, &sf->info.si_trapno); - break; - default: - err |= __put_user (info->si_pid, &sf->info.si_pid); - err |= __put_user (info->si_uid, &sf->info.si_uid); - break; - } - } + /* Update the siginfo structure. */ + err |= copy_siginfo_to_user32(&sf->info, info); /* Setup sigaltstack */ err |= __put_user(current->sas_ss_sp, &sf->stack.ss_sp); @@ -1174,7 +1205,7 @@ static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka, else if (current->thread.flags & SPARC_FLAG_NEWSIGNALS) new_setup_frame32(ka, regs, signr, oldset); else - setup_frame32(&ka->sa, regs->tpc, regs->tnpc, regs, signr, oldset); + setup_frame32(&ka->sa, regs, signr, oldset, info); } if(ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 2fa4945d82fd..6e8899435f83 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -38,7 +38,7 @@ extern unsigned prom_cpu_nodes[]; struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64))); -volatile int cpu_number_map[NR_CPUS] __attribute__ ((aligned (64))); +volatile int __cpu_number_map[NR_CPUS] __attribute__ ((aligned (64))); volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64))); /* Please don't make this stuff initdata!!! 
--DaveM */ @@ -243,7 +243,7 @@ void __init smp_boot_cpus(void) udelay(100); } if(callin_flag) { - cpu_number_map[i] = cpucount; + __cpu_number_map[i] = cpucount; __cpu_logical_map[cpucount] = i; prom_cpu_nodes[i] = linux_cpus[no].prom_node; prom_printf("OK\n"); @@ -255,7 +255,7 @@ void __init smp_boot_cpus(void) } if(!callin_flag) { cpu_present_map &= ~(1UL << i); - cpu_number_map[i] = -1; + __cpu_number_map[i] = -1; } } cpu_new_task = NULL; @@ -697,10 +697,10 @@ void __init smp_tick_init(void) for(i = 0; i < linux_num_cpus; i++) cpu_present_map |= (1UL << linux_cpus[i].mid); for(i = 0; i < NR_CPUS; i++) { - cpu_number_map[i] = -1; + __cpu_number_map[i] = -1; __cpu_logical_map[i] = -1; } - cpu_number_map[boot_cpu_id] = 0; + __cpu_number_map[boot_cpu_id] = 0; prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node; __cpu_logical_map[0] = boot_cpu_id; current->processor = boot_cpu_id; diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index af4743ec127a..206eee5d0a99 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -1,4 +1,4 @@ -/* $Id: sys_sparc.c,v 1.33 2000/01/11 17:33:25 jj Exp $ +/* $Id: sys_sparc.c,v 1.34 2000/01/21 11:39:06 jj Exp $ * linux/arch/sparc64/kernel/sys_sparc.c * * This file contains various random system calls that @@ -223,11 +223,18 @@ c_sys_nis_syscall (struct pt_regs *regs) asmlinkage void sparc_breakpoint (struct pt_regs *regs) { + siginfo_t info; + lock_kernel(); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc); #endif - force_sig(SIGTRAP, current); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_BRKPT; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + force_sig_info(SIGTRAP, &info, current); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc); #endif diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 
4083680603df..61b9a33977b3 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -1,4 +1,4 @@ -/* $Id: sys_sparc32.c,v 1.130 2000/01/14 09:40:07 jj Exp $ +/* $Id: sys_sparc32.c,v 1.131 2000/01/21 11:38:54 jj Exp $ * sys_sparc32.c: Conversion between 32bit and 64bit native syscalls. * * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) @@ -2082,7 +2082,7 @@ sys32_rt_sigtimedwait(sigset_t32 *uthese, siginfo_t32 *uinfo, sigset_t s; sigset_t32 s32; struct timespec t; - int ret, err, i; + int ret; mm_segment_t old_fs = get_fs(); siginfo_t info; @@ -2104,42 +2104,8 @@ sys32_rt_sigtimedwait(sigset_t32 *uthese, siginfo_t32 *uinfo, ret = sys_rt_sigtimedwait(&s, &info, &t, sigsetsize); set_fs (old_fs); if (ret >= 0 && uinfo) { - err = put_user (info.si_signo, &uinfo->si_signo); - err |= __put_user (info.si_errno, &uinfo->si_errno); - err |= __put_user (info.si_code, &uinfo->si_code); - if (info.si_code < 0) - err |= __copy_to_user (uinfo->_sifields._pad, info._sifields._pad, SI_PAD_SIZE); - else { - i = info.si_signo; - if (info.si_code == SI_USER) - i = SIGRTMIN; - switch (i) { - case SIGPOLL: - err |= __put_user (info.si_band, &uinfo->si_band); - err |= __put_user (info.si_fd, &uinfo->si_fd); - break; - case SIGCHLD: - err |= __put_user (info.si_pid, &uinfo->si_pid); - err |= __put_user (info.si_uid, &uinfo->si_uid); - err |= __put_user (info.si_status, &uinfo->si_status); - err |= __put_user (info.si_utime, &uinfo->si_utime); - err |= __put_user (info.si_stime, &uinfo->si_stime); - break; - case SIGSEGV: - case SIGILL: - case SIGFPE: - case SIGBUS: - case SIGEMT: - err |= __put_user ((long)info.si_addr, &uinfo->si_addr); - err |= __put_user (info.si_trapno, &uinfo->si_trapno); - break; - default: - err |= __put_user (info.si_pid, &uinfo->si_pid); - err |= __put_user (info.si_uid, &uinfo->si_uid); - break; - } - } - if (err) + extern int copy_siginfo_to_user32(siginfo_t32 *, siginfo_t *); + if (copy_siginfo_to_user32(uinfo, 
&info)) ret = -EFAULT; } return ret; diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index ffc72b74d4d9..5297954ef422 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -1,4 +1,4 @@ -/* $Id: sys_sunos32.c,v 1.35 2000/01/06 23:51:50 davem Exp $ +/* $Id: sys_sunos32.c,v 1.37 2000/01/21 11:39:03 jj Exp $ * sys_sunos32.c: SunOS binary compatability layer on sparc64. * * Copyright (C) 1995, 1996, 1997 David S. Miller (davem@caip.rutgers.edu) @@ -323,6 +323,7 @@ asmlinkage u32 sunos_sigblock(u32 blk_mask) spin_lock_irq(¤t->sigmask_lock); old = (u32) current->blocked.sig[0]; current->blocked.sig[0] |= (blk_mask & _BLOCKABLE); + recalc_sigpending(current); spin_unlock_irq(¤t->sigmask_lock); return old; } @@ -334,6 +335,7 @@ asmlinkage u32 sunos_sigsetmask(u32 newmask) spin_lock_irq(¤t->sigmask_lock); retval = (u32) current->blocked.sig[0]; current->blocked.sig[0] = (newmask & _BLOCKABLE); + recalc_sigpending(current); spin_unlock_irq(¤t->sigmask_lock); return retval; } @@ -555,15 +557,22 @@ asmlinkage int sunos_uname(struct sunos_utsname *name) asmlinkage int sunos_nosys(void) { struct pt_regs *regs; + siginfo_t info; + static int cnt; lock_kernel(); regs = current->thread.kregs; - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = regs->u_regs[UREG_G1]; - send_sig(SIGSYS, current, 1); - printk("Process makes ni_syscall number %d, register dump:\n", - (int) regs->u_regs[UREG_G1]); - show_regs(regs); + info.si_signo = SIGSYS; + info.si_errno = 0; + info.si_code = __SI_FAULT|0x100; + info.si_addr = (void *)regs->tpc; + info.si_trapno = regs->u_regs[UREG_G1]; + send_sig_info(SIGSYS, &info, current); + if (cnt++ < 4) { + printk("Process makes ni_syscall number %d, register dump:\n", + (int) regs->u_regs[UREG_G1]); + show_regs(regs); + } unlock_kernel(); return -ENOSYS; } diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index b66a116b43cc..2c61f6623d89 100644 
--- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -1,4 +1,4 @@ -/* $Id: systbls.S,v 1.65 2000/01/14 07:12:34 davem Exp $ +/* $Id: systbls.S,v 1.66 2000/01/16 06:20:48 davem Exp $ * systbls.S: System call entry point tables for OS compatibility. * The native Linux system call table lives here also. * @@ -34,15 +34,15 @@ sys_call_table32: /*60*/ .word sys_umask, sys_chroot, sys32_newfstat, sys_fstat64, sys_getpagesize .word sys_msync, sys_vfork, sys32_pread, sys32_pwrite, sys_geteuid /*70*/ .word sys_getegid, sys32_mmap, sys_setreuid, sys_munmap, sys_mprotect - .word sys_setregid, sys_vhangup, sys32_truncate64, sys_getgroups, sys32_getgroups16 + .word sys_nis_syscall, sys_vhangup, sys32_truncate64, sys_nis_syscall, sys32_getgroups16 /*80*/ .word sys32_setgroups16, sys_getpgrp, sys_setgroups, sys32_setitimer, sys32_ftruncate64 .word sys_swapon, sys32_getitimer, sys_setuid, sys_sethostname, sys_setgid /*90*/ .word sys_dup2, sys_setfsuid, sys32_fcntl, sys32_select, sys_setfsgid .word sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall /*100*/ .word sys_getpriority, sys32_rt_sigreturn, sys32_rt_sigaction, sys32_rt_sigprocmask, sys32_rt_sigpending .word sys32_rt_sigtimedwait, sys32_rt_sigqueueinfo, sys32_rt_sigsuspend, sys_setresuid, sys_getresuid -/*110*/ .word sys_setresgid, sys_getresgid, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall - .word sys_nis_syscall, sys32_gettimeofday, sys32_getrusage, sys_nis_syscall, sys_getcwd +/*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall + .word sys_getgroups, sys32_gettimeofday, sys32_getrusage, sys_nis_syscall, sys_getcwd /*120*/ .word sys32_readv, sys32_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod .word sys_nis_syscall, sys32_setreuid16, sys32_setregid16, sys_rename, sys_truncate /*130*/ .word sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall diff --git a/arch/sparc64/kernel/traps.c 
b/arch/sparc64/kernel/traps.c index 8458097091ad..29032a90151d 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -1,8 +1,8 @@ -/* $Id: traps.c,v 1.64 1999/12/19 23:53:13 davem Exp $ +/* $Id: traps.c,v 1.65 2000/01/21 11:39:01 jj Exp $ * arch/sparc64/kernel/traps.c * * Copyright (C) 1995,1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1997,1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997,1999,2000 Jakub Jelinek (jakub@redhat.com) */ /* @@ -253,6 +253,8 @@ void rtrap_check(struct pt_regs *regs) void bad_trap (struct pt_regs *regs, long lvl) { + siginfo_t info; + lock_kernel (); if (lvl < 0x100) { char buffer[24]; @@ -262,9 +264,12 @@ void bad_trap (struct pt_regs *regs, long lvl) } if (regs->tstate & TSTATE_PRIV) die_if_kernel ("Kernel bad trap", regs); - current->thread.sig_desc = SUBSIG_BADTRAP(lvl - 0x100); - current->thread.sig_address = regs->tpc; - force_sig(SIGILL, current); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_ILLTRP; + info.si_addr = (void *)regs->tpc; + info.si_trapno = lvl - 0x100; + force_sig_info(SIGILL, &info, current); unlock_kernel (); } @@ -281,6 +286,8 @@ void bad_trap_tl1 (struct pt_regs *regs, long lvl) void instruction_access_exception (struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { + siginfo_t info; + lock_kernel(); if (regs->tstate & TSTATE_PRIV) { #if 1 @@ -289,15 +296,20 @@ void instruction_access_exception (struct pt_regs *regs, #endif die_if_kernel("Iax", regs); } - current->thread.sig_desc = SUBSIG_ILLINST; - current->thread.sig_address = regs->tpc; - force_sig(SIGILL, current); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + force_sig_info(SIGSEGV, &info, current); unlock_kernel(); } void data_access_exception (struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { + siginfo_t info; + if (regs->tstate & TSTATE_PRIV) { /* Test if this 
comes from uaccess places. */ unsigned long fixup, g2; @@ -326,8 +338,13 @@ void data_access_exception (struct pt_regs *regs, else rtrap_check(regs); #endif + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void *)sfar; + info.si_trapno = 0; lock_kernel(); - force_sig(SIGSEGV, current); + force_sig_info(SIGSEGV, &info, current); unlock_kernel(); } @@ -361,6 +378,22 @@ static __inline__ void clean_and_reenable_l1_caches(void) : "memory"); } +void do_iae(struct pt_regs *regs) +{ + siginfo_t info; + + clean_and_reenable_l1_caches(); + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_OBJERR; + info.si_addr = (void *)0; + info.si_trapno = 0; + lock_kernel(); + force_sig_info(SIGBUS, &info, current); + unlock_kernel(); +} + void do_dae(struct pt_regs *regs) { #ifdef CONFIG_PCI @@ -381,19 +414,7 @@ void do_dae(struct pt_regs *regs) return; } #endif - clean_and_reenable_l1_caches(); - lock_kernel(); - force_sig(SIGSEGV, current); - unlock_kernel(); -} - -void do_iae(struct pt_regs *regs) -{ - clean_and_reenable_l1_caches(); - - lock_kernel(); - force_sig(SIGSEGV, current); - unlock_kernel(); + do_iae(regs); } static char ecc_syndrome_table[] = { @@ -521,22 +542,26 @@ void do_fpe_common(struct pt_regs *regs) regs->tnpc += 4; } else { unsigned long fsr = current->thread.xfsr[0]; + siginfo_t info; - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_FPERROR; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + info.si_code = __SI_FAULT; if ((fsr & 0x1c000) == (1 << 14)) { - if (fsr & 0x01) - current->thread.sig_desc = SUBSIG_FPINEXACT; - else if (fsr & 0x02) - current->thread.sig_desc = SUBSIG_FPDIVZERO; - else if (fsr & 0x04) - current->thread.sig_desc = SUBSIG_FPUNFLOW; + if (fsr & 0x10) + info.si_code = FPE_FLTINV; else if (fsr & 0x08) - current->thread.sig_desc = SUBSIG_FPOVFLOW; - else if (fsr & 0x10) - current->thread.sig_desc 
= SUBSIG_FPINTOVFL; + info.si_code = FPE_FLTOVF; + else if (fsr & 0x04) + info.si_code = FPE_FLTUND; + else if (fsr & 0x02) + info.si_code = FPE_FLTDIV; + else if (fsr & 0x01) + info.si_code = FPE_FLTRES; } - send_sig(SIGFPE, current, 1); + send_sig_info(SIGFPE, &info, current); } } @@ -570,24 +595,34 @@ void do_fpother(struct pt_regs *regs) void do_tof(struct pt_regs *regs) { + siginfo_t info; + if(regs->tstate & TSTATE_PRIV) die_if_kernel("Penguin overflow trap from kernel mode", regs); - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_TAG; /* as good as any */ - send_sig(SIGEMT, current, 1); + info.si_signo = SIGEMT; + info.si_errno = 0; + info.si_code = EMT_TAGOVF; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + send_sig_info(SIGEMT, &info, current); } void do_div0(struct pt_regs *regs) { - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_IDIVZERO; - send_sig(SIGFPE, current, 1); + siginfo_t info; + + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = FPE_INTDIV; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + send_sig_info(SIGFPE, &info, current); } void instruction_dump (unsigned int *pc) { int i; - + if((((unsigned long) pc) & 3)) return; @@ -671,6 +706,7 @@ void do_illegal_instruction(struct pt_regs *regs) unsigned long pc = regs->tpc; unsigned long tstate = regs->tstate; u32 insn; + siginfo_t info; if(tstate & TSTATE_PRIV) die_if_kernel("Kernel illegal instruction", regs); @@ -685,56 +721,48 @@ void do_illegal_instruction(struct pt_regs *regs) return; } } - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_ILLINST; - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_ILLOPC; + info.si_addr = (void *)pc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); } void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr) { + siginfo_t info; + if(regs->tstate & 
TSTATE_PRIV) { extern void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn, unsigned long sfar, unsigned long sfsr); return kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc), sfar, sfsr); - } else { - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGBUS, current, 1); } + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRALN; + info.si_addr = (void *)sfar; + info.si_trapno = 0; + send_sig_info(SIGBUS, &info, current); } void do_privop(struct pt_regs *regs) { - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGILL, current, 1); -} - -void do_privact(struct pt_regs *regs) -{ - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGILL, current, 1); -} + siginfo_t info; -void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, - unsigned long tstate) -{ - if(tstate & TSTATE_PRIV) - die_if_kernel("Penguin instruction from Penguin mode??!?!", regs); - current->thread.sig_address = pc; - current->thread.sig_desc = SUBSIG_PRIVINST; - send_sig(SIGILL, current, 1); + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_PRVOPC; + info.si_addr = (void *)regs->tpc; + info.si_trapno = 0; + send_sig_info(SIGILL, &info, current); } -void handle_hw_divzero(struct pt_regs *regs, unsigned long pc, - unsigned long npc, unsigned long psr) +void do_privact(struct pt_regs *regs) { - current->thread.sig_address = regs->tpc; - current->thread.sig_desc = SUBSIG_IDIVZERO; - send_sig(SIGFPE, current, 1); + do_privop(regs); } /* Trap level 1 stuff or other traps we should never see... 
*/ diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index 90d1c4e7c791..f3067cad6833 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -1,4 +1,4 @@ -# $Id: Makefile,v 1.19 1999/07/03 22:11:08 davem Exp $ +# $Id: Makefile,v 1.20 2000/01/19 04:06:03 davem Exp $ # Makefile for Sparc library files.. # @@ -6,8 +6,8 @@ CFLAGS := $(CFLAGS) OBJS = PeeCeeI.o blockops.o debuglocks.o strlen.o strncmp.o \ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \ - VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o VISsave.o \ - atomic.o rwlock.o + VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \ + VIScsumcopyusr.o VISsave.o atomic.o rwlock.o lib.a: $(OBJS) $(AR) rcs lib.a $(OBJS) diff --git a/arch/sparc64/lib/VIScsumcopy.S b/arch/sparc64/lib/VIScsumcopy.S index dbf89b4f60cc..3f89eea29e50 100644 --- a/arch/sparc64/lib/VIScsumcopy.S +++ b/arch/sparc64/lib/VIScsumcopy.S @@ -1,4 +1,4 @@ -/* $Id: VIScsumcopy.S,v 1.6 1999/05/25 16:53:03 jj Exp $ +/* $Id: VIScsumcopy.S,v 1.7 2000/01/19 04:06:03 davem Exp $ * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous * copying utilizing the UltraSparc Visual Instruction Set. * @@ -75,7 +75,7 @@ membar #Sync -#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DYMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \ +#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) 
\ LOAD /* Load Group */; \ faligndata %A14, %F0, %A14 /* FPA Group */; \ inc %x5 /* IEU0 */; \ diff --git a/arch/sparc64/lib/VIScsumcopyusr.S b/arch/sparc64/lib/VIScsumcopyusr.S new file mode 100644 index 000000000000..17bbe78b1200 --- /dev/null +++ b/arch/sparc64/lib/VIScsumcopyusr.S @@ -0,0 +1,914 @@ +/* $Id: VIScsumcopyusr.S,v 1.1 2000/01/19 04:06:04 davem Exp $ + * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous + * copying utilizing the UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 2000 David S. Miller (davem@redhat.com) + * + * Based on older sparc32/sparc64 checksum.S, which is: + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996,1997 David S. Miller + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#ifdef __sparc_v9__ +#define STACKOFF 0x7ff+128 +#else +#define STACKOFF 64 +#endif + +#ifdef __KERNEL__ +#include +#include +#include +#include +#define ASI_BLK_XOR 0 +#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P) +#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P) +#else +#define ASI_P 0x80 +#define ASI_BLK_P 0xf0 +#define FRPS_FEF 0x04 +#define FPRS_DU 0x02 +#define FPRS_DL 0x01 +#define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P) +#endif + +#define src o0 +#define dst o1 +#define len o2 +#define sum o3 +#define x1 g1 +#define x2 g2 +#define x3 o4 +#define x4 g4 +#define x5 g5 +#define x6 g7 +#define x7 g3 +#define x8 o5 + +/* Dobrou noc, SunSoft engineers. Spete sladce. + * This has a couple of tricks in and those + * tricks are UltraLinux trade secrets :)) + * Once AGAIN, the SunSoft engineers are caught + * asleep at the keyboard :)). 
+ * The main loop does about 20 superscalar cycles + * per 64bytes checksummed/copied. + */ + +#define LDBLK(O0) \ + ldda [%src] ASI_BLK_P, %O0 /* Load Group */ + +#define STBLK \ + stda %f48, [%dst] %asi /* Store */ + +#ifdef __KERNEL__ +#define STBLK_XORASI(tmpreg1,tmpreg2) \ + stda %f48, [%dst] %asi /* Store */; \ + rd %asi, %tmpreg1; \ + srl %tmpreg1, 3, %tmpreg2; \ + xor %tmpreg1, ASI_BLK_XOR1, %tmpreg1; \ + wr %tmpreg1, %tmpreg2, %asi; +#else +#define STBLK_XORASI(tmpreg1,tmpreg2) \ + stda %f48, [%dst] %asi /* Store */; \ + rd %asi, %tmpreg1; \ + wr %tmpreg1, ASI_BLK_XOR, %asi; +#endif + +#define ST(fx,off) \ + stda %fx, [%dst + off] %asi /* Store */ + +#define SYNC \ + membar #Sync + + +#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \ + LOAD /* Load Group */; \ + faligndata %A14, %F0, %A14 /* FPA Group */; \ + inc %x5 /* IEU0 */; \ + STORE1 /* Store (optional) */; \ + faligndata %F0, %F2, %A0 /* FPA Group */; \ + srl %x5, 1, %x5 /* IEU0 */; \ + add %sum, %x4, %sum /* IEU1 */; \ + fpadd32 %F0, %f0, %F0 /* FPA Group */; \ + inc %x6 /* IEU0 */; \ + STORE2 /* Store (optional) */; \ + faligndata %F2, %F4, %A2 /* FPA Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fpadd32 %F2, %f2, %F2 /* FPA Group */; \ + add %src, 64, %src /* IEU0 */; \ + add %dst, 64, %dst /* IEU1 */; \ + fcmpgt32 %f0, %F0, %x1 /* FPM Group */; \ + inc %x7 /* IEU0 */; \ + STORE3 /* Store (optional) */; \ + faligndata %F4, %F6, %A4 /* FPA */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + fpadd32 %F4, %f4, %F4 /* FPA */; \ + fcmpgt32 %f2, %F2, %x2 /* FPM Group */; \ + inc %x8 /* IEU0 */; \ + STORE4 /* Store (optional) */; \ + faligndata %F6, %F8, %A6 /* FPA */; \ + srl %x8, 1, %x8 /* IEU0 Group */; \ + add %sum, %x7, %sum /* IEU1 */; \ + fpadd32 %F6, %f6, %F6 /* FPA */; \ + fcmpgt32 
%f4, %F4, %x3 /* FPM Group */; \ + inc %x1 /* IEU0 */; \ + STORE5 /* Store (optional) */; \ + faligndata %F8, %F10, %A8 /* FPA */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + add %sum, %x8, %sum /* IEU1 */; \ + fpadd32 %F8, %f8, %F8 /* FPA */; \ + fcmpgt32 %f6, %F6, %x4 /* FPM Group */; \ + inc %x2 /* IEU0 */; \ + STORE6 /* Store (optional) */; \ + faligndata %F10, %F12, %A10 /* FPA */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + add %sum, %x1, %sum /* IEU1 */; \ + fpadd32 %F10, %f10, %F10 /* FPA */; \ + fcmpgt32 %f8, %F8, %x5 /* FPM Group */; \ + inc %x3 /* IEU0 */; \ + STORE7 /* Store (optional) */; \ + faligndata %F12, %F14, %A12 /* FPA */; \ + srl %x3, 1, %x3 /* IEU0 Group */; \ + add %sum, %x2, %sum /* IEU1 */; \ + fpadd32 %F12, %f12, %F12 /* FPA */; \ + fcmpgt32 %f10, %F10, %x6 /* FPM Group */; \ + inc %x4 /* IEU0 */; \ + STORE8 /* Store (optional) */; \ + fmovd %F14, %B14 /* FPA */; \ + srl %x4, 1, %x4 /* IEU0 Group */; \ + add %sum, %x3, %sum /* IEU1 */; \ + fpadd32 %F14, %f14, %F14 /* FPA */; \ + fcmpgt32 %f12, %F12, %x7 /* FPM Group */; \ + subcc %len, 64, %len /* IEU1 */; \ + BRANCH /* CTI */; \ + fcmpgt32 %f14, %F14, %x8 /* FPM Group */; \ + +#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \ + inc %x5 /* IEU0 Group */; \ + fpadd32 %f2, %f0, %S0 /* FPA */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + add %sum, %x4, %sum /* IEU1 */; \ + fpadd32 %f6, %f4, %S1 /* FPA */; \ + inc %x6 /* IEU0 Group */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fcmpgt32 %f0, %S0, %x1 /* FPM Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + inc %x7 /* IEU1 */; \ + fpadd32 %f10, %f8, %S2 /* FPA */; \ + fcmpgt32 %f4, %S1, %x2 /* FPM Group */; \ + srl %x7, 1, %x7 /* IEU0 */; \ + add %sum, %x6, %sum /* IEU1 */; \ + fpadd32 %f14, %f12, %S3 /* FPA */; \ + inc %x8 /* IEU0 Group */; \ + add %sum, %x7, %sum /* IEU1 */; \ + fzero %fz /* FPA */; \ + fcmpgt32 %f8, %S2, %x3 /* FPM Group */; \ + srl %x8, 1, %x8 /* IEU0 */; \ + inc %x1 /* IEU1 */; \ + fpadd32 %S0, %S1, %T0 
/* FPA */; \ + fcmpgt32 %f12, %S3, %x4 /* FPM Group */; \ + srl %x1, 1, %x1 /* IEU0 */; \ + add %sum, %x8, %sum /* IEU1 */; \ + fpadd32 %S2, %S3, %T1 /* FPA */; \ + inc %x2 /* IEU0 Group */; \ + add %sum, %x1, %sum /* IEU1 */; \ + fcmpgt32 %S0, %T0, %x5 /* FPM Group */; \ + srl %x2, 1, %x2 /* IEU0 */; \ + inc %x3 /* IEU1 */; \ + fcmpgt32 %S2, %T1, %x6 /* FPM Group */; \ + srl %x3, 1, %x3 /* IEU0 */; \ + add %sum, %x2, %sum /* IEU1 */; \ + inc %x4 /* IEU0 Group */; \ + add %sum, %x3, %sum /* IEU1 */; \ + fcmpgt32 %fz, %f2, %x7 /* FPM Group */; \ + srl %x4, 1, %x4 /* IEU0 */; \ + inc %x5 /* IEU1 */; \ + fpadd32 %T0, %T1, %U0 /* FPA */; \ + fcmpgt32 %fz, %f6, %x8 /* FPM Group */; \ + srl %x5, 1, %x5 /* IEU0 */; \ + add %sum, %x4, %sum /* IEU1 */; \ + inc %x6 /* IEU0 Group */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fcmpgt32 %fz, %f10, %x1 /* FPM Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + inc %x7 /* IEU1 */; \ + fcmpgt32 %fz, %f14, %x2 /* FPM Group */; \ + ba,pt %xcc, ett /* CTI */; \ + fmovd %FA, %FB /* FPA */; \ + +#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ + END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62) + +#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ + fpadd32 %U0, %U1, %V0 /* FPA Group */; \ + srl %x7, 1, %x7 /* IEU0 */; \ + add %sum, %x6, %sum /* IEU1 */; \ + std %V0, [%sp + STACKOFF] /* Store Group */; \ + inc %x8 /* IEU0 */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + fcmpgt32 %fz, %S1, %x3 /* FPM Group */; \ + srl %x8, 1, %x8 /* IEU0 */; \ + inc %x1 /* IEU1 */; \ + fcmpgt32 %fz, %S3, %x4 /* FPM Group */; \ + srl %x1, 1, %x1 /* IEU0 */; \ + sub %sum, %x8, %sum /* IEU1 */; \ + ldx [%sp + STACKOFF], %x8 /* Load Group */; \ + inc %x2 /* IEU0 */; \ + sub %sum, %x1, %sum /* IEU1 */; \ + fcmpgt32 %fz, %T1, %x5 /* FPM Group */; \ + srl %x2, 1, %x2 /* IEU0 */; \ + inc %x3 /* IEU1 */; \ + fcmpgt32 %T0, %U0, %x6 /* FPM Group */; \ + srl %x3, 1, %x3 /* IEU0 */; \ + sub %sum, %x2, %sum /* IEU1 */; \ + inc %x4 /* 
IEU0 Group */; \ + sub %sum, %x3, %sum /* IEU1 */; \ + fcmpgt32 %fz, %U1, %x7 /* FPM Group */; \ + srl %x4, 1, %x4 /* IEU0 */; \ + inc %x5 /* IEU1 */; \ + fcmpgt32 %U0, %V0, %x1 /* FPM Group */; \ + srl %x5, 1, %x5 /* IEU0 */; \ + sub %sum, %x4, %sum /* IEU1 */; \ + fcmpgt32 %fz, %V0, %x2 /* FPM Group */; \ + inc %x6 /* IEU0 */; \ + sub %sum, %x5, %sum /* IEU1 */; \ + srl %x6, 1, %x6 /* IEU0 Group */; \ + inc %x7 /* IEU1 */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + inc %x1 /* IEU0 Group */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + inc %x2 /* IEU1 */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + add %sum, %x1, %sum /* IEU1 */; \ + sub %sum, %x2, %sum /* IEU0 Group */; \ + addcc %sum, %x8, %sum /* IEU Group */; \ + bcs,a,pn %xcc, 33f /* CTI */; \ + add %sum, 1, %sum /* IEU0 */; \ +33: /* That's it */; + + .text + .globl csum_partial_copy_user_vis + .align 32 +/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. 
csum_partial_copy_from_user */ +/* This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 */ +csum_partial_copy_user_vis: + andcc %dst, 7, %g0 /* IEU1 Group */ + be,pt %icc, 4f /* CTI */ + and %dst, 0x38, %o4 /* IEU0 */ + mov 1, %g5 /* IEU0 Group */ + andcc %dst, 2, %g0 /* IEU1 */ + be,pt %icc, 1f /* CTI */ + and %dst, 4, %g7 /* IEU0 Group */ + lduh [%src], %g2 /* Load */ + sub %len, 2, %len /* IEU0 Group */ + add %dst, 2, %dst /* IEU1 */ + andcc %dst, 4, %g7 /* IEU1 Group */ + sll %g5, 16, %g5 /* IEU0 */ + stha %g2, [%dst - 2] %asi /* Store Group */ + sll %g2, 16, %g2 /* IEU0 */ + add %src, 2, %src /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, %g5, %sum /* IEU0 */ +1: lduw [%src], %g2 /* Load */ + brz,a,pn %g7, 4f /* CTI+IEU1 Group */ + and %dst, 0x38, %o4 /* IEU0 */ + add %dst, 4, %dst /* IEU0 Group */ + sub %len, 4, %len /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: and %dst, 0x38, %o4 /* IEU0 Group */ + stwa %g2, [%dst - 4] %asi /* Store */ + add %src, 4, %src /* IEU1 */ +4: +#ifdef __KERNEL__ + VISEntry +#endif + mov %src, %g7 /* IEU1 Group */ + fzero %f48 /* FPA */ + alignaddr %src, %g0, %src /* Single Group */ + subcc %g7, %src, %g7 /* IEU1 Group */ + be,pt %xcc, 1f /* CTI */ + mov 0x40, %g1 /* IEU0 */ + lduw [%src], %g2 /* Load Group */ + subcc %sum, %g2, %sum /* IEU1 Group+load stall */ + bcs,a,pn %icc, 1f /* CTI */ + sub %sum, 1, %sum /* IEU0 */ +1: srl %sum, 0, %sum /* IEU0 Group */ + clr %g5 /* IEU1 */ + brz,pn %o4, 3f /* CTI+IEU1 Group */ + sub %g1, %o4, %g1 /* IEU0 */ + ldd [%src], %f0 /* Load */ + clr %o4 /* IEU0 Group */ + andcc %dst, 8, %g0 /* IEU1 */ + be,pn %icc, 1f /* CTI */ + ldd [%src + 8], %f2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + sub %len, 8, %len /* IEU1 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + addcc %dst, 8, %dst /* IEU1 Group */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o4 /* 
FPM Group */ + fmovd %f2, %f0 /* FPA Group */ + ldd [%src + 8], %f2 /* Load */ + stda %f16, [%dst - 8] %asi /* Store */ + fmovd %f50, %f48 /* FPA */ +1: andcc %g1, 0x10, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + and %g1, 0x20, %g1 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldd [%src + 16], %f4 /* Load Group */ + add %src, 16, %src /* IEU0 */ + add %dst, 16, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + sub %len, 16, %len /* IEU0 */ + inc %o4 /* IEU1 */ + stda %f16, [%dst - 16] %asi /* Store Group */ + fpadd32 %f2, %f50, %f48 /* FPA */ + srl %o4, 1, %o5 /* IEU0 */ + faligndata %f2, %f4, %f18 /* FPA Group */ + stda %f18, [%dst - 8] %asi /* Store */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + ldd [%src + 8], %f2 /* Load */ + fmovd %f4, %f0 /* FPA */ +1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ + rd %asi, %g2 /* LSU Group + 4 bubbles */ + inc %g5 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldd [%src + 16], %f4 /* Load Group */ + srl %g5, 1, %g5 /* IEU0 */ + add %dst, 32, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o5 /* FPM Group */ + inc %o4 /* IEU0 */ + ldd [%src + 24], %f6 /* Load */ + srl %o4, 1, %o4 /* IEU0 Group */ + add %g5, %sum, %sum /* IEU1 */ + ldd [%src + 32], %f8 /* Load */ + fpadd32 %f2, %f50, %f48 /* FPA */ + faligndata %f2, %f4, %f18 /* FPA Group */ + sub %len, 32, %len /* IEU0 */ + stda %f16, [%dst - 32] %asi /* Store */ + fcmpgt32 %f50, %f48, %g3 /* FPM Group */ + inc %o5 /* IEU0 */ + add %o4, %sum, %sum /* IEU1 */ + fpadd32 %f4, %f48, %f50 /* FPA */ + faligndata %f4, %f6, %f20 /* FPA Group */ + srl %o5, 1, %o5 /* IEU0 */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + stda %f18, [%dst - 24] %asi /* Store */ + fpadd32 %f6, %f50, %f48 /* FPA */ + inc %g3 /* IEU0 Group */ + stda %f20, [%dst - 16] %asi /* Store */ + add %src, 32, %src /* IEU1 */ + faligndata %f6, %f8, %f22 /* FPA */ + 
fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + srl %g3, 1, %g3 /* IEU0 */ + stda %f22, [%dst - 8] %asi /* Store */ + add %g3, %sum, %sum /* IEU0 Group */ +3: rd %asi, %g2 /* LSU Group + 4 bubbles */ +#ifdef __KERNEL__ +4: sethi %hi(vis0s), %g7 /* IEU0 Group */ + or %g2, ASI_BLK_OR, %g2 /* IEU1 */ +#else +4: rd %pc, %g7 /* LSU Group + 4 bubbles */ +#endif + inc %g5 /* IEU0 Group */ + and %src, 0x38, %g3 /* IEU1 */ + membar #StoreLoad /* LSU Group */ + srl %g5, 1, %g5 /* IEU0 */ + inc %o4 /* IEU1 */ + sll %g3, 8, %g3 /* IEU0 Group */ + sub %len, 0xc0, %len /* IEU1 */ + addcc %g5, %sum, %sum /* IEU1 Group */ + srl %o4, 1, %o4 /* IEU0 */ + add %g7, %g3, %g7 /* IEU0 Group */ + add %o4, %sum, %sum /* IEU1 */ +#ifdef __KERNEL__ + jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ +#else + jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ +#endif + fzero %f32 /* FPA */ + + .align 2048 +vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f62 /* FPA Group f0 available */ + faligndata %f0, %f2, %f48 /* FPA Group f2 available */ + fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available */ + fpadd32 %f0, %f62, %f0 /* FPA */ + fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available */ + faligndata %f2, %f4, %f50 /* FPA */ + fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available */ + faligndata %f4, %f6, %f52 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available */ + inc %x1 /* IEU0 */ + faligndata %f6, %f8, %f54 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available */ + srl %x1, 1, %x1 /* IEU0 */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f56 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available */ + srl %x2, 1, %x2 /* IEU0 */ + add %sum, %x1, %sum /* IEU1 */ + faligndata %f10, %f12, %f58 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ 
+ faligndata %f12, %f14, %f60 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f62 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ +vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f48,f50,f52,f54,f56,f58,f60,f62,f62, + ,LDBLK(f32), STBLK,,,,,,,, + ,bcs,pn %icc, vis0e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f48,f50,f52,f54,f56,f58,f60,f62,f62, + ,LDBLK(f0), STBLK,,,,,,,, + ,bcs,pn %icc, vis0e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f48,f50,f52,f54,f56,f58,f60,f62,f62, + ,LDBLK(f16), STBLK,,,,,,,, + ,bcc,pt %icc, vis0) +vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f48,f50,f52,f54,f56,f58,f60,f62,f32, + ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), + ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2) +vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f48,f50,f52,f54,f56,f58,f60,f62,f0, + ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), + ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3) +vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f48,f50,f52,f54,f56,f58,f60,f62,f16, + ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), + ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1) + .align 2048 +vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 8, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f58 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + fcmpgt32 %f32, %f2, %x2 /* 
FPM Group */ + faligndata %f2, %f4, %f48 /* FPA */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f50 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f52 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f54 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + srl %x2, 1, %x2 /* IEU0 */ + faligndata %f10, %f12, %f56 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ + faligndata %f12, %f14, %f58 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f60 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ +vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f62,f48,f50,f52,f54,f56,f58,f60,f60, + ,LDBLK(f32), ,STBLK,,,,,,, + ,bcs,pn %icc, vis1e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f62,f48,f50,f52,f54,f56,f58,f60,f60, + ,LDBLK(f0), ,STBLK,,,,,,, + ,bcs,pn %icc, vis1e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f62,f48,f50,f52,f54,f56,f58,f60,f60, + ,LDBLK(f16), ,STBLK,,,,,,, + ,bcc,pt %icc, vis1) +vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f62,f48,f50,f52,f54,f56,f58,f60,f32, + ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), + ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2) +vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f62,f48,f50,f52,f54,f56,f58,f60,f0, + ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), + ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3) +vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f62,f48,f50,f52,f54,f56,f58,f60,f16, + ,SYNC, 
,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), + ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1) + .align 2048 +vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 16, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f56 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f48 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f50 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f52 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f54 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + faligndata %f12, %f14, %f56 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f58 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ +vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f60,f62,f48,f50,f52,f54,f56,f58,f58, + ,LDBLK(f32), ,,STBLK,,,,,, + ,bcs,pn %icc, vis2e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f60,f62,f48,f50,f52,f54,f56,f58,f58, + ,LDBLK(f0), ,,STBLK,,,,,, + ,bcs,pn %icc, vis2e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f60,f62,f48,f50,f52,f54,f56,f58,f58, + ,LDBLK(f16), ,,STBLK,,,,,, + ,bcc,pt %icc, vis2) +vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f60,f62,f48,f50,f52,f54,f56,f58,f32, + ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), + ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2) +vis2e1: DO_THE_TRICK( 
f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f60,f62,f48,f50,f52,f54,f56,f58,f0, + ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), + ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3) +vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f60,f62,f48,f50,f52,f54,f56,f58,f16, + ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), + ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1) + .align 2048 +vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 24, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f54 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f48 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f50 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f52 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f54 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f56 /* FPA */ + inc %x4 /* IEU0 */ + srl %x4, 1, %x4 /* IEU0 Group */ +vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f58,f60,f62,f48,f50,f52,f54,f56,f56, + ,LDBLK(f32), ,,,STBLK,,,,, + ,bcs,pn %icc, vis3e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f58,f60,f62,f48,f50,f52,f54,f56,f56, + ,LDBLK(f0), ,,,STBLK,,,,, + ,bcs,pn %icc, vis3e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f58,f60,f62,f48,f50,f52,f54,f56,f56, + ,LDBLK(f16), ,,,STBLK,,,,, + ,bcc,pt %icc, vis3) +vis3e3: DO_THE_TRICK( 
f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f58,f60,f62,f48,f50,f52,f54,f56,f32, + ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), + ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2) +vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f58,f60,f62,f48,f50,f52,f54,f56,f0, + ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), + ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3) +vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f58,f60,f62,f48,f50,f52,f54,f56,f16, + ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), + ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1) + .align 2048 +vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 32, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f52 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fpsub32 %f6, %f6, %f6 /* FPA Group */ + clr %x4 /* IEU0 */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f48 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f50 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f52 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f54 /* FPA */ +vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f56,f58,f60,f62,f48,f50,f52,f54,f54, + ,LDBLK(f32), ,,,,STBLK,,,, + ,bcs,pn %icc, vis4e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f56,f58,f60,f62,f48,f50,f52,f54,f54, + ,LDBLK(f0), ,,,,STBLK,,,, + ,bcs,pn %icc, vis4e2) + DO_THE_TRICK( 
f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f56,f58,f60,f62,f48,f50,f52,f54,f54, + ,LDBLK(f16), ,,,,STBLK,,,, + ,bcc,pt %icc, vis4) +vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f56,f58,f60,f62,f48,f50,f52,f54,f32, + ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), + ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2) +vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f56,f58,f60,f62,f48,f50,f52,f54,f0, + ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), + ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3) +vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f56,f58,f60,f62,f48,f50,f52,f54,f16, + ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), + ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1) + .align 2048 +vis5s: ldd [%src+0], %f10 /* Load Group */ + ldd [%src+8], %f12 /* Load Group */ + ldd [%src+16], %f14 /* Load Group */ + add %src, 24, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + faligndata %f10, %f12, %f48 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f50 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f52 /* FPA */ +vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f54,f56,f58,f60,f62,f48,f50,f52,f52, + ,LDBLK(f32), ,,,,,STBLK,,, + ,bcs,pn %icc, vis5e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + 
,f54,f56,f58,f60,f62,f48,f50,f52,f52, + ,LDBLK(f0), ,,,,,STBLK,,, + ,bcs,pn %icc, vis5e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f54,f56,f58,f60,f62,f48,f50,f52,f52, + ,LDBLK(f16), ,,,,,STBLK,,, + ,bcc,pt %icc, vis5) +vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f54,f56,f58,f60,f62,f48,f50,f52,f32, + ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), + ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2) +vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f54,f56,f58,f60,f62,f48,f50,f52,f0, + ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), + ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3) +vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f54,f56,f58,f60,f62,f48,f50,f52,f16, + ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), + ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1) + .align 2048 +vis6s: ldd [%src+0], %f12 /* Load Group */ + ldd [%src+8], %f14 /* Load Group */ + add %src, 16, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + faligndata %f12, %f14, %f48 /* FPA */ + fmovd %f14, %f50 /* FPA Group */ +vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f52,f54,f56,f58,f60,f62,f48,f50,f50, + ,LDBLK(f32), ,,,,,,STBLK,, + ,bcs,pn %icc, vis6e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + 
,f52,f54,f56,f58,f60,f62,f48,f50,f50, + ,LDBLK(f0), ,,,,,,STBLK,, + ,bcs,pn %icc, vis6e2) + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f52,f54,f56,f58,f60,f62,f48,f50,f50, + ,LDBLK(f16), ,,,,,,STBLK,, + ,bcc,pt %icc, vis6) +vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f52,f54,f56,f58,f60,f62,f48,f50,f32, + ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), + ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2) +vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f52,f54,f56,f58,f60,f62,f48,f50,f0, + ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), + ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3) +vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f52,f54,f56,f58,f60,f62,f48,f50,f16, + ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), + ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1) + .align 2048 +vis7s: ldd [%src+0], %f14 /* Load Group */ + add %src, 8, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + faddd %f32, %f32, %f12 /* FPA Group */ + clr %x7 /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fmovd %f14, %f48 /* FPA */ +vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f50,f52,f54,f56,f58,f60,f62,f48,f48, + ,LDBLK(f32), ,,,,,,,STBLK, + ,bcs,pn %icc, vis7e1) + DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f50,f52,f54,f56,f58,f60,f62,f48,f48, + ,LDBLK(f0), ,,,,,,,STBLK, + ,bcs,pn %icc, vis7e2) + 
DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f50,f52,f54,f56,f58,f60,f62,f48,f48, + ,LDBLK(f16), ,,,,,,,STBLK, + ,bcc,pt %icc, vis7) +vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, + ,f50,f52,f54,f56,f58,f60,f62,f48,f32, + ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), + ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2) +vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, + ,f50,f52,f54,f56,f58,f60,f62,f48,f0, + ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), + ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3) +vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + ,f50,f52,f54,f56,f58,f60,f62,f48,f16, + ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8), + ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1) +e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6) +e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6) +e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6) +ett: rd %gsr, %x3 /* LSU Group+4bubbles */ + andcc %x3, 7, %x3 /* IEU1 Group */ + add %dst, 8, %dst /* IEU0 */ + bne,pn %icc, 1f /* CTI */ + fzero %f10 /* FPA */ + brz,a,pn %len, 2f /* CTI+IEU1 Group */ + stda %f6, [%dst - 8] %asi /* Store */ +1: cmp %len, 8 /* IEU1 */ + blu,pn %icc, 3f /* CTI */ + sub %src, 64, %src /* IEU0 Group */ +1: ldd [%src], %f2 /* Load Group */ + fpadd32 %f10, %f2, %f12 /* FPA Group+load stall */ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + faligndata %f6, %f2, %f14 /* FPA Group */ + fcmpgt32 %f10, %f12, %x5 /* FPM Group */ + stda %f14, [%dst - 16] %asi /* Store */ + fmovd %f2, %f6 /* FPA */ + fmovd %f12, %f10 /* FPA Group */ + sub %len, 8, %len /* IEU1 */ + fzero %f16 /* FPA Group - FPU nop */ + fzero %f18 /* FPA Group - FPU nop */ + inc %x5 /* IEU0 */ + srl %x5, 1, %x5 /* IEU0 Group (regdep) */ + cmp %len, 8 /* IEU1 */ + bgeu,pt %icc, 1b /* CTI */ + add %x5, %sum, %sum /* IEU0 Group */ +3: brz,a,pt %x3, 2f /* 
CTI+IEU1 */ + stda %f6, [%dst - 8] %asi /* Store Group */ + sta %f7, [%dst - 8] %asi /* Store Group */ + sub %dst, 4, %dst /* IEU0 */ + add %len, 4, %len /* IEU1 */ +2: +#ifdef __KERNEL__ + sub %sp, 8, %sp /* IEU0 Group */ +#endif + END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) + membar #Sync /* LSU Group */ +#ifdef __KERNEL__ + VISExit + add %sp, 8, %sp /* IEU0 Group */ +#endif +23: brnz,pn %len, 26f /* CTI+IEU1 Group */ +24: sllx %sum, 32, %g1 /* IEU0 */ +25: addcc %sum, %g1, %src /* IEU1 Group */ + srlx %src, 32, %src /* IEU0 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %src, 1, %src /* IEU1 */ +#ifndef __KERNEL__ +1: retl /* CTI Group brk forced */ + srl %src, 0, %src /* IEU0 */ +#else +1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */ + retl /* CTI Group brk forced */ + sllx %g4, 32, %g4 /* IEU0 */ +#endif +26: andcc %len, 8, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + lduw [%src], %o4 /* Load */ + lduw [%src+4], %g2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + sllx %o4, 32, %g5 /* IEU0 Group */ + stwa %o4, [%dst - 8] %asi /* Store */ + or %g5, %g2, %g5 /* IEU0 Group */ + stwa %g2, [%dst - 4] %asi /* Store */ + addcc %g5, %sum, %sum /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: andcc %len, 4, %g0 /* IEU1 Group */ + be,a,pn %icc, 1f /* CTI */ + clr %g2 /* IEU0 */ + lduw [%src], %g7 /* Load */ + add %src, 4, %src /* IEU0 Group */ + add %dst, 4, %dst /* IEU1 */ + sllx %g7, 32, %g2 /* IEU0 Group */ + stwa %g7, [%dst - 4] %asi /* Store */ +1: andcc %len, 2, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %g3 /* IEU0 Group */ + lduh [%src], %g7 /* Load */ + add %src, 2, %src /* IEU1 */ + add %dst, 2, %dst /* IEU0 Group */ + sll %g7, 16, %g3 /* IEU0 Group */ + stha %g7, [%dst - 2] %asi /* Store */ +1: andcc %len, 1, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o5 /* IEU0 Group */ + ldub [%src], %g7 /* Load */ + sll %g7, 8, %o5 /* IEU0 Group */ + stba %g7, [%dst] 
%asi /* Store */ +1: or %g2, %g3, %g3 /* IEU1 */ + or %o5, %g3, %g3 /* IEU0 Group (regdep) */ + addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: ba,pt %xcc, 25b /* CTI Group */ + sllx %sum, 32, %g1 /* IEU0 */ + +#ifdef __KERNEL__ +end: + + .section __ex_table + .align 4 + .word csum_partial_copy_user_vis, 0, end, cpc_handler +#endif diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S index 07d10ba428b4..4e962ed473a7 100644 --- a/arch/sparc64/lib/checksum.S +++ b/arch/sparc64/lib/checksum.S @@ -2,7 +2,7 @@ * * Copyright(C) 1995 Linus Torvalds * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996 David S. Miller + * Copyright(C) 1996, 2000 David S. Miller * Copyright(C) 1997 Jakub Jelinek * * derived from: @@ -263,6 +263,238 @@ ccslow: mov 0, %g5 srl %o0, 0, %o0 cpc_end: + /* Now the version with userspace as the destination */ +#define CSUMCOPY_LASTCHUNK_USER(off, t0, t1) \ + ldx [%src - off - 0x08], t0; \ + ldx [%src - off - 0x00], t1; \ + nop; nop; \ + addcc t0, %sum, %sum; \ + stwa t0, [%dst - off - 0x04] %asi; \ + srlx t0, 32, t0; \ + bcc,pt %xcc, 51f; \ + stwa t0, [%dst - off - 0x08] %asi; \ + add %sum, 1, %sum; \ +51: addcc t1, %sum, %sum; \ + stwa t1, [%dst - off + 0x04] %asi; \ + srlx t1, 32, t1; \ + bcc,pt %xcc, 52f; \ + stwa t1, [%dst - off - 0x00] %asi; \ + add %sum, 1, %sum; \ +52: + +cpc_user_start: +cc_user_end_cruft: + andcc %g7, 8, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + and %g7, 4, %g5 ! IEU0 + ldx [%src + 0x00], %g2 ! Load Group + add %dst, 8, %dst ! IEU0 + add %src, 8, %src ! IEU1 + addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles + stwa %g2, [%dst - 0x04] %asi ! Store + srlx %g2, 32, %g2 ! IEU0 + bcc,pt %xcc, 1f ! CTI Group + stwa %g2, [%dst - 0x08] %asi ! Store + add %sum, 1, %sum ! IEU0 +1: brz,pt %g5, 1f ! CTI Group + clr %g2 ! IEU0 + lduw [%src + 0x00], %g2 ! Load + add %dst, 4, %dst ! IEU0 Group + add %src, 4, %src ! IEU1 + stwa %g2, [%dst - 0x04] %asi ! 
Store Group + 2 bubbles + sllx %g2, 32, %g2 ! IEU0 +1: andcc %g7, 2, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o4 ! IEU1 + lduh [%src + 0x00], %o4 ! Load + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + stha %o4, [%dst - 0x2] %asi ! Store Group + 2 bubbles + sll %o4, 16, %o4 ! IEU0 +1: andcc %g7, 1, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o5 ! IEU0 + ldub [%src + 0x00], %o5 ! Load + stba %o5, [%dst + 0x00] %asi ! Store Group + 2 bubbles + sll %o5, 8, %o5 ! IEU0 +1: or %g2, %o4, %o4 ! IEU1 + or %o5, %o4, %o4 ! IEU0 Group + addcc %o4, %sum, %sum ! IEU1 + bcc,pt %xcc, ccuserfold ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group + b,pt %xcc, ccuserfold ! CTI + add %sum, 1, %sum ! IEU1 + +cc_user_fixit: + cmp %len, 6 ! IEU1 Group + bl,a,pn %icc, ccuserte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + andcc %src, 2, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + andcc %src, 0x4, %g0 ! IEU1 Group + lduh [%src + 0x00], %g4 ! Load + sub %len, 2, %len ! IEU0 + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + sll %g4, 16, %g3 ! IEU0 Group + 1 bubble + addcc %g3, %sum, %sum ! IEU1 + bcc,pt %xcc, 0f ! CTI + srl %sum, 16, %g3 ! IEU0 Group + add %g3, 1, %g3 ! IEU0 4 clocks (mispredict) +0: andcc %src, 0x4, %g0 ! IEU1 Group + stha %g4, [%dst - 0x2] %asi ! Store + sll %sum, 16, %sum ! IEU0 + sll %g3, 16, %g3 ! IEU0 Group + srl %sum, 16, %sum ! IEU0 Group + or %g3, %sum, %sum ! IEU0 Group (regdep) +1: be,pt %icc, ccusermerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 + lduw [%src + 0x00], %g4 ! Load Group + sub %len, 4, %len ! IEU0 + add %src, 4, %src ! IEU1 + add %dst, 4, %dst ! IEU0 Group + addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble + stwa %g4, [%dst - 0x4] %asi ! Store + bcc,pt %xcc, ccusermerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group + b,pt %xcc, ccusermerge ! CTI 4 clocks (mispredict) + add %sum, 1, %sum ! 
IEU0 + + .align 32 + .globl csum_partial_copy_user_sparc64 +csum_partial_copy_user_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */ + xorcc %src, %dst, %o4 ! IEU1 Group + srl %sum, 0, %sum ! IEU0 + andcc %o4, 3, %g0 ! IEU1 Group + srl %len, 0, %len ! IEU0 + bne,pn %icc, ccuserslow ! CTI + andcc %src, 1, %g0 ! IEU1 Group + bne,pn %icc, ccuserslow ! CTI + cmp %len, 256 ! IEU1 Group + bgeu,pt %icc, csum_partial_copy_user_vis ! CTI + andcc %src, 7, %g0 ! IEU1 Group + bne,pn %icc, cc_user_fixit ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group +ccusermerge: + be,pn %icc, ccuserte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + sll %g1, 2, %o4 ! IEU0 +13: sethi %hi(12f), %o5 ! IEU0 Group + add %src, %g1, %src ! IEU1 + sub %o5, %o4, %o5 ! IEU0 Group + jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced + add %dst, %g1, %dst ! IEU0 Group +ccusertbl: + CSUMCOPY_LASTCHUNK_USER(0xe8,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0xd8,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0xc8,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0xb8,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0xa8,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x98,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x88,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x78,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x68,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x58,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x48,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x38,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x28,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x18,%g2,%g3) + CSUMCOPY_LASTCHUNK_USER(0x08,%g2,%g3) +12: + andcc %len, 0xf, %g7 ! IEU1 Group +ccuserte: + bne,pn %icc, cc_user_end_cruft ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 +ccuserfold: + sllx %sum, 32, %o0 ! IEU0 Group + addcc %sum, %o0, %o0 ! IEU1 Group (regdep) + srlx %o0, 32, %o0 ! IEU0 Group (regdep) + bcs,a,pn %xcc, 1f ! CTI + add %o0, 1, %o0 ! IEU1 4 clocks (mispredict) +1: retl ! CTI Group brk forced + sllx %g4, 32, %g4 ! 
IEU0 Group + +ccuserslow: + mov 0, %g5 + brlez,pn %len, 4f + andcc %src, 1, %o5 + be,a,pt %icc, 1f + srl %len, 1, %g7 + sub %len, 1, %len + ldub [%src], %g5 + add %src, 1, %src + stba %g5, [%dst] %asi + srl %len, 1, %g7 + add %dst, 1, %dst +1: brz,a,pn %g7, 3f + andcc %len, 1, %g0 + andcc %src, 2, %g0 + be,a,pt %icc, 1f + srl %g7, 1, %g7 + lduh [%src], %o4 + sub %len, 2, %len + srl %o4, 8, %g2 + sub %g7, 1, %g7 + stba %g2, [%dst] %asi + add %o4, %g5, %g5 + stba %o4, [%dst + 1] %asi + add %src, 2, %src + srl %g7, 1, %g7 + add %dst, 2, %dst +1: brz,a,pn %g7, 2f + andcc %len, 2, %g0 + lduw [%src], %o4 +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + stba %g2, [%dst] %asi + srl %o4, 8, %g2 + stba %g3, [%dst + 1] %asi + add %src, 4, %src + stba %g2, [%dst + 2] %asi + addcc %o4, %g5, %g5 + stba %o4, [%dst + 3] %asi + addc %g5, %g0, %g5 + add %dst, 4, %dst + subcc %g7, 1, %g7 + bne,a,pt %icc, 5b + lduw [%src], %o4 + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %len, 2, %g0 + add %g2, %g5, %g5 +2: be,a,pt %icc, 3f + andcc %len, 1, %g0 + lduh [%src], %o4 + andcc %len, 1, %g0 + srl %o4, 8, %g2 + add %src, 2, %src + stba %g2, [%dst] %asi + add %g5, %o4, %g5 + stba %o4, [%dst + 1] %asi + add %dst, 2, %dst +3: be,a,pt %icc, 1f + sll %g5, 16, %o4 + ldub [%src], %g2 + sll %g2, 8, %o4 + stba %g2, [%dst] %asi + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addc %g0, %o4, %g5 + brz,pt %o5, 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %sum, %g5, %sum + addc %g0, %sum, %o0 + retl + srl %o0, 0, %o0 +cpc_user_end: + .globl cpc_handler cpc_handler: ldx [%sp + 0x7ff + 128], %g1 @@ -277,5 +509,5 @@ cpc_handler: .section __ex_table .align 4 - .word cpc_start, 0, cpc_end, cpc_handler - + .word cpc_start, 0, cpc_end, cpc_handler + .word cpc_user_start, 0, cpc_user_end, cpc_handler diff --git a/arch/sparc64/mm/asyncd.c b/arch/sparc64/mm/asyncd.c index f23a04edea35..b87efd590221 
100644 --- a/arch/sparc64/mm/asyncd.c +++ b/arch/sparc64/mm/asyncd.c @@ -1,4 +1,4 @@ -/* $Id: asyncd.c,v 1.11 2000/01/08 20:22:19 davem Exp $ +/* $Id: asyncd.c,v 1.12 2000/01/21 11:39:13 jj Exp $ * The asyncd kernel daemon. This handles paging on behalf of * processes that receive page faults due to remote (async) memory * accesses. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include /* for cli()/sti() */ @@ -113,6 +114,7 @@ static int fault_in_page(int taskid, { static unsigned last_address; static int last_task, loop_counter; + siginfo_t info; #warning Need some fixing here... -DaveM struct task_struct *tsk = current /* XXX task[taskid] */; pgd_t *pgd; @@ -181,9 +183,12 @@ no_memory: bad_area: stats.failure++; - tsk->thread.sig_address = address; - tsk->thread.sig_desc = SUBSIG_NOMAPPING; - send_sig(SIGSEGV, tsk, 1); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void *)address; + info.si_trapno = 0; + send_sig_info(SIGSEGV, &info, tsk); return 1; } diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 1835b874fb3c..226246980b74 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -1,4 +1,4 @@ -/* $Id: fault.c,v 1.40 1999/12/01 10:44:53 davem Exp $ +/* $Id: fault.c,v 1.42 2000/01/21 11:39:13 jj Exp $ * arch/sparc64/mm/fault.c: Page fault handlers for the 64-bit Sparc. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) @@ -149,10 +149,13 @@ asmlinkage void do_sparc64_fault(struct pt_regs *regs, unsigned long address, in struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned int insn = 0; + siginfo_t info; #ifdef DEBUG_LOCKUPS static unsigned long lastaddr, lastpc; static int lastwrite, lockcnt; #endif + + info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. 
@@ -233,6 +236,7 @@ asmlinkage void do_sparc64_fault(struct pt_regs *regs, unsigned long address, in * we can handle it.. */ good_area: + info.si_code = SEGV_ACCERR; if(write) { if(!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -242,8 +246,14 @@ good_area: goto bad_area; } current->mm->segments = (void *) (address & PAGE_SIZE); - if (!handle_mm_fault(current, vma, address, write)) - goto do_sigbus; + { + int fault = handle_mm_fault(current, vma, address, write); + + if (fault < 0) + goto out_of_memory; + if (!fault) + goto do_sigbus; + } up(&mm->mmap_sem); return; /* @@ -324,20 +334,45 @@ do_kernel_fault: while(1) barrier(); #endif - current->thread.sig_address = address; - current->thread.sig_desc = SUBSIG_NOMAPPING; - force_sig(SIGSEGV, current); + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code set above to make clear whether + this was a SEGV_MAPERR or SEGV_ACCERR fault. */ + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGSEGV, &info, current); return; } unhandled_fault (address, current, regs); } return; +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up(&mm->mmap_sem); + printk("VM: killing process %s\n", current->comm); + if (!(regs->tstate & TSTATE_PRIV)) + do_exit(SIGKILL); + goto do_kernel_fault; + do_sigbus: up(&mm->mmap_sem); - current->thread.sig_address = address; - current->thread.sig_desc = SUBSIG_MISCERROR; - force_sig(SIGBUS, current); + + /* + * Send a sigbus, regardless of whether we were in kernel + * or user mode. + */ + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void *)address; + info.si_trapno = 0; + force_sig_info (SIGBUS, &info, current); + + /* Kernel mode? 
Handle exceptions or die */ if (regs->tstate & TSTATE_PRIV) goto do_kernel_fault; } diff --git a/drivers/char/Config.in b/drivers/char/Config.in index 140cc583dc3f..1bffec23dc61 100644 --- a/drivers/char/Config.in +++ b/drivers/char/Config.in @@ -115,6 +115,7 @@ if [ "$CONFIG_WATCHDOG" != "n" ]; then tristate ' Software Watchdog' CONFIG_SOFT_WATCHDOG tristate ' Berkshire Products PC Watchdog' CONFIG_PCWATCHDOG tristate ' Acquire SBC Watchdog Timer' CONFIG_ACQUIRE_WDT + tristate ' Mixcom Watchdog' CONFIG_MIXCOMWD fi endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 9b67e5a90fff..bebb7882cb77 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -315,6 +315,14 @@ else endif endif +ifeq ($(CONFIG_MIXCOMWD),y) +O_OBJS += mixcomwd.o +else + ifeq ($(CONFIG_MIXCOMWD),m) + M_OBJS += mixcomwd.o + endif +endif + ifeq ($(CONFIG_AMIGAMOUSE),y) O_OBJS += amigamouse.o else diff --git a/drivers/char/console.c b/drivers/char/console.c index 090f1066cdc9..1333871456da 100644 --- a/drivers/char/console.c +++ b/drivers/char/console.c @@ -2847,6 +2847,7 @@ EXPORT_SYMBOL(default_blu); EXPORT_SYMBOL(video_font_height); EXPORT_SYMBOL(video_scan_lines); EXPORT_SYMBOL(vc_resize); +EXPORT_SYMBOL(fg_console); #ifndef VT_SINGLE_DRIVER EXPORT_SYMBOL(take_over_console); diff --git a/drivers/char/mixcomwd.c b/drivers/char/mixcomwd.c new file mode 100644 index 000000000000..1bb5a5b7f3e2 --- /dev/null +++ b/drivers/char/mixcomwd.c @@ -0,0 +1,250 @@ +/* + * MixCom Watchdog: A Simple Hardware Watchdog Device + * Based on Softdog driver by Alan Cox and PC Watchdog driver by Ken Hollis + * + * Author: Gergely Madarasz + * + * Copyright (c) 1999 ITConsult-Pro Co. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Version 0.1 (99/04/15):
+ *		- first version
+ *
+ * Version 0.2 (99/06/16):
+ *		- added kernel timer watchdog ping after close
+ *		  since the hardware does not support watchdog shutdown
+ *
+ * Version 0.3 (99/06/21):
+ *		- added WDIOC_GETSTATUS and WDIOC_GETSUPPORT ioctl calls
+ *
+ * Version 0.3.1 (99/06/22):
+ *		- allow module removal while internal timer is active,
+ *		  print warning about probable reset
+ *
+ */
+
+#define VERSION "0.3.1"
+
+#include	/* NOTE(review): header names are missing in this copy of the patch - restore from upstream */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static int mixcomwd_ioports[] = { 0x180, 0x280, 0x380, 0x000 };	/* base ports to probe; 0 ends the list */
+
+#define MIXCOM_WATCHDOG_OFFSET 0xc10
+#define MIXCOM_ID1 0x11
+#define MIXCOM_ID2 0x13
+
+static unsigned long mixcomwd_opened;	/* bit 0 set while open; the bitops work on a full long */
+static int mixcomwd_port;		/* base port of the detected card, 0 until one is found */
+
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+static int mixcomwd_timer_alive;
+static struct timer_list mixcomwd_timer;
+#endif
+/* Ping the watchdog by writing to its I/O register. */
+static void mixcomwd_ping(void)
+{
+	outb_p(55,mixcomwd_port+MIXCOM_WATCHDOG_OFFSET);
+	return;
+}
+/* Timer callback: ping the card and re-arm the timer 5 seconds ahead. */
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+static void mixcomwd_timerfun(unsigned long d)
+{
+	mixcomwd_ping();
+
+	mod_timer(&mixcomwd_timer,jiffies+ 5*HZ);
+}
+#endif
+
+/*
+ *	Allow only one person to hold it open
+ */
+
+static int mixcomwd_open(struct inode *inode, struct file *file)
+{
+	if(test_and_set_bit(0,&mixcomwd_opened)) {
+		return -EBUSY;
+	}
+	mixcomwd_ping();
+
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+	if(mixcomwd_timer_alive) {	/* userspace takes over pinging from the kernel timer */
+		del_timer(&mixcomwd_timer);
+		mixcomwd_timer_alive=0;
+	}
+#endif
+	MOD_INC_USE_COUNT;
+
+	return 0;
+}
+/* On close, hand pinging over to a kernel timer (unless NOWAYOUT is configured). */
+static int mixcomwd_release(struct inode *inode, struct file *file)
+{
+
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+	if(mixcomwd_timer_alive) {
+		printk(KERN_ERR "mixcomwd: release called while internal timer alive\n");
+		return -EBUSY;	/* NOTE(review): the open bit and use count stay held on this path */
+	}
+	init_timer(&mixcomwd_timer);
+	mixcomwd_timer.expires=jiffies + 5 * HZ;
+	mixcomwd_timer.function=mixcomwd_timerfun;
+	mixcomwd_timer.data=0;
+	mixcomwd_timer_alive=1;
+	add_timer(&mixcomwd_timer);
+#endif
+	MOD_DEC_USE_COUNT;
+
+	clear_bit(0,&mixcomwd_opened);
+	return 0;
+}
+
+/* Any non-empty write pings the watchdog; one byte is reported as consumed. */
+static ssize_t mixcomwd_write(struct file *file, const char *data, size_t len, loff_t *ppos)
+{
+	if (ppos != &file->f_pos) {
+		return -ESPIPE;
+	}
+
+	if(len)
+	{
+		mixcomwd_ping();
+		return 1;
+	}
+	return 0;
+}
+/* Handles WDIOC_GETSTATUS, WDIOC_GETSUPPORT and WDIOC_KEEPALIVE; anything else is -ENOIOCTLCMD. */
+static int mixcomwd_ioctl(struct inode *inode, struct file *file,
+	unsigned int cmd, unsigned long arg)
+{
+	int status;
+	static struct watchdog_info ident = {
+		WDIOF_KEEPALIVEPING, 1, "MixCOM watchdog"
+	};
+
+	switch(cmd)
+	{
+		case WDIOC_GETSTATUS:
+			status=mixcomwd_opened;	/* non-zero while the device is open ... */
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+			status|=mixcomwd_timer_alive;	/* ... or while the kernel timer is pinging */
+#endif
+			if (copy_to_user((int *)arg, &status, sizeof(int))) {
+				return -EFAULT;
+			}
+			break;
+		case WDIOC_GETSUPPORT:
+			if (copy_to_user((struct watchdog_info *)arg, &ident,
+			    sizeof(ident))) {
+				return -EFAULT;
+			}
+			break;
+		case WDIOC_KEEPALIVE:
+			mixcomwd_ping();
+			break;
+		default:
+			return -ENOIOCTLCMD;
+	}
+	return 0;
+}
+
+static struct file_operations mixcomwd_fops=
+{
+	NULL,		/* Seek */
+	NULL,		/* Read */
+	mixcomwd_write,	/* Write */
+	NULL,		/* Readdir */
+	NULL,		/* Select */
+	mixcomwd_ioctl,	/* Ioctl */
+	NULL,		/* MMap */
+	mixcomwd_open,
+	NULL,		/* flush */
+	mixcomwd_release,
+	NULL,
+	NULL		/* Fasync */
+};
+
+static struct miscdevice mixcomwd_miscdev=
+{
+	WATCHDOG_MINOR,
+	"watchdog",
+	&mixcomwd_fops
+};
+/* Probe one base port: 1 if the watchdog register is free and reads back a MixCOM ID, else 0. */
+static int __init mixcomwd_checkcard(int port)
+{
+	int id;
+
+	if(check_region(port+MIXCOM_WATCHDOG_OFFSET,1)) {	/* the register read below and requested in init */
+		return 0;
+	}
+
+	id=inb_p(port + MIXCOM_WATCHDOG_OFFSET) & 0x3f;
+	if(id!=MIXCOM_ID1 && id!=MIXCOM_ID2) {
+		return 0;
+	}
+	return 1;
+}
+
+/* Find the first card in mixcomwd_ioports[], claim its watchdog port and register the misc device. */
+void __init mixcomwd_init(void)
+{
+	int i;
+	int found=0;
+
+	for (i = 0; mixcomwd_ioports[i] != 0; i++) {
+		if (mixcomwd_checkcard(mixcomwd_ioports[i])) {
+			found = 1;
+			mixcomwd_port = mixcomwd_ioports[i];
+			break;
+		}
+	}
+
+	if (!found) {
+		printk("mixcomwd: No card detected, or port not available.\n");
+		return;
+	}
+
+	request_region(mixcomwd_port+MIXCOM_WATCHDOG_OFFSET,1,"MixCOM watchdog");
+
+	misc_register(&mixcomwd_miscdev);
+	printk("MixCOM watchdog driver v%s, MixCOM card at 0x%3x\n",VERSION,mixcomwd_port);
+}
+
+#ifdef MODULE
+int init_module(void)
+{
+	mixcomwd_init();
+	return mixcomwd_port ? 0 : -ENODEV;	/* port is still 0 when no card was found: fail the load */
+}
+/* Undo mixcomwd_init(); warns if the kernel timer was the only thing pinging the card. */
+void cleanup_module(void)
+{
+#ifndef CONFIG_WATCHDOG_NOWAYOUT
+	if(mixcomwd_timer_alive) {
+		printk(KERN_WARNING "mixcomwd: I quit now, hardware will"
+			" probably reboot!\n");
+		del_timer(&mixcomwd_timer);
+		mixcomwd_timer_alive=0;
+	}
+#endif
+	release_region(mixcomwd_port+MIXCOM_WATCHDOG_OFFSET,1);
+	misc_deregister(&mixcomwd_miscdev);
+}
+#endif
diff --git a/drivers/char/saa5249.c b/drivers/char/saa5249.c index 58c6f7b83496..784ae208a287 100644 --- a/drivers/char/saa5249.c +++ b/drivers/char/saa5249.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include @@ -101,7 +101,7 @@ struct saa5249_device int is_searching[NUM_DAUS]; int disp_mode; int virtual_mode; - struct i2c_bus *bus; + struct i2c_client *client; }; @@ -109,7 +109,6 @@ struct saa5249_device #define CCTRD 35 #define NOACK_REPEAT 10 /* Retry access this many times on failure */ #define CLEAR_DELAY (HZ/20) /* Time required to clear a page */ -#define I2C_TIMEOUT (3*HZ) /* open/close/SDA-check timeout */ #define READY_TIMEOUT (30*HZ/1000) /* Time to wait for ready signal of I²C-bus interface */ #define INIT_DELAY 500 /* Time in usec to wait at initialization of CEA interface */ #define START_DELAY 10 /* Time in usec to wait before starting write-cycle (CEA) */ @@ -135,45 +134,60 @@ struct saa5249_device static struct video_device saa_template; /* Declared near bottom */ -/* - * We do most of the hard work when we become a device on the i2c.
- */ - -static int saa5249_attach(struct i2c_device *device) +/* Addresses to scan */ +static unsigned short normal_i2c[] = {34>>1,I2C_CLIENT_END}; +static unsigned short normal_i2c_range[] = {I2C_CLIENT_END}; +static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END }; +static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END }; +static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END }; +static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END }; +static unsigned short force[2] = { I2C_CLIENT_END, I2C_CLIENT_END }; + +static struct i2c_client_address_data addr_data = { + normal_i2c, normal_i2c_range, + probe, probe_range, + ignore, ignore_range, + force +}; + +static struct i2c_client client_template; + +static int saa5249_attach(struct i2c_adapter *adap, int addr, unsigned short flags, int kind) { int pgbuf; int err; + struct i2c_client *client; struct video_device *vd; struct saa5249_device *t; - /* Only attach these chips to the BT848 bus for now */ - - if(device->bus->id!=I2C_BUSID_BT848) - return -EINVAL; - - strcpy(device->name, IF_NAME); + + printk(KERN_INFO "saa5249: teletext chip found.\n"); + client=kmalloc(sizeof(*client), GFP_KERNEL); + if(client==NULL) + return -ENOMEM; + client_template.adapter = adap; + client_template.addr = addr; + memcpy(client, &client_template, sizeof(*client)); + t = kmalloc(sizeof(*t), GFP_KERNEL); + if(t==NULL) + { + kfree(client); + return -ENOMEM; + } + memset(t, 0, sizeof(*t)); + strcpy(client->name, IF_NAME); /* * Now create a video4linux device */ - vd=(struct video_device *)kmalloc(sizeof(struct video_device), GFP_KERNEL); + client->data = vd=(struct video_device *)kmalloc(sizeof(struct video_device), GFP_KERNEL); if(vd==NULL) - return -ENOMEM; - - memcpy(vd, &saa_template, sizeof(*vd)); - - /* - * Attach an saa5249 device - */ - - t=(struct saa5249_device *)kmalloc(sizeof(struct saa5249_device), GFP_KERNEL); - if(t==NULL) { - kfree(vd); + kfree(t); + kfree(client); 
return -ENOMEM; } - - memset(t, 0, sizeof(*t)); + memcpy(vd, &saa_template, sizeof(*vd)); for (pgbuf = 0; pgbuf < NUM_DAUS; pgbuf++) { @@ -186,7 +200,6 @@ static int saa5249_attach(struct i2c_device *device) t->is_searching[pgbuf] = FALSE; } vd->priv=t; - device->data=vd; /* * Register it @@ -196,22 +209,43 @@ static int saa5249_attach(struct i2c_device *device) { kfree(t); kfree(vd); + kfree(client); return err; } - t->bus = device->bus; + t->client = client; + i2c_attach_client(client); + MOD_INC_USE_COUNT; + return 0; +} + +/* + * We do most of the hard work when we become a device on the i2c. + */ + +static int saa5249_probe(struct i2c_adapter *adap) +{ + /* Only attach these chips to the BT848 bus for now */ + + if (adap->id == (I2C_ALGO_BIT | I2C_HW_B_BT848)) + { + return i2c_probe(adap, &addr_data, saa5249_attach); + } return 0; } -static int saa5249_detach(struct i2c_device *device) +static int saa5249_detach(struct i2c_client *client) { - struct video_device *vd=device->data; + struct video_device *vd=client->data; + i2c_detach_client(client); video_unregister_device(vd); kfree(vd->priv); kfree(vd); + kfree(client); + MOD_DEC_USE_COUNT; return 0; } -static int saa5249_command(struct i2c_device *device, +static int saa5249_command(struct i2c_client *device, unsigned int cmd, void *arg) { return -EINVAL; @@ -223,12 +257,21 @@ static struct i2c_driver i2c_driver_videotext = { IF_NAME, /* name */ I2C_DRIVERID_VIDEOTEXT, /* in i2c.h */ - 34, 35, /* Addr range */ - saa5249_attach, + I2C_DF_NOTIFY, + saa5249_probe, saa5249_detach, saa5249_command }; +static struct i2c_client client_template = { + "(unset)", + -1, + 0, + 0, + NULL, + &i2c_driver_videotext +}; + /* * Wait the given number of jiffies (10ms). This calls the scheduler, so the actual * delay may be longer. @@ -252,109 +295,46 @@ static void jdelay(unsigned long delay) } -/* Send arbitrary number of bytes to I²C-bus. Start & stop handshaking is done by this routine. 
- * adr should be address of I²C-device, varargs-list of values to send must be terminated by -1 - * Returns -1 if I²C-device didn't send acknowledge, 0 otherwise +/* + * I2C interfaces */ - -static int i2c_senddata(struct saa5249_device *t, int adr, ...) + +static int i2c_sendbuf(struct saa5249_device *t, int reg, int count, u8 *data) { - int val, loop; - va_list argp; - - for (loop = 0; loop <= NOACK_REPEAT; loop++) - { - i2c_start(t->bus); - if (i2c_sendbyte(t->bus, adr, 0) != 0) - goto loopend; - - va_start(argp, adr); - while ((val = va_arg(argp, int)) != -1) - { - if (val < 0 || val > 255) - { - printk(KERN_ERR "vtx: internal error in i2c_senddata\n"); - break; - } - if (i2c_sendbyte(t->bus, val, 0) != 0) - goto loopend; - } - va_end(argp); - i2c_stop(t->bus); + char buf[64]; + + buf[0] = reg; + memcpy(buf+1, data, count); + + if(i2c_master_send(t->client, buf, count+1)==count+1) return 0; -loopend: - i2c_stop(t->bus); - } - va_end(argp); return -1; } - -/* Send count number of bytes from buffer buf to I²C-device adr. Start & stop handshaking is - * done by this routine. If uaccess is TRUE, data is read from user-space with get_user. - * Returns -1 if I²C-device didn't send acknowledge, 0 otherwise - */ - -static int i2c_sendbuf(struct saa5249_device *t, int adr, int reg, int count, u8 *buf, int uaccess) +static int i2c_senddata(struct saa5249_device *t, ...) { - int pos, loop; - u8 val; - - for (loop = 0; loop <= NOACK_REPEAT; loop++) - { - i2c_start(t->bus); - if (i2c_sendbyte(t->bus, adr, 0) != 0 || i2c_sendbyte(t->bus, reg, 0) != 0) - goto loopend; - for (pos = 0; pos < count; pos++) - { - /* FIXME: FAULT WITH CLI/SPINLOCK ?? 
*/ - if (uaccess) - get_user(val, buf + pos); - else - val = buf[pos]; - if (i2c_sendbyte(t->bus, val, 0) != 0) - goto loopend; - RESCHED; - } - i2c_stop(t->bus); - return 0; -loopend: - i2c_stop(t->bus); - } - return -1; + unsigned char buf[64]; + int v; + int ct=0; + va_list argp; + va_start(argp,t); + + while((v=va_arg(argp,int))!=-1) + buf[ct++]=v; + return i2c_sendbuf(t, buf[0], ct-1, buf+1); } - /* Get count number of bytes from I²C-device at address adr, store them in buf. Start & stop * handshaking is done by this routine, ack will be sent after the last byte to inhibit further * sending of data. If uaccess is TRUE, data is written to user-space with put_user. * Returns -1 if I²C-device didn't send acknowledge, 0 otherwise */ -static int i2c_getdata(struct saa5249_device *t, int adr, int count, u8 *buf, int uaccess) +static int i2c_getdata(struct saa5249_device *t, int count, u8 *buf) { - int pos, loop, val; - - for (loop = 0; loop <= NOACK_REPEAT; loop++) - { - i2c_start(t->bus); - if (i2c_sendbyte(t->bus, adr, 1) != 0) - goto loopend; - for (pos = 0; pos < count; pos++) - { - val = i2c_readbyte(t->bus, (pos==count-1) ? 
1 : 0); - if (uaccess) - put_user(val, buf + pos); - else - buf[pos] = val; - RESCHED; - } - i2c_stop(t->bus); - return 0; -loopend: - i2c_stop(t->bus); - } - return -1; + if(i2c_master_recv(t->client, buf, count)!=count) + return -1; + return 0; } @@ -449,41 +429,41 @@ static int saa5249_ioctl(struct video_device *vd, unsigned int cmd, void *arg) return -EINVAL; if (!t->vdau[req.pgbuf].stopped) { - if (i2c_senddata(t, CCTWR, 2, 0, -1) || - i2c_sendbuf(t, CCTWR, 3, sizeof(t->vdau[0].sregs), t->vdau[req.pgbuf].sregs, FALSE) || - i2c_senddata(t, CCTWR, 8, 0, 25, 0, ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', -1) || - i2c_senddata(t, CCTWR, 2, 0, t->vdau[req.pgbuf].sregs[0] | 8, -1) || - i2c_senddata(t, CCTWR, 8, 0, 25, 0, -1)) + if (i2c_senddata(t, 2, 0, -1) || + i2c_sendbuf(t, 3, sizeof(t->vdau[0].sregs), t->vdau[req.pgbuf].sregs) || + i2c_senddata(t, 8, 0, 25, 0, ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', -1) || + i2c_senddata(t, 2, 0, t->vdau[req.pgbuf].sregs[0] | 8, -1) || + i2c_senddata(t, 8, 0, 25, 0, -1)) return -EIO; jdelay(PAGE_WAIT); - if (i2c_getdata(t, CCTRD, 10, infobits, FALSE)) + if (i2c_getdata(t, 10, infobits)) return -EIO; if (!(infobits[8] & 0x10) && !(infobits[7] & 0xf0) && /* check FOUND-bit */ (memcmp(infobits, t->vdau[req.pgbuf].laststat, sizeof(infobits)) || time_after_eq(jiffies, t->vdau[req.pgbuf].expire))) { /* check if new page arrived */ - if (i2c_senddata(t, CCTWR, 8, 0, 0, 0, -1) || - i2c_getdata(t, CCTRD, VTX_PAGESIZE, t->vdau[req.pgbuf].pgbuf, FALSE)) + if (i2c_senddata(t, 8, 0, 0, 0, -1) || + i2c_getdata(t, VTX_PAGESIZE, t->vdau[req.pgbuf].pgbuf)) return -EIO; t->vdau[req.pgbuf].expire = jiffies + PGBUF_EXPIRE; memset(t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE, ' ', VTX_VIRTUALSIZE - VTX_PAGESIZE); if (t->virtual_mode) { /* Packet X/24 */ - if (i2c_senddata(t, CCTWR, 8, 0, 0x20, 0, -1) || - i2c_getdata(t, CCTRD, 40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 20 * 40, FALSE)) + if (i2c_senddata(t, 8, 0, 0x20, 0, -1) || + i2c_getdata(t, 
40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 20 * 40)) return -EIO; /* Packet X/27/0 */ - if (i2c_senddata(t, CCTWR, 8, 0, 0x21, 0, -1) || - i2c_getdata(t, CCTRD, 40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 16 * 40, FALSE)) + if (i2c_senddata(t, 8, 0, 0x21, 0, -1) || + i2c_getdata(t, 40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 16 * 40)) return -EIO; /* Packet 8/30/0...8/30/15 * FIXME: AFAIK, the 5249 does hamming-decoding for some bytes in packet 8/30, * so we should undo this here. */ - if (i2c_senddata(t, CCTWR, 8, 0, 0x22, 0, -1) || - i2c_getdata(t, CCTRD, 40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 23 * 40, FALSE)) + if (i2c_senddata(t, 8, 0, 0x22, 0, -1) || + i2c_getdata(t, 40, t->vdau[req.pgbuf].pgbuf + VTX_PAGESIZE + 23 * 40)) return -EIO; } t->vdau[req.pgbuf].clrfound = FALSE; @@ -554,20 +534,30 @@ static int saa5249_ioctl(struct video_device *vd, unsigned int cmd, void *arg) if (req.start <= 39 && req.end >= 32) { + int len; + char buf[16]; start = MAX(req.start, 32); end = MIN(req.end, 39); - if (i2c_senddata(t, CCTWR, 8, 0, 0, start, -1) || - i2c_getdata(t, CCTRD, end - start + 1, req.buffer + start - req.start, TRUE)) + len=end-start+1; + if (i2c_senddata(t, 8, 0, 0, start, -1) || + i2c_getdata(t, len, buf)) return -EIO; + if(copy_to_user(req.buffer+start-req.start, buf, len)) + return -EFAULT; } /* Insert the current header if DAU is still searching for a page */ if (req.start <= 31 && req.end >= 7 && t->is_searching[req.pgbuf]) { + char buf[32]; + int len; start = MAX(req.start, 7); end = MIN(req.end, 31); - if (i2c_senddata(t, CCTWR, 8, 0, 0, start, -1) || - i2c_getdata(t, CCTRD, end - start + 1, req.buffer + start - req.start, TRUE)) + len=end-start+1; + if (i2c_senddata(t, 8, 0, 0, start, -1) || + i2c_getdata(t, len, buf)) return -EIO; + if(copy_to_user(req.buffer+start-req.start, buf, len)) + return -EFAULT; } return 0; } @@ -592,11 +582,11 @@ static int saa5249_ioctl(struct video_device *vd, unsigned int cmd, void *arg) case 
VTXIOCCLRCACHE: { - if (i2c_senddata(t ,CCTWR, 0, NUM_DAUS, 0, 8, -1) || i2c_senddata(t, CCTWR, 11, + if (i2c_senddata(t, 0, NUM_DAUS, 0, 8, -1) || i2c_senddata(t, 11, ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', -1)) return -EIO; - if (i2c_senddata(t, CCTWR, 3, 0x20, -1)) + if (i2c_senddata(t, 3, 0x20, -1)) return -EIO; jdelay(10 * CLEAR_DELAY); /* I have no idea how long we have to wait here */ return 0; @@ -618,14 +608,14 @@ static int saa5249_open(struct video_device *vd, int nb) struct saa5249_device *t=vd->priv; int pgbuf; - if (t->bus==NULL) + if (t->client==NULL) return -ENODEV; - if (i2c_senddata(t, CCTWR, 0, 0, -1) || /* Select R11 */ + if (i2c_senddata(t, 0, 0, -1) || /* Select R11 */ /* Turn off parity checks (we do this ourselves) */ - i2c_senddata(t, CCTWR, 1, disp_modes[t->disp_mode][0], 0, -1) || + i2c_senddata(t, 1, disp_modes[t->disp_mode][0], 0, -1) || /* Display TV-picture, no virtual rows */ - i2c_senddata(t, CCTWR, 4, NUM_DAUS, disp_modes[t->disp_mode][1], disp_modes[t->disp_mode][2], 7, -1)) /* Set display to page 4 */ + i2c_senddata(t, 4, NUM_DAUS, disp_modes[t->disp_mode][1], disp_modes[t->disp_mode][2], 7, -1)) /* Set display to page 4 */ { return -EIO; @@ -651,8 +641,8 @@ static int saa5249_open(struct video_device *vd, int nb) static void saa5249_release(struct video_device *vd) { struct saa5249_device *t=vd->priv; - i2c_senddata(t, CCTWR, 1, 0x20, -1); /* Turn off CCT */ - i2c_senddata(t, CCTWR, 5, 3, 3, -1); /* Turn off TV-display */ + i2c_senddata(t, 1, 0x20, -1); /* Turn off CCT */ + i2c_senddata(t, 5, 3, 3, -1); /* Turn off TV-display */ MOD_DEC_USE_COUNT; return; } @@ -666,12 +656,12 @@ static int __init init_saa_5249 (void) { printk(KERN_INFO "SAA5249 driver (" IF_NAME " interface) for VideoText version %d.%d\n", VTX_VER_MAJ, VTX_VER_MIN); - return i2c_register_driver(&i2c_driver_videotext); + return i2c_add_driver(&i2c_driver_videotext); } static void __exit cleanup_saa_5249 
(void) { - i2c_unregister_driver(&i2c_driver_videotext); + i2c_del_driver(&i2c_driver_videotext); } module_init(init_saa_5249); diff --git a/drivers/char/sx.c b/drivers/char/sx.c index eacde318ea32..ae2fb2d705be 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -2457,20 +2457,20 @@ void fix_sx_pci (PDEV, struct sx_board *board) { unsigned int hwbase; unsigned long rebase; - int t; + unsigned int t; -#define CNTRL_REG_OFFSET 0x14 +#define CNTRL_REG_OFFSET 0x50 +#define CNTRL_REG_GOODVALUE 0x00260000 pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &hwbase); hwbase &= PCI_BASE_ADDRESS_MEM_MASK; rebase = (ulong) ioremap(hwbase, 0x80); - t = readb (rebase + CNTRL_REG_OFFSET*4 + 2); - if (t != 0x06) { - printk (KERN_DEBUG "sx: performing cntrl reg fix: %02x -> 06\n", t); - writeb (0x06, rebase + CNTRL_REG_OFFSET*4+2); + t = readl (rebase + CNTRL_REG_OFFSET); + if (t != CNTRL_REG_GOODVALUE) { + printk (KERN_DEBUG "sx: performing cntrl reg fix: %08x -> %08x\n", t, CNTRL_REG_GOODVALUE); + writel (CNTRL_REG_GOODVALUE, rebase + CNTRL_REG_OFFSET); } my_iounmap (hwbase, rebase); - } #endif diff --git a/drivers/net/setup.c b/drivers/net/setup.c index 601617f79f46..35b90ff3b8b1 100644 --- a/drivers/net/setup.c +++ b/drivers/net/setup.c @@ -48,6 +48,7 @@ extern int rcpci_probe(void); extern int rr_hippi_probe(void); extern int rtl8139_probe(void); extern int sdla_setup(void); +extern int sdla_c_setup(void); extern int sis900_probe(void); extern int skge_probe(void); extern int sparc_lance_probe(void); @@ -94,7 +95,7 @@ struct net_probe pci_probes[] __initdata = { {dlci_setup, 0}, #endif #if defined(CONFIG_SDLA) - {sdla_setup, 0}, + {sdla_c_setup, 0}, #endif #if defined(CONFIG_LAPBETHER) {lapbeth_init, 0}, diff --git a/drivers/net/wan/cycx_main.c b/drivers/net/wan/cycx_main.c index b9b72ddeb1b8..02eab0c53ba8 100644 --- a/drivers/net/wan/cycx_main.c +++ b/drivers/net/wan/cycx_main.c @@ -13,6 +13,8 @@ * as published by the Free Software Foundation; either version * 2 of the 
License, or (at your option) any later version. * ============================================================================ +* 2000/01/21 acme rename cyclomx_open to cyclomx_mod_inc_use_count +* and cyclomx_close to cyclomx_mod_dec_use_count * 2000/01/08 acme cleanup * 1999/11/06 acme cycx_down back to life (it needs to be * called to iounmap the dpmbase) @@ -49,7 +51,7 @@ MODULE_DESCRIPTION("Cyclom 2X Sync Card Driver."); /* Defines & Macros */ #define DRV_VERSION 0 /* version number */ -#define DRV_RELEASE 5 /* release (minor version) number */ +#define DRV_RELEASE 6 /* release (minor version) number */ #define MAX_CARDS 1 /* max number of adapters */ #ifndef CONFIG_CYCLOMX_CARDS /* configurable option */ @@ -132,9 +134,9 @@ int __init cyclomx_init (void) err = register_wan_device(wandev); if (err) { - printk(KERN_ERR - "%s: %s registration failed with error %d!\n", - drvname, card->devname, err); + printk(KERN_ERR "%s: %s registration failed with " + "error %d!\n", + drvname, card->devname, err); break; } } @@ -338,7 +340,7 @@ static void cycx_isr (int irq, void *dev_id, struct pt_regs *regs) * have to call MOD_INC_USE_COUNT, but cannot include 'module.h' where it's * defined more than once into the same kernel module. */ -void cyclomx_open (cycx_t *card) +void cyclomx_mod_inc_use_count (cycx_t *card) { ++card->open_cnt; MOD_INC_USE_COUNT; @@ -350,7 +352,7 @@ void cyclomx_open (cycx_t *card) * have to call MOD_DEC_USE_COUNT, but cannot include 'module.h' where it's * defined more than once into the same kernel module. */ -void cyclomx_close (cycx_t *card) +void cyclomx_mod_dec_use_count (cycx_t *card) { --card->open_cnt; MOD_DEC_USE_COUNT; diff --git a/drivers/net/wan/cycx_x25.c b/drivers/net/wan/cycx_x25.c index e9156df92aba..e11c9bcfe5f6 100644 --- a/drivers/net/wan/cycx_x25.c +++ b/drivers/net/wan/cycx_x25.c @@ -70,6 +70,7 @@ * 1998/12/26 acme Minimal debug code cleanup * 1998/08/08 acme Initial version. 
*/ + #define CYCLOMX_X25_DEBUG 1 #include @@ -188,7 +189,6 @@ static void x25_dump_devs(wan_device_t *wandev); * * This routine is called by the main Cyclom 2X module during setup. At this * point adapter is completely initialized and X.25 firmware is running. - * o read firmware version (to make sure it's alive) * o configure adapter * o initialize protocol-specific fields of the adapter data space. * @@ -336,7 +336,8 @@ static int update (wan_device_t *wandev) * * Return: 0 o.k. * < 0 failure (channel will not be created) */ -static int new_if (wan_device_t *wandev, struct net_device *dev, wanif_conf_t *conf) +static int new_if (wan_device_t *wandev, struct net_device *dev, + wanif_conf_t *conf) { cycx_t *card = wandev->private; x25_channel_t *chan; @@ -507,7 +508,7 @@ static int if_open (struct net_device *dev) dev->interrupt = 0; dev->tbusy = 0; dev->start = 1; - cyclomx_open(card); + cyclomx_mod_inc_use_count(card); return 0; } @@ -525,7 +526,7 @@ static int if_close (struct net_device *dev) if (chan->state == WAN_CONNECTED || chan->state == WAN_CONNECTING) chan_disconnect(dev); - cyclomx_close(card); + cyclomx_mod_dec_use_count(card); return 0; } diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 9cadf1e0b9a6..29bedf8f9635 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -1666,7 +1666,7 @@ int __init sdla_init(struct net_device *dev) return(0); } -int __init sdla_setup(void) +int __init sdla_c_setup(void) { printk("%s.\n", version); register_frad(devname); @@ -1680,7 +1680,7 @@ int init_module(void) { int result; - sdla_setup(); + sdla_c_setup(); if ((result = register_netdev(&sdla0)) != 0) return result; return 0; diff --git a/drivers/sbus/audio/dbri.c b/drivers/sbus/audio/dbri.c index 66a34b35c1f5..88604450d199 100644 --- a/drivers/sbus/audio/dbri.c +++ b/drivers/sbus/audio/dbri.c @@ -1,4 +1,4 @@ -/* $Id: dbri.c,v 1.16 1999/11/19 09:56:05 davem Exp $ +/* $Id: dbri.c,v 1.17 2000/01/20 07:57:47 anton Exp $ * 
drivers/sbus/audio/dbri.c * * Copyright (C) 1997 Rudolf Koenig (rfkoenig@immd4.informatik.uni-erlangen.de) @@ -226,7 +226,7 @@ static void dbri_detach(struct dbri *dbri) free_irq(dbri->irq, dbri); sbus_iounmap(dbri->regs, dbri->regs_size); sbus_free_consistant(dbri->sdev, sizeof(struct dbri_dma), - dbri->dma, dbry->dma_dvma); + dbri->dma, dbri->dma_dvma); kfree(dbri); } @@ -999,7 +999,7 @@ static void recv_on_pipe(struct dbri *dbri, int pipe, /* Make sure buffer size is multiple of four */ len &= ~3; - buf_buffer_base = buf_buffer = sbus_map_single(dbri->sdev, buffer, len); + bus_buffer_base = bus_buffer = sbus_map_single(dbri->sdev, buffer, len); while (len > 0) { int rd, mylen; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 863bb3fce838..b083bc987296 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -237,52 +237,18 @@ static void mega_Convert8ldTo40ld( mega_RAIDINQ *inquiry, megaRaidProductInfo *productInfo ); +#include +#include - -#if LINUX_VERSION_CODE > 0x020100 -# include -# include -# define cpuid smp_processor_id() -# if LINUX_VERSION_CODE < 0x020195 -# define DRIVER_LOCK_T unsigned long cpu_flags = 0; -# define DRIVER_LOCK_INIT(p) \ - spin_lock_init(&p->mega_lock); -# define DRIVER_LOCK(p) \ - if(!p->cpu_lock_count[cpuid]) { \ - spin_lock_irqsave(&p->mega_lock, cpu_flags); \ - p->cpu_lock_count[cpuid]++; \ - } else { \ - p->cpu_lock_count[cpuid]++; \ - } -# define DRIVER_UNLOCK(p) \ - if(--p->cpu_lock_count[cpuid] == 0) \ - spin_unlock_irqrestore(&p->mega_lock, cpu_flags); -# define IO_LOCK(p) spin_lock_irqsave(&io_request_lock,cpu_flags); -# define IO_UNLOCK(p) spin_unlock_irqrestore(&io_request_lock,cpu_flags); -# else -# define DRIVER_LOCK_T -# define DRIVER_LOCK_INIT(p) -# define DRIVER_LOCK(p) -# define DRIVER_UNLOCK(p) -# define IO_LOCK_T unsigned long io_flags = 0; -# define IO_LOCK spin_lock_irqsave(&io_request_lock,io_flags); -# define IO_UNLOCK spin_unlock_irqrestore(&io_request_lock,io_flags); -# endif 
-#else -# define cpuid 0 -# define DRIVER_LOCK_T long cpu_flags; -# define DRIVER_LOCK_INIT(p) -# define DRIVER_LOCK(p) \ - save_flags(cpu_flags); \ - cli(); -# define DRIVER_UNLOCK(p) \ - restore_flags(cpu_flags); -# define IO_LOCK(p) DRIVER_LOCK(p) -# define IO_UNLOCK(p) DRIVER_UNLOCK(p) -# define le32_to_cpu(x) (x) -# define cpu_to_le32(x) (x) -#endif +#define cpuid smp_processor_id() +#define DRIVER_LOCK_T +#define DRIVER_LOCK_INIT(p) +#define DRIVER_LOCK(p) +#define DRIVER_UNLOCK(p) +#define IO_LOCK_T unsigned long io_flags = 0; +#define IO_LOCK spin_lock_irqsave(&io_request_lock,io_flags); +#define IO_UNLOCK spin_unlock_irqrestore(&io_request_lock,io_flags); /* set SERDEBUG to 1 to enable serial debugging */ #define SERDEBUG 0 diff --git a/drivers/scsi/qlogicfc.c b/drivers/scsi/qlogicfc.c index 8d7dfd26c2cb..a4b9332b0dd5 100644 --- a/drivers/scsi/qlogicfc.c +++ b/drivers/scsi/qlogicfc.c @@ -751,7 +751,7 @@ int isp2x00_detect(Scsi_Host_Template * tmpt) hostdata->control_block.firm_opts = 0x0108; hostdata->control_block.max_frame_len = 2048; hostdata->control_block.max_iocb = 256; - hostdata->control_block.exec_throttle = 8; + hostdata->control_block.exec_throttle = QLOGICFC_CMD_PER_LUN; hostdata->control_block.retry_delay = 5; hostdata->control_block.retry_cnt = 1; hostdata->control_block.node_name[0] = 0x0020; @@ -1280,8 +1280,9 @@ int isp2x00_queuecommand(Scsi_Cmnd * Cmnd, void (*done) (Scsi_Cmnd *)) /* scsi.c expects sense info in a different buffer */ cmd->dataseg[0].d_base = virt_to_bus_low32(Cmnd->sense_buffer); #if BITS_PER_LONG > 32 - cmd->dataseg[0].d_base_hi = virt_to_bus_high32(Cmnd->request_buffer); + cmd->dataseg[0].d_base_hi = virt_to_bus_high32(Cmnd->sense_buffer); #endif + cmd->dataseg[0].d_count = sizeof(Cmnd->sense_buffer); cmd->segment_cnt = 1; cmd->control_flags = CFLAG_READ; break; @@ -1848,7 +1849,7 @@ static int isp2x00_get_nvram_defaults(struct Scsi_Host *host, struct init_cb *co static int isp2x00_init(struct Scsi_Host *sh) { - u_int 
io_base; + u_long io_base; struct isp2x00_hostdata *hostdata; u_char revision; u_int irq; diff --git a/drivers/sound/ad1848.c b/drivers/sound/ad1848.c index 8de63a0682b6..498db82f0681 100644 --- a/drivers/sound/ad1848.c +++ b/drivers/sound/ad1848.c @@ -71,7 +71,8 @@ typedef struct #define MD_4232 5 #define MD_C930 6 #define MD_IWAVE 7 -#define MD_4235 8 /* Crystal Audio CS4235 */ +#define MD_4235 8 /* Crystal Audio CS4235 */ +#define MD_1845_SSCAPE 9 /* Ensoniq Soundscape PNP*/ /* Mixer parameters */ int recmask; @@ -100,6 +101,7 @@ ad1848_port_info; static int nr_ad1848_devs = 0; int deskpro_xl = 0; +int deskpro_m = 0; #ifdef CONFIG_SOUND_SPRO int soundpro = 1; #else @@ -117,7 +119,7 @@ static int timer_installed = -1; #endif -static int ad_format_mask[9 /*devc->model */ ] = +static int ad_format_mask[10 /*devc->model */ ] = { 0, AFMT_U8 | AFMT_S16_LE | AFMT_MU_LAW | AFMT_A_LAW, @@ -127,7 +129,8 @@ static int ad_format_mask[9 /*devc->model */ ] = AFMT_U8 | AFMT_S16_LE | AFMT_MU_LAW | AFMT_A_LAW | AFMT_S16_BE | AFMT_IMA_ADPCM, AFMT_U8 | AFMT_S16_LE | AFMT_MU_LAW | AFMT_A_LAW | AFMT_S16_BE | AFMT_IMA_ADPCM, AFMT_U8 | AFMT_S16_LE | AFMT_MU_LAW | AFMT_A_LAW | AFMT_S16_BE | AFMT_IMA_ADPCM, - AFMT_U8 | AFMT_S16_LE /* CS4235 */ + AFMT_U8 | AFMT_S16_LE /* CS4235 */, + AFMT_U8 | AFMT_S16_LE | AFMT_MU_LAW | AFMT_A_LAW /* Ensoniq Soundscape*/ }; static ad1848_info adev_info[MAX_AUDIO_DEV]; @@ -140,7 +143,7 @@ static ad1848_info adev_info[MAX_AUDIO_DEV]; static struct { unsigned char flags; #define CAP_F_TIMER 0x01 -} capabilities [9 /*devc->model */ ] = { +} capabilities [10 /*devc->model */ ] = { {0} ,{0} /* MD_1848 */ ,{CAP_F_TIMER} /* MD_4231 */ @@ -149,7 +152,8 @@ static struct { ,{CAP_F_TIMER} /* MD_4232 */ ,{0} /* MD_C930 */ ,{CAP_F_TIMER} /* MD_IWAVE */ - ,{0} /* MD_4235 */ + ,{0} /* MD_4235 */ + ,{CAP_F_TIMER} /* MD_1845_SSCAPE */ }; static int ad1848_open(int dev, int mode); @@ -231,7 +235,7 @@ static void wait_for_calibration(ad1848_info * devc) while (timeout > 0 
&& (ad_read(devc, 11) & 0x20)) timeout--; if (ad_read(devc, 11) & 0x20) - if (devc->model != MD_1845) + if ( (devc->model != MD_1845) && (devc->model != MD_1845_SSCAPE)) printk(KERN_WARNING "ad1848: Auto calibration timed out(3).\n"); } @@ -555,6 +559,7 @@ static void ad1848_mixer_reset(ad1848_info * devc) case MD_4231: case MD_4231A: case MD_1845: + case MD_1845_SSCAPE: devc->supported_devices = MODE2_MIXER_DEVICES; break; @@ -751,7 +756,7 @@ static int ad1848_set_speed(int dev, int arg) if (arg <= 0) return portc->speed; - if (devc->model == MD_1845) /* AD1845 has different timer than others */ + if (devc->model == MD_1845 || devc->model == MD_1845_SSCAPE) /* AD1845 has different timer than others */ { if (arg < 4000) arg = 4000; @@ -1087,7 +1092,7 @@ static int ad1848_prepare_for_output(int dev, int bsize, int bcount) ad_enter_MCE(devc); /* Enables changes to the format select reg */ - if (devc->model == MD_1845) /* Use alternate speed select registers */ + if (devc->model == MD_1845 || devc->model == MD_1845_SSCAPE) /* Use alternate speed select registers */ { fs &= 0xf0; /* Mask off the rate select bits */ @@ -1157,7 +1162,7 @@ static int ad1848_prepare_for_input(int dev, int bsize, int bcount) ad_enter_MCE(devc); /* Enables changes to the format select reg */ - if (devc->model == MD_1845) /* Use alternate speed select registers */ + if ((devc->model == MD_1845) || (devc->model == MD_1845_SSCAPE)) /* Use alternate speed select registers */ { fs &= 0xf0; /* Mask off the rate select bits */ @@ -1193,7 +1198,7 @@ static int ad1848_prepare_for_input(int dev, int bsize, int bcount) while (timeout < 10000 && inb(devc->base) == 0x80) timeout++; - if (devc->model != MD_1848 && devc->model != MD_1845) + if (devc->model != MD_1848 && devc->model != MD_1845 && devc->model != MD_1845_SSCAPE) { /* * CS4231 compatible devices don't have separate sampling rate selection @@ -1405,13 +1410,17 @@ static void ad1848_init_hw(ad1848_info * devc) if (devc->model > MD_1848) { -
ad_write(devc, 12, ad_read(devc, 12) | 0x40); /* Mode2 = enabled */ + if (devc->model == MD_1845_SSCAPE) + ad_write(devc, 12, ad_read(devc, 12) | 0x50); + else + ad_write(devc, 12, ad_read(devc, 12) | 0x40); /* Mode2 = enabled */ if (devc->model == MD_IWAVE) ad_write(devc, 12, 0x6c); /* Select codec mode 3 */ - for (i = 16; i < 32; i++) - ad_write(devc, i, init_values[i]); + if (devc-> model != MD_1845_SSCAPE) + for (i = 16; i < 32; i++) + ad_write(devc, i, init_values[i]); if (devc->model == MD_IWAVE) ad_write(devc, 16, 0x30); /* Playback and capture counters enabled */ @@ -1423,7 +1432,7 @@ static void ad1848_init_hw(ad1848_info * devc) else ad_write(devc, 9, ad_read(devc, 9) | 0x04); /* Single DMA mode */ - if (devc->model == MD_1845) + if (devc->model == MD_1845 || devc->model == MD_1845_SSCAPE) ad_write(devc, 27, ad_read(devc, 27) | 0x08); /* Alternate freq select enabled */ if (devc->model == MD_IWAVE) @@ -1462,6 +1471,7 @@ int ad1848_detect(int io_base, int *ad_flags, int *osp) int interwave = 0; int ad1847_flag = 0; int cs4248_flag = 0; + int sscape_flag = 0; int i; @@ -1474,6 +1484,13 @@ int ad1848_detect(int io_base, int *ad_flags, int *osp) interwave = 1; *ad_flags = 0; } + + if (*ad_flags == 0x87654321) + { + sscape_flag = 1; + *ad_flags = 0; + } + if (*ad_flags == 0x12345677) { cs4248_flag = 1; @@ -1821,6 +1838,9 @@ int ad1848_detect(int io_base, int *ad_flags, int *osp) devc->chip_name = "AD1847"; + if (sscape_flag == 1) + devc->model = MD_1845_SSCAPE; + return 1; } @@ -1979,7 +1999,7 @@ int ad1848_control(int cmd, int arg) switch (cmd) { case AD1848_SET_XTAL: /* Change clock frequency of AD1845 (only ) */ - if (devc->model != MD_1845) + if (devc->model != MD_1845 && devc->model != MD_1845_SSCAPE) return -EINVAL; ad_enter_MCE(devc); ad_write(devc, 29, (ad_read(devc, 29) & 0x1f) | (arg << 5)); @@ -2144,6 +2164,34 @@ interrupt_again: /* Jump back here if int status doesn't reset */ } } +/* + * Experimental initialization sequence for the integrated
sound system + * of the Compaq Deskpro M. + */ + +static int init_deskpro_m(struct address_info *hw_config) +{ + unsigned char tmp; + + if ((tmp = inb(0xc44)) == 0xff) + { + DDB(printk("init_deskpro_m: Dead port 0xc44\n")); + return 0; + } + + outb(0x10, 0xc44); + outb(0x40, 0xc45); + outb(0x00, 0xc46); + outb(0xe8, 0xc47); + outb(0x14, 0xc44); + outb(0x40, 0xc45); + outb(0x00, 0xc46); + outb(0xe8, 0xc47); + outb(0x10, 0xc44); + + return 1; +} + /* * Experimental initialization sequence for the integrated sound system * of Compaq Deskpro XL. @@ -2370,6 +2418,12 @@ int probe_ms_sound(struct address_info *hw_config) return 0; } + if (deskpro_m) /* Compaq Deskpro M */ + { + if (!init_deskpro_m(hw_config)) + return 0; + } + /* * Check if the IO port returns valid signature. The original MS Sound * system returns 0x04 while some cards (AudioTrix Pro for example) @@ -2558,7 +2612,7 @@ static unsigned int ad1848_tmr_start(int dev, unsigned int usecs) * the timer divider. */ - if (devc->model == MD_1845) + if (devc->model == MD_1845 || devc->model == MD_1845_SSCAPE) xtal_nsecs = 10050; else if (ad_read(devc, 8) & 0x01) xtal_nsecs = 9920; @@ -2659,6 +2713,7 @@ MODULE_PARM(dma, "i"); /* First DMA channel */ MODULE_PARM(dma2, "i"); /* Second DMA channel */ MODULE_PARM(type, "i"); /* Card type */ MODULE_PARM(deskpro_xl, "i"); /* Special magic for Deskpro XL boxen */ +MODULE_PARM(deskpro_m, "i"); /* Special magic for Deskpro M box */ MODULE_PARM(soundpro, "i"); /* More special magic for SoundPro chips */ int io = -1; diff --git a/drivers/sound/msnd_pinnacle.c b/drivers/sound/msnd_pinnacle.c index dd1b596f46d7..0217f3a8542d 100644 --- a/drivers/sound/msnd_pinnacle.c +++ b/drivers/sound/msnd_pinnacle.c @@ -46,6 +46,7 @@ # include #endif #include +#include #include "sound_config.h" #include "sound_firmware.h" #ifdef MSND_CLASSIC diff --git a/drivers/video/bwtwofb.c b/drivers/video/bwtwofb.c index cd79cd7d2b3b..f8c0ea52d642 100644 --- a/drivers/video/bwtwofb.c +++ 
b/drivers/video/bwtwofb.c @@ -1,4 +1,4 @@ -/* $Id: bwtwofb.c,v 1.11 1999/11/19 09:56:54 davem Exp $ +/* $Id: bwtwofb.c,v 1.12 2000/01/21 03:57:05 anton Exp $ * bwtwofb.c: BWtwo frame buffer driver * * Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz) @@ -172,7 +172,7 @@ char __init *bwtwofb_init(struct fb_info_sbusfb *fb) #ifdef CONFIG_SUN4 res.start = phys; res.end = res.start + BWTWO_REGISTER_OFFSET + sizeof(struct bw2_regs) - 1; - res.flags = IORESOURE_IO | (fb->iospace & 0xff); + res.flags = IORESOURCE_IO | (fb->iospace & 0xff); resp = &res; #else resp = &fb->sbdp->resource[0]; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 57a0be62d334..08136962a305 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -345,18 +345,20 @@ static inline void ext2_set_de_type(struct super_block *sb, umode_t mode) { if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) return; - if (S_ISCHR(mode)) + if (S_ISREG(mode)) + de->file_type = EXT2_FT_REG_FILE; + else if (S_ISDIR(mode)) + de->file_type = EXT2_FT_DIR; + else if (S_ISLNK(mode)) + de->file_type = EXT2_FT_SYMLINK; + else if (S_ISSOCK(mode)) + de->file_type = EXT2_FT_SOCK; + else if (S_ISFIFO(mode)) + de->file_type = EXT2_FT_FIFO; + else if (S_ISCHR(mode)) de->file_type = EXT2_FT_CHRDEV; else if (S_ISBLK(mode)) de->file_type = EXT2_FT_BLKDEV; - else if (S_ISFIFO(mode)) - de->file_type = EXT2_FT_FIFO; - else if (S_ISLNK(mode)) - de->file_type = EXT2_FT_SYMLINK; - else if (S_ISREG(mode)) - de->file_type = EXT2_FT_REG_FILE; - else if (S_ISDIR(mode)) - de->file_type = EXT2_FT_DIR; } /* diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index 8ae97eca0ffd..db9c1ae391b9 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -44,15 +44,15 @@ int sgi_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector, in struct sgi_partition *p; if(!(bh = bread(dev, 0, get_ptable_blocksize(dev)))) { - printk("Dev %s: unable to read partition table\n", kdevname(dev)); + printk(KERN_WARNING "Dev %s: 
unable to read partition table\n", kdevname(dev)); return -1; } label = (struct sgi_disklabel *) bh->b_data; p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - printk("Dev %s SGI disklabel: bad magic %08x\n", - kdevname(dev), magic); + /*printk("Dev %s SGI disklabel: bad magic %08x\n", + kdevname(dev), magic);*/ brelse(bh); return 0; } @@ -62,7 +62,7 @@ int sgi_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector, in csum += be32_to_cpu(cs); } if(csum) { - printk("Dev %s SGI disklabel: csum bad, label corrupted\n", + printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", kdevname(dev)); brelse(bh); return 0; diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index 5d82a70c6f0d..5f31cde4bccc 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -48,15 +48,15 @@ int sun_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector, in unsigned long spc; if(!(bh = bread(dev, 0, get_ptable_blocksize(dev)))) { - printk("Dev %s: unable to read partition table\n", + printk(KERN_WARNING "Dev %s: unable to read partition table\n", kdevname(dev)); return -1; } label = (struct sun_disklabel *) bh->b_data; p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { - printk("Dev %s Sun disklabel: bad magic %04x\n", - kdevname(dev), be16_to_cpu(label->magic)); +/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", + kdevname(dev), be16_to_cpu(label->magic)); */ brelse(bh); return 0; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index b8a94e8f57ea..90c3def19ac0 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -213,8 +213,9 @@ udf_get_fileident(void * buffer, int bufsize, int * offset) #ifdef __KERNEL__ udf_debug("0x%x != TID_FILE_IDENT_DESC\n", le16_to_cpu(fi->descTag.tagIdent)); - udf_debug("offset: %u sizeof: %u bufsize: %u\n", - *offset, sizeof(struct FileIdentDesc), bufsize); + udf_debug("offset: %u sizeof: %lu bufsize: %u\n", + 
*offset, (unsigned long)sizeof(struct FileIdentDesc), + bufsize); #endif return NULL; } diff --git a/fs/udf/super.c b/fs/udf/super.c index aba702b57f89..48cd0adddf51 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -194,8 +194,8 @@ int __init init_udf_fs(void) if ( size < sizeof(struct udf_sb_info) ) { printk(KERN_ERR "udf: Danger! Kernel was compiled without enough room for udf_sb_info\n"); - printk(KERN_ERR "udf: Kernel has room for %u bytes, udf needs %u\n", - size, sizeof(struct udf_sb_info)); + printk(KERN_ERR "udf: Kernel has room for %u bytes, udf needs %lu\n", + size, (unsigned long)sizeof(struct udf_sb_info)); return 0; } } diff --git a/include/asm-sparc/asm_offsets.h b/include/asm-sparc/asm_offsets.h index 58f26364cd22..15f2d5093a19 100644 --- a/include/asm-sparc/asm_offsets.h +++ b/include/asm-sparc/asm_offsets.h @@ -159,32 +159,32 @@ #define AOFF_task_semsleeping 0x00000238 #define ASIZ_task_semsleeping 0x00000004 #define AOFF_task_thread 0x00000240 -#define ASIZ_task_thread 0x00000388 -#define AOFF_task_fs 0x000005c8 +#define ASIZ_task_thread 0x00000380 +#define AOFF_task_fs 0x000005c0 #define ASIZ_task_fs 0x00000004 -#define AOFF_task_files 0x000005cc +#define AOFF_task_files 0x000005c4 #define ASIZ_task_files 0x00000004 -#define AOFF_task_sigmask_lock 0x000005d0 +#define AOFF_task_sigmask_lock 0x000005c8 #define ASIZ_task_sigmask_lock 0x00000000 -#define AOFF_task_sig 0x000005d0 +#define AOFF_task_sig 0x000005c8 #define ASIZ_task_sig 0x00000004 -#define AOFF_task_signal 0x000005d4 +#define AOFF_task_signal 0x000005cc #define ASIZ_task_signal 0x00000008 -#define AOFF_task_blocked 0x000005dc +#define AOFF_task_blocked 0x000005d4 #define ASIZ_task_blocked 0x00000008 -#define AOFF_task_sigqueue 0x000005e4 +#define AOFF_task_sigqueue 0x000005dc #define ASIZ_task_sigqueue 0x00000004 -#define AOFF_task_sigqueue_tail 0x000005e8 +#define AOFF_task_sigqueue_tail 0x000005e0 #define ASIZ_task_sigqueue_tail 0x00000004 -#define AOFF_task_sas_ss_sp 0x000005ec 
+#define AOFF_task_sas_ss_sp 0x000005e4 #define ASIZ_task_sas_ss_sp 0x00000004 -#define AOFF_task_sas_ss_size 0x000005f0 +#define AOFF_task_sas_ss_size 0x000005e8 #define ASIZ_task_sas_ss_size 0x00000004 -#define AOFF_task_parent_exec_id 0x000005f4 +#define AOFF_task_parent_exec_id 0x000005ec #define ASIZ_task_parent_exec_id 0x00000004 -#define AOFF_task_self_exec_id 0x000005f8 +#define AOFF_task_self_exec_id 0x000005f0 #define ASIZ_task_self_exec_id 0x00000004 -#define AOFF_task_exit_sem 0x000005fc +#define AOFF_task_exit_sem 0x000005f4 #define ASIZ_task_exit_sem 0x0000001c #define AOFF_mm_mmap 0x00000000 #define ASIZ_mm_mmap 0x00000004 @@ -248,45 +248,41 @@ #define ASIZ_thread_uwinmask 0x00000004 #define AOFF_thread_kregs 0x00000004 #define ASIZ_thread_kregs 0x00000004 -#define AOFF_thread_sig_address 0x00000008 -#define ASIZ_thread_sig_address 0x00000004 -#define AOFF_thread_sig_desc 0x0000000c -#define ASIZ_thread_sig_desc 0x00000004 -#define AOFF_thread_ksp 0x00000010 +#define AOFF_thread_ksp 0x00000008 #define ASIZ_thread_ksp 0x00000004 -#define AOFF_thread_kpc 0x00000014 +#define AOFF_thread_kpc 0x0000000c #define ASIZ_thread_kpc 0x00000004 -#define AOFF_thread_kpsr 0x00000018 +#define AOFF_thread_kpsr 0x00000010 #define ASIZ_thread_kpsr 0x00000004 -#define AOFF_thread_kwim 0x0000001c +#define AOFF_thread_kwim 0x00000014 #define ASIZ_thread_kwim 0x00000004 -#define AOFF_thread_fork_kpsr 0x00000020 +#define AOFF_thread_fork_kpsr 0x00000018 #define ASIZ_thread_fork_kpsr 0x00000004 -#define AOFF_thread_fork_kwim 0x00000024 +#define AOFF_thread_fork_kwim 0x0000001c #define ASIZ_thread_fork_kwim 0x00000004 -#define AOFF_thread_reg_window 0x00000028 +#define AOFF_thread_reg_window 0x00000020 #define ASIZ_thread_reg_window 0x00000200 -#define AOFF_thread_rwbuf_stkptrs 0x00000228 +#define AOFF_thread_rwbuf_stkptrs 0x00000220 #define ASIZ_thread_rwbuf_stkptrs 0x00000020 -#define AOFF_thread_w_saved 0x00000248 +#define AOFF_thread_w_saved 0x00000240 #define 
ASIZ_thread_w_saved 0x00000004 -#define AOFF_thread_float_regs 0x00000250 +#define AOFF_thread_float_regs 0x00000248 #define ASIZ_thread_float_regs 0x00000080 -#define AOFF_thread_fsr 0x000002d0 +#define AOFF_thread_fsr 0x000002c8 #define ASIZ_thread_fsr 0x00000004 -#define AOFF_thread_fpqdepth 0x000002d4 +#define AOFF_thread_fpqdepth 0x000002cc #define ASIZ_thread_fpqdepth 0x00000004 -#define AOFF_thread_fpqueue 0x000002d8 +#define AOFF_thread_fpqueue 0x000002d0 #define ASIZ_thread_fpqueue 0x00000080 -#define AOFF_thread_flags 0x00000358 +#define AOFF_thread_flags 0x00000350 #define ASIZ_thread_flags 0x00000004 -#define AOFF_thread_current_ds 0x0000035c +#define AOFF_thread_current_ds 0x00000354 #define ASIZ_thread_current_ds 0x00000004 -#define AOFF_thread_core_exec 0x00000360 +#define AOFF_thread_core_exec 0x00000358 #define ASIZ_thread_core_exec 0x00000020 -#define AOFF_thread_new_signal 0x00000380 +#define AOFF_thread_new_signal 0x00000378 #define ASIZ_thread_new_signal 0x00000004 -#define AOFF_thread_refcount 0x00000384 +#define AOFF_thread_refcount 0x0000037c #define ASIZ_thread_refcount 0x00000004 #else /* CONFIG_SMP */ @@ -444,32 +440,32 @@ #define AOFF_task_semsleeping 0x00000338 #define ASIZ_task_semsleeping 0x00000004 #define AOFF_task_thread 0x00000340 -#define ASIZ_task_thread 0x00000388 -#define AOFF_task_fs 0x000006c8 +#define ASIZ_task_thread 0x00000380 +#define AOFF_task_fs 0x000006c0 #define ASIZ_task_fs 0x00000004 -#define AOFF_task_files 0x000006cc +#define AOFF_task_files 0x000006c4 #define ASIZ_task_files 0x00000004 -#define AOFF_task_sigmask_lock 0x000006d0 +#define AOFF_task_sigmask_lock 0x000006c8 #define ASIZ_task_sigmask_lock 0x00000008 -#define AOFF_task_sig 0x000006d8 +#define AOFF_task_sig 0x000006d0 #define ASIZ_task_sig 0x00000004 -#define AOFF_task_signal 0x000006dc +#define AOFF_task_signal 0x000006d4 #define ASIZ_task_signal 0x00000008 -#define AOFF_task_blocked 0x000006e4 +#define AOFF_task_blocked 0x000006dc #define 
ASIZ_task_blocked 0x00000008 -#define AOFF_task_sigqueue 0x000006ec +#define AOFF_task_sigqueue 0x000006e4 #define ASIZ_task_sigqueue 0x00000004 -#define AOFF_task_sigqueue_tail 0x000006f0 +#define AOFF_task_sigqueue_tail 0x000006e8 #define ASIZ_task_sigqueue_tail 0x00000004 -#define AOFF_task_sas_ss_sp 0x000006f4 +#define AOFF_task_sas_ss_sp 0x000006ec #define ASIZ_task_sas_ss_sp 0x00000004 -#define AOFF_task_sas_ss_size 0x000006f8 +#define AOFF_task_sas_ss_size 0x000006f0 #define ASIZ_task_sas_ss_size 0x00000004 -#define AOFF_task_parent_exec_id 0x000006fc +#define AOFF_task_parent_exec_id 0x000006f4 #define ASIZ_task_parent_exec_id 0x00000004 -#define AOFF_task_self_exec_id 0x00000700 +#define AOFF_task_self_exec_id 0x000006f8 #define ASIZ_task_self_exec_id 0x00000004 -#define AOFF_task_exit_sem 0x00000704 +#define AOFF_task_exit_sem 0x000006fc #define ASIZ_task_exit_sem 0x00000024 #define AOFF_mm_mmap 0x00000000 #define ASIZ_mm_mmap 0x00000004 @@ -533,45 +529,41 @@ #define ASIZ_thread_uwinmask 0x00000004 #define AOFF_thread_kregs 0x00000004 #define ASIZ_thread_kregs 0x00000004 -#define AOFF_thread_sig_address 0x00000008 -#define ASIZ_thread_sig_address 0x00000004 -#define AOFF_thread_sig_desc 0x0000000c -#define ASIZ_thread_sig_desc 0x00000004 -#define AOFF_thread_ksp 0x00000010 +#define AOFF_thread_ksp 0x00000008 #define ASIZ_thread_ksp 0x00000004 -#define AOFF_thread_kpc 0x00000014 +#define AOFF_thread_kpc 0x0000000c #define ASIZ_thread_kpc 0x00000004 -#define AOFF_thread_kpsr 0x00000018 +#define AOFF_thread_kpsr 0x00000010 #define ASIZ_thread_kpsr 0x00000004 -#define AOFF_thread_kwim 0x0000001c +#define AOFF_thread_kwim 0x00000014 #define ASIZ_thread_kwim 0x00000004 -#define AOFF_thread_fork_kpsr 0x00000020 +#define AOFF_thread_fork_kpsr 0x00000018 #define ASIZ_thread_fork_kpsr 0x00000004 -#define AOFF_thread_fork_kwim 0x00000024 +#define AOFF_thread_fork_kwim 0x0000001c #define ASIZ_thread_fork_kwim 0x00000004 -#define AOFF_thread_reg_window 0x00000028 
+#define AOFF_thread_reg_window 0x00000020 #define ASIZ_thread_reg_window 0x00000200 -#define AOFF_thread_rwbuf_stkptrs 0x00000228 +#define AOFF_thread_rwbuf_stkptrs 0x00000220 #define ASIZ_thread_rwbuf_stkptrs 0x00000020 -#define AOFF_thread_w_saved 0x00000248 +#define AOFF_thread_w_saved 0x00000240 #define ASIZ_thread_w_saved 0x00000004 -#define AOFF_thread_float_regs 0x00000250 +#define AOFF_thread_float_regs 0x00000248 #define ASIZ_thread_float_regs 0x00000080 -#define AOFF_thread_fsr 0x000002d0 +#define AOFF_thread_fsr 0x000002c8 #define ASIZ_thread_fsr 0x00000004 -#define AOFF_thread_fpqdepth 0x000002d4 +#define AOFF_thread_fpqdepth 0x000002cc #define ASIZ_thread_fpqdepth 0x00000004 -#define AOFF_thread_fpqueue 0x000002d8 +#define AOFF_thread_fpqueue 0x000002d0 #define ASIZ_thread_fpqueue 0x00000080 -#define AOFF_thread_flags 0x00000358 +#define AOFF_thread_flags 0x00000350 #define ASIZ_thread_flags 0x00000004 -#define AOFF_thread_current_ds 0x0000035c +#define AOFF_thread_current_ds 0x00000354 #define ASIZ_thread_current_ds 0x00000004 -#define AOFF_thread_core_exec 0x00000360 +#define AOFF_thread_core_exec 0x00000358 #define ASIZ_thread_core_exec 0x00000020 -#define AOFF_thread_new_signal 0x00000380 +#define AOFF_thread_new_signal 0x00000378 #define ASIZ_thread_new_signal 0x00000004 -#define AOFF_thread_refcount 0x00000384 +#define AOFF_thread_refcount 0x0000037c #define ASIZ_thread_refcount 0x00000004 #endif /* CONFIG_SMP */ diff --git a/include/asm-sparc/hdreg.h b/include/asm-sparc/hdreg.h new file mode 100644 index 000000000000..1c321c3e7d2c --- /dev/null +++ b/include/asm-sparc/hdreg.h @@ -0,0 +1,13 @@ +/* $Id: hdreg.h,v 1.1 2000/01/21 04:56:27 zaitcev Exp $ + * hdreg.h: SPARC PCI specific IDE glue. + * + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1998 Eddie C. 
Dost (ecd@skynet.be) + */ + +#ifndef __SPARC_HDREG_H +#define __SPARC_HDREG_H + +typedef unsigned int ide_ioreg_t; + +#endif /* __SPARC_HDREG_H */ diff --git a/include/asm-sparc/ide.h b/include/asm-sparc/ide.h new file mode 100644 index 000000000000..bec4233e6289 --- /dev/null +++ b/include/asm-sparc/ide.h @@ -0,0 +1,289 @@ +/* $Id: ide.h,v 1.2 2000/01/21 04:56:27 zaitcev Exp $ + * ide.h: SPARC PCI specific IDE glue. + * + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Adaptation from sparc64 version to sparc by Pete Zaitcev. + */ + +#ifndef _SPARC_IDE_H +#define _SPARC_IDE_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +#undef MAX_HWIFS +#define MAX_HWIFS 2 + +#define ide__sti() __sti() + +static __inline__ int ide_default_irq(ide_ioreg_t base) +{ + return 0; +} + +static __inline__ ide_ioreg_t ide_default_io_base(int index) +{ + return 0; +} + +/* + * Doing any sort of ioremap() here does not work + * because this function may be called with null arguments. + */ +static __inline__ void ide_init_hwif_ports(hw_regs_t *hw, ide_ioreg_t data_port, ide_ioreg_t ctrl_port, int *irq) +{ + ide_ioreg_t reg = data_port; + int i; + + for (i = IDE_DATA_OFFSET; i <= IDE_STATUS_OFFSET; i++) { + hw->io_ports[i] = reg; + reg += 1; + } + if (ctrl_port) { + hw->io_ports[IDE_CONTROL_OFFSET] = ctrl_port; + } else { + hw->io_ports[IDE_CONTROL_OFFSET] = 0; + } + if (irq != NULL) + *irq = 0; +} + +/* + * This registers the standard ports for this architecture with the IDE + * driver.
+ */ +static __inline__ void ide_init_default_hwifs(void) +{ +#ifdef __DO_I_NEED_THIS + hw_regs_t hw; + int index; + + for (index = 0; index < MAX_HWIFS; index++) { + ide_init_hwif_ports(&hw, ide_default_io_base(index), 0, 0); + hw.irq = ide_default_irq(ide_default_io_base(index)); + ide_register_hw(&hw, NULL); + } +#endif /* __DO_I_NEED_THIS */ +} + +typedef union { + unsigned int all : 8; /* all of the bits together */ + struct { + unsigned int bit7 : 1; + unsigned int lba : 1; + unsigned int bit5 : 1; + unsigned int unit : 1; + unsigned int head : 4; + } b; +} select_t; + +static __inline__ int ide_request_irq(unsigned int irq, + void (*handler)(int, void *, struct pt_regs *), + unsigned long flags, const char *name, void *devid) +{ + return request_irq(irq, handler, SA_SHIRQ, name, devid); +} + +static __inline__ void ide_free_irq(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); +} + +static __inline__ int ide_check_region(ide_ioreg_t base, unsigned int size) +{ + /* We leave these empty because pcic.c calls sparc_alloc_io() */ + return 0; +} + +static __inline__ void ide_request_region(ide_ioreg_t base, unsigned int size, + const char *name) +{ +} + +static __inline__ void ide_release_region(ide_ioreg_t base, unsigned int size) +{ +} + +#undef SUPPORT_SLOW_DATA_PORTS +#define SUPPORT_SLOW_DATA_PORTS 0 + +#undef SUPPORT_VLB_SYNC +#define SUPPORT_VLB_SYNC 0 + +#undef HD_DATA +#define HD_DATA ((ide_ioreg_t)0) + +/* From m68k code... 
*/ + +#ifdef insl +#undef insl +#endif +#ifdef outsl +#undef outsl +#endif +#ifdef insw +#undef insw +#endif +#ifdef outsw +#undef outsw +#endif + +#define insl(data_reg, buffer, wcount) insw(data_reg, buffer, (wcount)<<1) +#define outsl(data_reg, buffer, wcount) outsw(data_reg, buffer, (wcount)<<1) + +#define insw(port, buf, nr) ide_insw((port), (buf), (nr)) +#define outsw(port, buf, nr) ide_outsw((port), (buf), (nr)) + +static __inline__ void ide_insw(unsigned long port, + void *dst, + unsigned long count) +{ + volatile unsigned short *data_port; + /* unsigned long end = (unsigned long)dst + (count << 1); */ /* P3 */ + u16 *ps = dst; + u32 *pi; + + data_port = (volatile unsigned short *)port; + + if(((unsigned long)ps) & 0x2) { + *ps++ = *data_port; + count--; + } + pi = (u32 *)ps; + while(count >= 2) { + u32 w; + + w = (*data_port) << 16; + w |= (*data_port); + *pi++ = w; + count -= 2; + } + ps = (u16 *)pi; + if(count) + *ps++ = *data_port; + + /* __flush_dcache_range((unsigned long)dst, end); */ /* P3 see hme */ +} + +static __inline__ void ide_outsw(unsigned long port, + const void *src, + unsigned long count) +{ + volatile unsigned short *data_port; + /* unsigned long end = (unsigned long)src + (count << 1); */ + const u16 *ps = src; + const u32 *pi; + + data_port = (volatile unsigned short *)port; + + if(((unsigned long)src) & 0x2) { + *data_port = *ps++; + count--; + } + pi = (const u32 *)ps; + while(count >= 2) { + u32 w; + + w = *pi++; + *data_port = (w >> 16); + *data_port = w; + count -= 2; + } + ps = (const u16 *)pi; + if(count) + *data_port = *ps; + + /* __flush_dcache_range((unsigned long)src, end); */ /* P3 see hme */ +} + +#define T_CHAR (0x0000) /* char: don't touch */ +#define T_SHORT (0x4000) /* short: 12 -> 21 */ +#define T_INT (0x8000) /* int: 1234 -> 4321 */ +#define T_TEXT (0xc000) /* text: 12 -> 21 */ + +#define T_MASK_TYPE (0xc000) +#define T_MASK_COUNT (0x3fff) + +#define D_CHAR(cnt) (T_CHAR | (cnt)) +#define D_SHORT(cnt) (T_SHORT | 
(cnt)) +#define D_INT(cnt) (T_INT | (cnt)) +#define D_TEXT(cnt) (T_TEXT | (cnt)) + +static u_short driveid_types[] = { /* each entry: swap type in bits 15-14, count in bits 13-0; the entries cover 512 bytes in total */ + D_SHORT(10), /* config - vendor2 */ + D_TEXT(20), /* serial_no */ + D_SHORT(3), /* buf_type - ecc_bytes */ + D_TEXT(48), /* fw_rev - model */ + D_CHAR(2), /* max_multsect - vendor3 */ + D_SHORT(1), /* dword_io */ + D_CHAR(2), /* vendor4 - capability */ + D_SHORT(1), /* reserved50 */ + D_CHAR(4), /* vendor5 - tDMA */ + D_SHORT(4), /* field_valid - cur_sectors */ + D_INT(1), /* cur_capacity */ + D_CHAR(2), /* multsect - multsect_valid */ + D_INT(1), /* lba_capacity */ + D_SHORT(194) /* dma_1word - reservedyy */ +}; + +#define num_driveid_types (sizeof(driveid_types)/sizeof(*driveid_types)) /* number of entries in the table */ + +static __inline__ void ide_fix_driveid(struct hd_driveid *id) +{ /* byte-swaps the buffer at id in place, region by region, as described by driveid_types[] */ + u_char *p = (u_char *)id; + int i, j, cnt; + u_char t; + + for (i = 0; i < num_driveid_types; i++) { + cnt = driveid_types[i] & T_MASK_COUNT; + switch (driveid_types[i] & T_MASK_TYPE) { + case T_CHAR: /* skip cnt bytes unchanged */ + p += cnt; + break; + case T_SHORT: /* reverse each of cnt 2-byte units */ + for (j = 0; j < cnt; j++) { + t = p[0]; + p[0] = p[1]; + p[1] = t; + p += 2; + } + break; + case T_INT: /* reverse each of cnt 4-byte units */ + for (j = 0; j < cnt; j++) { + t = p[0]; + p[0] = p[3]; + p[3] = t; + t = p[1]; + p[1] = p[2]; + p[2] = t; + p += 4; + } + break; + case T_TEXT: /* j advances by 2, so cnt counts bytes here: swap each 2-byte pair */ + for (j = 0; j < cnt; j += 2) { + t = p[0]; + p[0] = p[1]; + p[1] = t; + p += 2; + } + break; + }; + } +} + +/* + * The following are not needed for the non-m68k ports + */ +#define ide_ack_intr(hwif) (1) +/* #define ide_ack_intr(hwif) ((hwif)->hw.ack_intr ?
(hwif)->hw.ack_intr(hwif) : 1) */ +#define ide_release_lock(lock) do {} while (0) +#define ide_get_lock(lock, hdlr, data) do {} while (0) + +#endif /* __KERNEL__ */ + +#endif /* _SPARC_IDE_H */ diff --git a/include/asm-sparc/processor.h b/include/asm-sparc/processor.h index a74493500b35..278c15c801f9 100644 --- a/include/asm-sparc/processor.h +++ b/include/asm-sparc/processor.h @@ -1,4 +1,4 @@ -/* $Id: processor.h,v 1.76 2000/01/09 09:13:38 anton Exp $ +/* $Id: processor.h,v 1.77 2000/01/21 11:39:17 jj Exp $ * include/asm-sparc/processor.h * * Copyright (C) 1994 David S. Miller (davem@caip.rutgers.edu) @@ -58,10 +58,6 @@ struct thread_struct { unsigned long uwinmask __attribute__ ((aligned (8))); struct pt_regs *kregs; - /* For signal handling */ - unsigned long sig_address __attribute__ ((aligned (8))); - unsigned long sig_desc; - /* Context switch saved kernel state. */ unsigned long ksp __attribute__ ((aligned (8))); unsigned long kpc; @@ -99,8 +95,8 @@ struct thread_struct { NULL, __pgprot(0x0) , VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } #define INIT_THREAD { \ -/* uwinmask, kregs, sig_address, sig_desc, ksp, kpc, kpsr, kwim */ \ - 0, 0, 0, 0, 0, 0, 0, 0, \ +/* uwinmask, kregs, ksp, kpc, kpsr, kwim */ \ + 0, 0, 0, 0, 0, 0, \ /* fork_kpsr, fork_kwim */ \ 0, 0, \ /* reg_window */ \ diff --git a/include/asm-sparc/sembuf.h b/include/asm-sparc/sembuf.h index 47b2ef9bcd95..a79c4bb3c08a 100644 --- a/include/asm-sparc/sembuf.h +++ b/include/asm-sparc/sembuf.h @@ -8,7 +8,7 @@ * * Pad space is left for: * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 64-bit values + * - 2 miscellaneous 32-bit values */ struct semid64_ds { diff --git a/include/asm-sparc/siginfo.h b/include/asm-sparc/siginfo.h index 2baed407f7b5..854b3bce1625 100644 --- a/include/asm-sparc/siginfo.h +++ b/include/asm-sparc/siginfo.h @@ -1,4 +1,4 @@ -/* $Id: siginfo.h,v 1.5 1999/07/29 12:56:57 jj Exp $ +/* $Id: siginfo.h,v 1.6 2000/01/21 11:39:17 jj Exp $ * siginfo.c: */ @@ -81,6 +81,11 
@@ typedef struct siginfo { #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd +#ifdef __KERNEL__ +#define __SI_MASK 0 +#define __SI_FAULT 0 +#endif + /* * si_code values * Digital reserves positive values for kernel-generated signals. @@ -127,7 +132,7 @@ typedef struct siginfo { * SIGSEGV si_codes */ #define SEGV_MAPERR 1 /* address not mapped to object */ -#define SRGV_ACCERR 2 /* invalid permissions for mapped object */ +#define SEGV_ACCERR 2 /* invalid permissions for mapped object */ #define NSIGSEGV 2 /* diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h index 7c69f57c26e6..ca4604fe72f2 100644 --- a/include/asm-sparc/smp.h +++ b/include/asm-sparc/smp.h @@ -92,7 +92,7 @@ extern __inline__ void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg unsigned long arg3, unsigned long arg4, unsigned long arg5) { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); } -extern __volatile__ int cpu_number_map[NR_CPUS]; +extern __volatile__ int __cpu_number_map[NR_CPUS]; extern __volatile__ int __cpu_logical_map[NR_CPUS]; extern unsigned long smp_proc_in_lock[NR_CPUS]; @@ -100,6 +100,10 @@ extern __inline__ int cpu_logical_map(int cpu) { return __cpu_logical_map[cpu]; } +extern __inline__ int cpu_number_map(int cpu) +{ + return __cpu_number_map[cpu]; +} extern __inline__ int hard_smp4m_processor_id(void) { diff --git a/include/asm-sparc/stat.h b/include/asm-sparc/stat.h index dd266dc5de53..4b0dd7fa4d6b 100644 --- a/include/asm-sparc/stat.h +++ b/include/asm-sparc/stat.h @@ -1,4 +1,4 @@ -/* $Id: stat.h,v 1.10 1999/12/21 14:09:41 jj Exp $ */ +/* $Id: stat.h,v 1.11 2000/01/16 15:22:53 jj Exp $ */ #ifndef _SPARC_STAT_H #define _SPARC_STAT_H @@ -19,23 +19,23 @@ struct __old_kernel_stat { }; struct stat { - dev_t st_dev; - ino_t st_ino; - mode_t st_mode; - short st_nlink; - uid_t st_uid; - gid_t st_gid; - dev_t st_rdev; - off_t st_size; - time_t st_atime; - unsigned long __unused1; - time_t st_mtime; - unsigned long __unused2; - 
time_t st_ctime; - unsigned long __unused3; - off_t st_blksize; - off_t st_blocks; - unsigned long __unused4[2]; + unsigned short st_dev; + unsigned long st_ino; + unsigned short st_mode; + short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + long st_size; + long st_atime; + unsigned long __unused1; + long st_mtime; + unsigned long __unused2; + long st_ctime; + unsigned long __unused3; + long st_blksize; + long st_blocks; + unsigned long __unused4[2]; }; struct stat64 { diff --git a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h index f16345145ea7..c257415494ce 100644 --- a/include/asm-sparc/unistd.h +++ b/include/asm-sparc/unistd.h @@ -1,4 +1,4 @@ -/* $Id: unistd.h,v 1.63 2000/01/12 11:47:40 anton Exp $ */ +/* $Id: unistd.h,v 1.64 2000/01/16 06:20:32 davem Exp $ */ #ifndef _SPARC_UNISTD_H #define _SPARC_UNISTD_H diff --git a/include/asm-sparc64/asm_offsets.h b/include/asm-sparc64/asm_offsets.h index 07dfc1d30b00..895583bb824e 100644 --- a/include/asm-sparc64/asm_offsets.h +++ b/include/asm-sparc64/asm_offsets.h @@ -165,34 +165,34 @@ #define AOFF_task_semsleeping 0x00000378 #define ASIZ_task_semsleeping 0x00000008 #define AOFF_task_thread 0x00000380 -#define ASIZ_task_thread 0x00000460 -#define AOFF_task_fs 0x000007e0 +#define ASIZ_task_thread 0x00000450 +#define AOFF_task_fs 0x000007d0 #define ASIZ_task_fs 0x00000008 -#define AOFF_task_files 0x000007e8 +#define AOFF_task_files 0x000007d8 #define ASIZ_task_files 0x00000008 -#define AOFF_task_sigmask_lock 0x000007f0 +#define AOFF_task_sigmask_lock 0x000007e0 #define ASIZ_task_sigmask_lock 0x00000000 -#define AOFF_task_sig 0x000007f0 +#define AOFF_task_sig 0x000007e0 #define ASIZ_task_sig 0x00000008 -#define AOFF_task_signal 0x000007f8 +#define AOFF_task_signal 0x000007e8 #define ASIZ_task_signal 0x00000008 -#define AOFF_task_blocked 0x00000800 +#define AOFF_task_blocked 0x000007f0 #define ASIZ_task_blocked 0x00000008 -#define AOFF_task_sigqueue 0x00000808 +#define 
AOFF_task_sigqueue 0x000007f8 #define ASIZ_task_sigqueue 0x00000008 -#define AOFF_task_sigqueue_tail 0x00000810 +#define AOFF_task_sigqueue_tail 0x00000800 #define ASIZ_task_sigqueue_tail 0x00000008 -#define AOFF_task_sas_ss_sp 0x00000818 +#define AOFF_task_sas_ss_sp 0x00000808 #define ASIZ_task_sas_ss_sp 0x00000008 -#define AOFF_task_sas_ss_size 0x00000820 +#define AOFF_task_sas_ss_size 0x00000810 #define ASIZ_task_sas_ss_size 0x00000008 -#define AOFF_task_parent_exec_id 0x00000828 +#define AOFF_task_parent_exec_id 0x00000818 #define ASIZ_task_parent_exec_id 0x00000004 -#define AOFF_task_self_exec_id 0x0000082c +#define AOFF_task_self_exec_id 0x0000081c #define ASIZ_task_self_exec_id 0x00000004 -#define AOFF_task_exit_sem 0x00000830 +#define AOFF_task_exit_sem 0x00000820 #define ASIZ_task_exit_sem 0x00000030 -#define ASIZ_task 0x00000860 +#define ASIZ_task 0x00000850 #define AOFF_mm_mmap 0x00000000 #define ASIZ_mm_mmap 0x00000008 #define AOFF_mm_mmap_avl 0x00000008 @@ -278,29 +278,23 @@ #define ASIZ_thread_gsr 0x00000007 #define AOFF_thread___pad2 0x0000002f #define ASIZ_thread___pad2 0x00000001 -#define AOFF_thread_sig_address 0x00000030 -#define ASIZ_thread_sig_address 0x00000008 -#define AOFF_thread_sig_desc 0x00000038 -#define ASIZ_thread_sig_desc 0x00000008 -#define AOFF_thread_xfsr 0x00000040 +#define AOFF_thread_xfsr 0x00000030 #define ASIZ_thread_xfsr 0x00000038 -#define AOFF_thread___pad3 0x00000078 -#define ASIZ_thread___pad3 0x00000008 -#define AOFF_thread_reg_window 0x00000080 +#define AOFF_thread_reg_window 0x00000068 #define ASIZ_thread_reg_window 0x00000380 -#define AOFF_thread_rwbuf_stkptrs 0x00000400 +#define AOFF_thread_rwbuf_stkptrs 0x000003e8 #define ASIZ_thread_rwbuf_stkptrs 0x00000038 -#define AOFF_thread_user_cntd0 0x00000438 +#define AOFF_thread_user_cntd0 0x00000420 #define ASIZ_thread_user_cntd0 0x00000008 -#define AOFF_thread_user_cntd1 0x00000440 +#define AOFF_thread_user_cntd1 0x00000428 #define ASIZ_thread_user_cntd1 0x00000008 
-#define AOFF_thread_kernel_cntd0 0x00000448 +#define AOFF_thread_kernel_cntd0 0x00000430 #define ASIZ_thread_kernel_cntd0 0x00000008 -#define AOFF_thread_kernel_cntd1 0x00000450 +#define AOFF_thread_kernel_cntd1 0x00000438 #define ASIZ_thread_kernel_cntd1 0x00000008 -#define AOFF_thread_pcr_reg 0x00000458 +#define AOFF_thread_pcr_reg 0x00000440 #define ASIZ_thread_pcr_reg 0x00000008 -#define ASIZ_thread 0x00000460 +#define ASIZ_thread 0x00000450 #else /* CONFIG_SMP */ @@ -459,34 +453,34 @@ #define AOFF_task_semsleeping 0x00000570 #define ASIZ_task_semsleeping 0x00000008 #define AOFF_task_thread 0x00000580 -#define ASIZ_task_thread 0x00000460 -#define AOFF_task_fs 0x000009e0 +#define ASIZ_task_thread 0x00000450 +#define AOFF_task_fs 0x000009d0 #define ASIZ_task_fs 0x00000008 -#define AOFF_task_files 0x000009e8 +#define AOFF_task_files 0x000009d8 #define ASIZ_task_files 0x00000008 -#define AOFF_task_sigmask_lock 0x000009f0 +#define AOFF_task_sigmask_lock 0x000009e0 #define ASIZ_task_sigmask_lock 0x00000001 -#define AOFF_task_sig 0x000009f8 +#define AOFF_task_sig 0x000009e8 #define ASIZ_task_sig 0x00000008 -#define AOFF_task_signal 0x00000a00 +#define AOFF_task_signal 0x000009f0 #define ASIZ_task_signal 0x00000008 -#define AOFF_task_blocked 0x00000a08 +#define AOFF_task_blocked 0x000009f8 #define ASIZ_task_blocked 0x00000008 -#define AOFF_task_sigqueue 0x00000a10 +#define AOFF_task_sigqueue 0x00000a00 #define ASIZ_task_sigqueue 0x00000008 -#define AOFF_task_sigqueue_tail 0x00000a18 +#define AOFF_task_sigqueue_tail 0x00000a08 #define ASIZ_task_sigqueue_tail 0x00000008 -#define AOFF_task_sas_ss_sp 0x00000a20 +#define AOFF_task_sas_ss_sp 0x00000a10 #define ASIZ_task_sas_ss_sp 0x00000008 -#define AOFF_task_sas_ss_size 0x00000a28 +#define AOFF_task_sas_ss_size 0x00000a18 #define ASIZ_task_sas_ss_size 0x00000008 -#define AOFF_task_parent_exec_id 0x00000a30 +#define AOFF_task_parent_exec_id 0x00000a20 #define ASIZ_task_parent_exec_id 0x00000004 -#define 
AOFF_task_self_exec_id 0x00000a34 +#define AOFF_task_self_exec_id 0x00000a24 #define ASIZ_task_self_exec_id 0x00000004 -#define AOFF_task_exit_sem 0x00000a38 +#define AOFF_task_exit_sem 0x00000a28 #define ASIZ_task_exit_sem 0x00000038 -#define ASIZ_task 0x00000a70 +#define ASIZ_task 0x00000a60 #define AOFF_mm_mmap 0x00000000 #define ASIZ_mm_mmap 0x00000008 #define AOFF_mm_mmap_avl 0x00000008 @@ -572,29 +566,23 @@ #define ASIZ_thread_gsr 0x00000007 #define AOFF_thread___pad2 0x0000002f #define ASIZ_thread___pad2 0x00000001 -#define AOFF_thread_sig_address 0x00000030 -#define ASIZ_thread_sig_address 0x00000008 -#define AOFF_thread_sig_desc 0x00000038 -#define ASIZ_thread_sig_desc 0x00000008 -#define AOFF_thread_xfsr 0x00000040 +#define AOFF_thread_xfsr 0x00000030 #define ASIZ_thread_xfsr 0x00000038 -#define AOFF_thread___pad3 0x00000078 -#define ASIZ_thread___pad3 0x00000008 -#define AOFF_thread_reg_window 0x00000080 +#define AOFF_thread_reg_window 0x00000068 #define ASIZ_thread_reg_window 0x00000380 -#define AOFF_thread_rwbuf_stkptrs 0x00000400 +#define AOFF_thread_rwbuf_stkptrs 0x000003e8 #define ASIZ_thread_rwbuf_stkptrs 0x00000038 -#define AOFF_thread_user_cntd0 0x00000438 +#define AOFF_thread_user_cntd0 0x00000420 #define ASIZ_thread_user_cntd0 0x00000008 -#define AOFF_thread_user_cntd1 0x00000440 +#define AOFF_thread_user_cntd1 0x00000428 #define ASIZ_thread_user_cntd1 0x00000008 -#define AOFF_thread_kernel_cntd0 0x00000448 +#define AOFF_thread_kernel_cntd0 0x00000430 #define ASIZ_thread_kernel_cntd0 0x00000008 -#define AOFF_thread_kernel_cntd1 0x00000450 +#define AOFF_thread_kernel_cntd1 0x00000438 #define ASIZ_thread_kernel_cntd1 0x00000008 -#define AOFF_thread_pcr_reg 0x00000458 +#define AOFF_thread_pcr_reg 0x00000440 #define ASIZ_thread_pcr_reg 0x00000008 -#define ASIZ_thread 0x00000460 +#define ASIZ_thread 0x00000450 #else /* SPIN_LOCK_DEBUG */ @@ -751,34 +739,34 @@ #define AOFF_task_semsleeping 0x00000578 #define ASIZ_task_semsleeping 0x00000008 #define 
AOFF_task_thread 0x00000580 -#define ASIZ_task_thread 0x00000460 -#define AOFF_task_fs 0x000009e0 +#define ASIZ_task_thread 0x00000450 +#define AOFF_task_fs 0x000009d0 #define ASIZ_task_fs 0x00000008 -#define AOFF_task_files 0x000009e8 +#define AOFF_task_files 0x000009d8 #define ASIZ_task_files 0x00000008 -#define AOFF_task_sigmask_lock 0x000009f0 +#define AOFF_task_sigmask_lock 0x000009e0 #define ASIZ_task_sigmask_lock 0x0000000c -#define AOFF_task_sig 0x00000a00 +#define AOFF_task_sig 0x000009f0 #define ASIZ_task_sig 0x00000008 -#define AOFF_task_signal 0x00000a08 +#define AOFF_task_signal 0x000009f8 #define ASIZ_task_signal 0x00000008 -#define AOFF_task_blocked 0x00000a10 +#define AOFF_task_blocked 0x00000a00 #define ASIZ_task_blocked 0x00000008 -#define AOFF_task_sigqueue 0x00000a18 +#define AOFF_task_sigqueue 0x00000a08 #define ASIZ_task_sigqueue 0x00000008 -#define AOFF_task_sigqueue_tail 0x00000a20 +#define AOFF_task_sigqueue_tail 0x00000a10 #define ASIZ_task_sigqueue_tail 0x00000008 -#define AOFF_task_sas_ss_sp 0x00000a28 +#define AOFF_task_sas_ss_sp 0x00000a18 #define ASIZ_task_sas_ss_sp 0x00000008 -#define AOFF_task_sas_ss_size 0x00000a30 +#define AOFF_task_sas_ss_size 0x00000a20 #define ASIZ_task_sas_ss_size 0x00000008 -#define AOFF_task_parent_exec_id 0x00000a38 +#define AOFF_task_parent_exec_id 0x00000a28 #define ASIZ_task_parent_exec_id 0x00000004 -#define AOFF_task_self_exec_id 0x00000a3c +#define AOFF_task_self_exec_id 0x00000a2c #define ASIZ_task_self_exec_id 0x00000004 -#define AOFF_task_exit_sem 0x00000a40 +#define AOFF_task_exit_sem 0x00000a30 #define ASIZ_task_exit_sem 0x00000040 -#define ASIZ_task 0x00000a80 +#define ASIZ_task 0x00000a70 #define AOFF_mm_mmap 0x00000000 #define ASIZ_mm_mmap 0x00000008 #define AOFF_mm_mmap_avl 0x00000008 @@ -864,29 +852,23 @@ #define ASIZ_thread_gsr 0x00000007 #define AOFF_thread___pad2 0x0000002f #define ASIZ_thread___pad2 0x00000001 -#define AOFF_thread_sig_address 0x00000030 -#define ASIZ_thread_sig_address 
0x00000008 -#define AOFF_thread_sig_desc 0x00000038 -#define ASIZ_thread_sig_desc 0x00000008 -#define AOFF_thread_xfsr 0x00000040 +#define AOFF_thread_xfsr 0x00000030 #define ASIZ_thread_xfsr 0x00000038 -#define AOFF_thread___pad3 0x00000078 -#define ASIZ_thread___pad3 0x00000008 -#define AOFF_thread_reg_window 0x00000080 +#define AOFF_thread_reg_window 0x00000068 #define ASIZ_thread_reg_window 0x00000380 -#define AOFF_thread_rwbuf_stkptrs 0x00000400 +#define AOFF_thread_rwbuf_stkptrs 0x000003e8 #define ASIZ_thread_rwbuf_stkptrs 0x00000038 -#define AOFF_thread_user_cntd0 0x00000438 +#define AOFF_thread_user_cntd0 0x00000420 #define ASIZ_thread_user_cntd0 0x00000008 -#define AOFF_thread_user_cntd1 0x00000440 +#define AOFF_thread_user_cntd1 0x00000428 #define ASIZ_thread_user_cntd1 0x00000008 -#define AOFF_thread_kernel_cntd0 0x00000448 +#define AOFF_thread_kernel_cntd0 0x00000430 #define ASIZ_thread_kernel_cntd0 0x00000008 -#define AOFF_thread_kernel_cntd1 0x00000450 +#define AOFF_thread_kernel_cntd1 0x00000438 #define ASIZ_thread_kernel_cntd1 0x00000008 -#define AOFF_thread_pcr_reg 0x00000458 +#define AOFF_thread_pcr_reg 0x00000440 #define ASIZ_thread_pcr_reg 0x00000008 -#define ASIZ_thread 0x00000460 +#define ASIZ_thread 0x00000450 #endif /* SPIN_LOCK_DEBUG */ #endif /* CONFIG_SMP */ diff --git a/include/asm-sparc64/checksum.h b/include/asm-sparc64/checksum.h index 6f6f8fe06a66..b2c06823a466 100644 --- a/include/asm-sparc64/checksum.h +++ b/include/asm-sparc64/checksum.h @@ -1,4 +1,4 @@ -/* $Id: checksum.h,v 1.14 2000/01/05 21:27:42 davem Exp $ */ +/* $Id: checksum.h,v 1.15 2000/01/19 04:06:09 davem Exp $ */ #ifndef __SPARC64_CHECKSUM_H #define __SPARC64_CHECKSUM_H @@ -37,12 +37,6 @@ extern unsigned int csum_partial(const unsigned char * buff, int len, unsigned i * here even more important to align src and dst on a 32-bit (or even * better 64-bit) boundary */ -/* FIXME: Remove these macros ASAP */ -#define csum_partial_copy(src, dst, len, sum) \ - 
csum_partial_copy_nocheck(src,dst,len,sum) -#define csum_partial_copy_fromuser(s, d, l, w) \ - csum_partial_copy_from_user((char *) (s), (d), (l), (w), NULL) - extern unsigned int csum_partial_copy_sparc64(const char *src, char *dst, int len, unsigned int sum); extern __inline__ unsigned int @@ -66,15 +60,19 @@ csum_partial_copy_from_user(const char *src, char *dst, int len, return csum_partial_copy_sparc64(src, dst, len, sum); } -#if 0 -/* XXX should implement this now... -DaveM */ +/* + * Copy and checksum to user + */ +#define HAVE_CSUM_COPY_USER +extern unsigned int csum_partial_copy_user_sparc64(const char *src, char *dst, int len, unsigned int sum); extern __inline__ unsigned int -csum_partial_copy_to_user(const char *src, char *dst, int len, - unsigned int sum, int *err) +csum_and_copy_to_user(const char *src, char *dst, int len, + unsigned int sum, int *err) { - return 0; + __asm__ __volatile__ ("stx %0, [%%sp + 0x7ff + 128]" + : : "r" (err)); + return csum_partial_copy_user_sparc64(src, dst, len, sum); } -#endif /* ihl is always 5 or greater, almost always is 5, and iph is word aligned * the majority of the time. diff --git a/include/asm-sparc64/processor.h b/include/asm-sparc64/processor.h index 807dd0cf4a36..158fbbf39f1e 100644 --- a/include/asm-sparc64/processor.h +++ b/include/asm-sparc64/processor.h @@ -1,4 +1,4 @@ -/* $Id: processor.h,v 1.60 2000/01/07 20:21:45 davem Exp $ +/* $Id: processor.h,v 1.61 2000/01/21 11:39:22 jj Exp $ * include/asm-sparc64/processor.h * * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) @@ -56,16 +56,11 @@ struct thread_struct { unsigned char __pad1[3]; struct pt_regs *kregs; - /* D$ line 2 */ + /* D$ line 2, 3, 4 */ unsigned long *utraps; unsigned char gsr[7]; unsigned char __pad2; - unsigned long sig_address; - unsigned long sig_desc; - - /* D$ lines 3 and 4 */ unsigned long xfsr[7]; - unsigned long __pad3; struct reg_window reg_window[NSWINS]; unsigned long rwbuf_stkptrs[NSWINS]; @@ -92,10 +87,8 @@ struct thread_struct { 0, 0, 0, 0, KERNEL_DS, \ /* w_saved, fpdepth, fpsaved, pad1, kregs, */ \ 0, 0, { 0 }, { 0 }, 0, \ -/* utraps, gsr, pad2, sig_address, sig_desc, */ \ - 0, { 0 }, 0, 0, 0, \ -/* xfsr, pad3, */ \ - { 0 }, 0, \ +/* utraps, gsr, pad2, xfsr, */ \ + 0, { 0 }, 0, { 0 }, \ /* reg_window */ \ { { { 0, }, { 0, } }, }, \ /* rwbuf_stkptrs */ \ diff --git a/include/asm-sparc64/siginfo.h b/include/asm-sparc64/siginfo.h index 9e60d6015ab3..f704e071fa6a 100644 --- a/include/asm-sparc64/siginfo.h +++ b/include/asm-sparc64/siginfo.h @@ -141,6 +141,11 @@ typedef struct siginfo32 { #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd +#ifdef __KERNEL__ +#define __SI_MASK 0 +#define __SI_FAULT 0 +#endif + /* * si_code values * Digital reserves positive values for kernel-generated signals. 
@@ -187,7 +192,7 @@ typedef struct siginfo32 { * SIGSEGV si_codes */ #define SEGV_MAPERR 1 /* address not mapped to object */ -#define SRGV_ACCERR 2 /* invalid permissions for mapped object */ +#define SEGV_ACCERR 2 /* invalid permissions for mapped object */ #define NSIGSEGV 2 /* diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h index 7cd66aa563ed..116fe903e624 100644 --- a/include/asm-sparc64/smp.h +++ b/include/asm-sparc64/smp.h @@ -68,13 +68,17 @@ extern void smp_callin(void); extern void smp_boot_cpus(void); extern void smp_store_cpu_info(int id); -extern __volatile__ int cpu_number_map[NR_CPUS]; +extern __volatile__ int __cpu_number_map[NR_CPUS]; extern __volatile__ int __cpu_logical_map[NR_CPUS]; extern __inline__ int cpu_logical_map(int cpu) { return __cpu_logical_map[cpu]; } +extern __inline__ int cpu_number_map(int cpu) +{ + return __cpu_number_map[cpu]; +} extern __inline__ int hard_smp_processor_id(void) { diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h index 49c063d4408f..d1c874480b1b 100644 --- a/include/asm-sparc64/unistd.h +++ b/include/asm-sparc64/unistd.h @@ -1,4 +1,4 @@ -/* $Id: unistd.h,v 1.39 2000/01/11 17:34:05 jj Exp $ */ +/* $Id: unistd.h,v 1.40 2000/01/16 06:20:38 davem Exp $ */ #ifndef _SPARC64_UNISTD_H #define _SPARC64_UNISTD_H diff --git a/include/linux/cyclomx.h b/include/linux/cyclomx.h index 2e387395ea38..983295c18357 100644 --- a/include/linux/cyclomx.h +++ b/include/linux/cyclomx.h @@ -13,6 +13,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* ============================================================================ +* 2000/01/21 acme rename cyclomx_open to cyclomx_mod_inc_use_count +* and cyclomx_close to cyclomx_mod_dec_use_count * 1999/05/19 acme wait_queue_head_t wait_stats(support for 2.3.*) * 1999/01/03 acme judicious use of data types * 1998/12/27 acme cleanup: PACKED not needed @@ -80,8 +82,8 @@ typedef struct cycx { } cycx_t; /* Public Functions */ -void cyclomx_open (cycx_t *card); /* cycx_main.c */ -void cyclomx_close (cycx_t *card); /* cycx_main.c */ +void cyclomx_mod_inc_use_count (cycx_t *card); /* cycx_main.c */ +void cyclomx_mod_dec_use_count (cycx_t *card); /* cycx_main.c */ void cyclomx_set_state (cycx_t *card, int state); /* cycx_main.c */ #ifdef CONFIG_CYCLOMX_X25 diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 56ace40e0639..9a14c07ed25e 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -41,7 +41,7 @@ extern int eth_header_parse(struct sk_buff *skb, unsigned char *haddr); extern struct net_device * init_etherdev(struct net_device *, int); -#ifdef CONFIG_IP_ROUTER +#if 1 /*def CONFIG_IP_ROUTER*/ static __inline__ void eth_copy_and_sum (struct sk_buff *dest, unsigned char *src, int len, int base) { memcpy (dest->data, src, len); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7bea8a9a024c..f2170ed5f56b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -202,16 +202,25 @@ extern __inline__ int skb_queue_empty(struct sk_buff_head *list) return (list->next == (struct sk_buff *) list); } +extern __inline__ struct sk_buff *skb_get(struct sk_buff *skb) +{ + atomic_inc(&skb->users); /* bump the users count, then hand the same skb back */ + return skb; +} + +/* If users==1, we are the only owner and can avoid the redundant + * atomic change.
+ */ extern __inline__ void kfree_skb(struct sk_buff *skb) { - if (atomic_dec_and_test(&skb->users)) + if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users)) __kfree_skb(skb); } /* Use this if you didn't touch the skb state [for fast switching] */ extern __inline__ void kfree_skb_fast(struct sk_buff *skb) { - if (atomic_dec_and_test(&skb->users)) + if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users)) kfree_skbmem(skb); } diff --git a/include/linux/sockios.h b/include/linux/sockios.h index 995e43e9a695..fe38a2d40507 100644 --- a/include/linux/sockios.h +++ b/include/linux/sockios.h @@ -20,6 +20,10 @@ #include +/* Linux-specific socket ioctls */ +#define SIOCINQ FIONREAD +#define SIOCOUTQ TIOCOUTQ + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d0a68c50241c..03148253d94a 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -251,7 +251,12 @@ enum NET_IPV4_INET_PEER_MINTTL=70, NET_IPV4_INET_PEER_MAXTTL=71, NET_IPV4_INET_PEER_GC_MINTIME=72, - NET_IPV4_INET_PEER_GC_MAXTIME=73 + NET_IPV4_INET_PEER_GC_MAXTIME=73, + NET_TCP_ORPHAN_RETRIES=74, + NET_TCP_ABORT_ON_OVERFLOW=75, + NET_TCP_SYNACK_RETRIES=76, + NET_TCP_MAX_ORPHANS=77, + NET_TCP_MAX_TW_BUCKETS=78, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1e78e322c022..e030ee09f22d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -124,5 +124,6 @@ enum { #define TCP_SYNCNT 7 /* Number of SYN retransmits */ #define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ #define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ #endif /* _LINUX_TCP_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 79a3cd392592..4bca9c09203e 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -29,10 +29,13 @@ 
struct dst_entry struct dst_entry *next; atomic_t __refcnt; /* client references */ int __use; - struct net_device *dev; + struct net_device *dev; int obsolete; + int flags; +#define DST_HOST 1 unsigned long lastuse; unsigned long expires; + unsigned mxlock; unsigned pmtu; unsigned window; @@ -41,6 +44,7 @@ struct dst_entry unsigned ssthresh; unsigned cwnd; unsigned advmss; + unsigned long rate_last; /* rate limiting for ICMP */ unsigned long rate_tokens; diff --git a/include/net/ip.h b/include/net/ip.h index eeb25ffeab4a..a17c12bbddae 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -84,7 +84,7 @@ extern int ip_mc_procinfo(char *, char **, off_t, int); * Functions provided by ip.c */ -extern void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +extern int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt); extern int ip_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/include/net/route.h b/include/net/route.h index 9ccfd3bea9cd..180daad87981 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -92,8 +92,7 @@ struct ip_rt_acct __u32 i_packets; }; -extern struct ip_rt_acct ip_rt_acct[256]; -extern rwlock_t ip_rt_acct_lock; +extern struct ip_rt_acct *ip_rt_acct; extern void ip_rt_init(void); extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw, diff --git a/include/net/snmp.h b/include/net/snmp.h index 4469fdcd155c..5105fd2209d4 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -182,7 +182,24 @@ struct linux_mib unsigned long OfoPruned; unsigned long OutOfWindowIcmps; unsigned long LockDroppedIcmps; - unsigned long __pad[32-9]; + unsigned long TimeWaited; + unsigned long TimeWaitRecycled; + unsigned long TimeWaitKilled; + unsigned long PAWSPassiveRejected; + unsigned long PAWSActiveRejected; + unsigned long PAWSEstabRejected; + unsigned long DelayedACKs; + unsigned long DelayedACKLocked; + unsigned long DelayedACKLost; + unsigned long ListenOverflows; + 
unsigned long ListenDrops; + unsigned long TCPPrequeued; + unsigned long TCPDirectCopyFromBacklog; + unsigned long TCPDirectCopyFromPrequeue; + unsigned long TCPPrequeueDropped; + unsigned long TCPHPHits; + unsigned long TCPHPHitsToUser; + unsigned long __pad[32-26]; }; #define SNMP_INC_STATS(mib, field) ((mib)[2*smp_processor_id()+!in_interrupt()].field++) diff --git a/include/net/sock.h b/include/net/sock.h index 5aa0172c2b0a..5dc9f5be3b46 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -96,7 +96,6 @@ struct atm_vcc; #include #include -#define MIN_WRITE_SPACE 2048 /* The AF_UNIX specific socket options */ struct unix_opt { @@ -229,41 +228,66 @@ struct tcp_opt { __u32 snd_nxt; /* Next sequence we send */ __u32 snd_una; /* First byte we want an ack for */ - __u32 rcv_tstamp; /* timestamp of last received packet */ - __u32 lrcvtime; /* timestamp of last received data packet*/ - __u32 srtt; /* smothed round trip time << 3 */ + __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ + __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - __u32 ato; /* delayed ack timeout */ - __u32 snd_wl1; /* Sequence for window update */ + /* Delayed ACK control data */ + struct { + __u8 pending; /* ACK is pending */ + __u8 quick; /* Scheduled number of quick acks */ + __u8 pingpong; /* The session is interactive */ + __u8 blocked; /* Delayed ACK was blocked by socket lock*/ + __u32 ato; /* Predicted tick of soft clock */ + __u32 lrcvtime; /* timestamp of last received data packet*/ + __u16 last_seg_size; /* Size of last incoming segment */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + } ack; + + /* Data for direct copy to user */ + struct { + struct sk_buff_head prequeue; + int memory; + struct task_struct *task; + struct iovec *iov; + int len; + } ucopy; + __u32 snd_wl1; /* Sequence for window update */ __u32 snd_wl2; /* Ack sequence 
for update */ __u32 snd_wnd; /* The window we expect to receive */ - __u32 max_window; + __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u16 mss_cache; /* Cached effective mss, not including SACKS */ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ - __u16 ext_header_len; /* Dave, do you allow mw to use this hole? 8) --ANK */ - __u8 pending; /* pending events */ + __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + __u8 dup_acks; /* Consequetive duplicate acks seen from other end */ __u8 retransmits; - __u32 last_ack_sent; /* last ack we sent */ - __u32 backoff; /* backoff */ + __u16 __empty1; + __u8 defer_accept; + +/* RTT measurement */ + __u8 backoff; /* backoff */ + __u32 srtt; /* smothed round trip time << 3 */ __u32 mdev; /* medium deviation */ - __u32 snd_cwnd; /* Sending congestion window */ __u32 rto; /* retransmit timeout */ __u32 packets_out; /* Packets which are "in flight" */ __u32 fackets_out; /* Non-retrans SACK'd packets */ __u32 retrans_out; /* Fast-retransmitted packets out */ __u32 high_seq; /* snd_nxt at onset of congestion */ + /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 snd_cwnd; /* Sending congestion window */ __u16 snd_cwnd_cnt; /* Linear increase counter */ __u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ - __u8 dup_acks; /* Consequetive duplicate acks seen from other end */ - __u8 delayed_acks; + + __u8 nonagle; /* Disable Nagle algorithm? */ + __u8 syn_retries; /* num of allowed syn retries */ __u16 user_mss; /* mss requested by user in ioctl */ /* Two commonly used timers in both sender and receiver paths. */ @@ -294,34 +318,49 @@ struct tcp_opt { __u8 snd_wscale; /* Window scaling received from sender */ __u8 rcv_wscale; /* Window scaling to send to receiver */ __u8 rexmt_done; /* Retransmitted up to send head? 
*/ + __u8 keepalive_probes; /* num of allowed keep alive probes */ + +/* PAWS/RTTM data */ __u32 rcv_tsval; /* Time stamp value */ __u32 rcv_tsecr; /* Time stamp echo reply */ __u32 ts_recent; /* Time stamp to echo next */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ - int num_sacks; /* Number of SACK blocks */ + __u32 last_ack_sent; /* last ack we sent (RTTM/PAWS) */ + +/* SACKs data */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct timer_list probe_timer; /* Probes */ - __u32 window_clamp; /* XXX Document this... -DaveM */ - __u32 probes_out; /* unanswered 0 window probes */ + __u32 window_clamp; /* Maximal window to advertise */ + __u8 probes_out; /* unanswered 0 window probes */ + __u8 num_sacks; /* Number of SACK blocks */ + __u16 advmss; /* Advertised MSS */ + + __u32 syn_stamp; __u32 syn_seq; __u32 fin_seq; __u32 urg_seq; __u32 urg_data; - __u32 last_seg_size; /* Size of last incoming segment */ - __u32 rcv_mss; /* MSS used for delayed ACK decisions */ + /* The syn_wait_lock is necessary only to avoid tcp_get_info having + * to grab the main lock sock while browsing the listening hash + * (otherwise it's deadlock prone). + * This lock is acquired in read mode only from tcp_get_info() and + * it's acquired in write mode _only_ from code that is actively + * changing the syn_wait_queue. All readers that are holding + * the master sock lock don't need to grab this lock in read mode + * too as the syn_wait_queue writes are always protected from + * the main sock lock. + */ + rwlock_t syn_wait_lock; + struct tcp_listen_opt *listen_opt; + struct open_request *accept_queue; /* Established children */ - struct open_request *syn_wait_queue; - struct open_request **syn_wait_last; + int write_pending; /* A write to socket waits to start. 
*/ - int syn_backlog; /* Backlog of received SYNs */ - int write_pending; - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ - unsigned char keepalive_probes; /* num of allowed keep alive probes */ - unsigned char syn_retries; /* num of allowed syn retries */ + int linger2; }; @@ -411,7 +450,7 @@ struct sock { unsigned short family; /* Address family */ unsigned char reuse, /* SO_REUSEADDR setting */ - nonagle; /* Disable Nagle algorithm? */ + __unused; atomic_t refcnt; /* Reference count */ socket_lock_t lock; /* Synchronizer... */ @@ -498,6 +537,9 @@ struct sock { unsigned char localroute; /* Route locally only */ unsigned char protocol; struct ucred peercred; + int rcvlowat; + long rcvtimeo; + long sndtimeo; #ifdef CONFIG_FILTER /* Socket Filtering Instructions */ @@ -557,7 +599,7 @@ struct sock { struct timer_list timer; /* This is the sock cleanup timer. */ struct timeval stamp; - /* Identd */ + /* Identd and reporting IO signals */ struct socket *socket; /* RPC layer private data */ @@ -599,12 +641,6 @@ struct proto { int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept) (struct sock *sk, int flags, int *err); - void (*retransmit)(struct sock *sk, int all); - void (*write_wakeup)(struct sock *sk); - void (*read_wakeup)(struct sock *sk); - - unsigned int (*poll)(struct file * file, struct socket *sock, - struct poll_table_struct *wait); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); @@ -632,8 +668,6 @@ struct proto { void (*unhash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); - unsigned short max_header; - unsigned long retransmits; char name[32]; struct { @@ -672,6 +706,9 @@ static void __inline__ sock_prot_dec_use(struct proto *prot) * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. 
+ * + * Since ~2.3.5 it is also exclusive sleep lock serializing + * accesses from user process context. */ extern void __lock_sock(struct sock *sk); extern void __release_sock(struct sock *sk); @@ -682,11 +719,12 @@ do { spin_lock_bh(&((__sk)->lock.slock)); \ (__sk)->lock.users = 1; \ spin_unlock_bh(&((__sk)->lock.slock)); \ } while(0) + #define release_sock(__sk) \ do { spin_lock_bh(&((__sk)->lock.slock)); \ - (__sk)->lock.users = 0; \ if ((__sk)->backlog.tail != NULL) \ __release_sock(__sk); \ + (__sk)->lock.users = 0; \ wake_up(&((__sk)->lock.wq)); \ spin_unlock_bh(&((__sk)->lock.slock)); \ } while(0) @@ -788,9 +826,6 @@ extern int sock_no_mmap(struct file *file, * Default socket callbacks and setup code */ -extern void sock_def_callback1(struct sock *); -extern void sock_def_callback2(struct sock *, int); -extern void sock_def_callback3(struct sock *); extern void sock_def_destruct(struct sock *); /* Initialise core socket variables */ @@ -888,6 +923,34 @@ extern __inline__ void sock_put(struct sock *sk) sk_free(sk); } +/* Detach socket from process context. + * Announce socket dead, detach it from wait queue and inode. + * Note that parent inode held reference count on this struct sock, + * we do not release it in this function, because protocol + * probably wants some additional cleanups or even continuing + * to work with this socket (TCP). + * + * NOTE: When softnet goes in replace _irq with _bh! 
+ */ +extern __inline__ void sock_orphan(struct sock *sk) +{ + write_lock_irq(&sk->callback_lock); + sk->dead = 1; + sk->socket = NULL; + sk->sleep = NULL; + write_unlock_irq(&sk->callback_lock); +} + +extern __inline__ void sock_graft(struct sock *sk, struct socket *parent) +{ + write_lock_irq(&sk->callback_lock); + sk->sleep = &parent->wait; + parent->sk = sk; + sk->socket = parent; + write_unlock_irq(&sk->callback_lock); +} + + extern __inline__ struct dst_entry * __sk_dst_get(struct sock *sk) { @@ -1071,13 +1134,18 @@ extern __inline__ unsigned long sock_wspace(struct sock *sk) return amt; } +#define SOCK_MIN_SNDBUF 2048 +#define SOCK_MIN_RCVBUF 128 +/* Must be less or equal SOCK_MIN_SNDBUF */ +#define SOCK_MIN_WRITE_SPACE SOCK_MIN_SNDBUF + /* * Default write policy as shown to user space via poll/select/SIGIO * Kernel internally doesn't use the MIN_WRITE_SPACE threshold. */ extern __inline__ int sock_writeable(struct sock *sk) { - return sock_wspace(sk) >= MIN_WRITE_SPACE; + return sock_wspace(sk) >= SOCK_MIN_WRITE_SPACE; } extern __inline__ int gfp_any(void) @@ -1085,6 +1153,20 @@ extern __inline__ int gfp_any(void) return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; } +extern __inline__ long sock_rcvtimeo(struct sock *sk, int noblock) +{ + return noblock ? 0 : sk->rcvtimeo; +} + +extern __inline__ long sock_sndtimeo(struct sock *sk, int noblock) +{ + return noblock ? 0 : sk->sndtimeo; +} + +extern __inline__ int sock_rcvlowat(struct sock *sk, int waitall, int len) +{ + return waitall ? 
len : min(sk->rcvlowat, len); +} /* * Enable debug/info messages @@ -1117,4 +1199,7 @@ extern __inline__ int gfp_any(void) lock_sock(sk); \ } +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; + #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ef7da5368c03..f62449ae3d25 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -19,6 +19,7 @@ #define _TCP_H #define TCP_DEBUG 1 +#undef TCP_FORMAL_WINDOW #include #include @@ -130,27 +131,27 @@ struct tcp_tw_bucket { struct sock *bind_next; struct sock **bind_pprev; unsigned char state, - zapped; + substate; /* "zapped" is replaced with "substate" */ __u16 sport; unsigned short family; unsigned char reuse, - nonagle; + rcv_wscale; /* It is also TW bucket specific */ atomic_t refcnt; /* And these are ours. */ int hashent; + int timeout; __u32 rcv_nxt; __u32 snd_nxt; + __u32 rcv_wnd; + __u32 syn_seq; __u32 ts_recent; long ts_recent_stamp; + unsigned long ttd; struct tcp_bind_bucket *tb; struct tcp_tw_bucket *next_death; struct tcp_tw_bucket **pprev_death; - int death_slot; -#ifdef CONFIG_TCP_TW_RECYCLE - unsigned long ttd; - int rto; -#endif + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct in6_addr v6_daddr; struct in6_addr v6_rcv_saddr; @@ -169,10 +170,11 @@ extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw) } } -extern int tcp_tw_death_row_slot; +extern atomic_t tcp_orphan_count; +extern int tcp_tw_count; +extern void tcp_time_wait(struct sock *sk, int state, int timeo); extern void tcp_timewait_kill(struct tcp_tw_bucket *tw); -extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); @@ -224,67 +226,81 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) return tcp_lhashfn(sk->num); } -/* Note, that it is > than ipv6 header */ -#define NETHDR_SIZE (sizeof(struct iphdr) 
+ 40) - -/* - * 40 is maximal IP options size - * 20 is the maximum TCP options size we can currently construct on a SYN. - * 40 is the maximum possible TCP options size. - */ - -#define MAX_SYN_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15) -#define MAX_FIN_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15) -#define BASE_ACK_SIZE (NETHDR_SIZE + MAX_HEADER + 15) -#define MAX_ACK_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15) -#define MAX_RESET_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + MAX_HEADER + 15) -#define MAX_TCPHEADER_SIZE (NETHDR_SIZE + sizeof(struct tcphdr) + 20 + MAX_HEADER + 15) +#define MAX_TCP_HEADER (128 + MAX_HEADER) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ -#define MAX_WINDOW 32767 -#define MAX_DELAY_ACK 2 +#define MAX_TCP_WINDOW 32767 + +/* Minimal accepted MSS. It is (60+60+8) - (20+20). */ +#define TCP_MIN_MSS 88 + +/* Minimal RCV_MSS. */ +#define TCP_MIN_RCVMSS 536 /* * How much of the receive buffer do we advertize * (the rest is reserved for headers and driver packet overhead) * Use a power of 2. */ -#define WINDOW_ADVERTISE_DIVISOR 2 +#define TCP_WINDOW_ADVERTISE_DIVISOR 2 /* urg_data states */ -#define URG_VALID 0x0100 -#define URG_NOTYET 0x0200 -#define URG_READ 0x0400 +#define TCP_URG_VALID 0x0100 +#define TCP_URG_NOTYET 0x0200 +#define TCP_URG_READ 0x0400 -#define TCP_RETR1 7 /* +#define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is - * down. + * down. Minimal RFC value is 3; it corresponds + * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. + * RFC1122 says that the limit is 100 sec. + * 15 is ~13-30min depending on RTO. 
+ */ + +#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a + * connection: ~180sec is RFC minumum */ + +#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a + * connection: ~180sec is RFC minumum */ + + +#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned + * socket. 7 is ~50sec-16min. */ -#define TCP_TIMEOUT_LEN (15*60*HZ) /* should be about 15 mins */ -#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to successfully - * close the socket, about 60 seconds */ -#define TCP_FIN_TIMEOUT (3*60*HZ) /* BSD style FIN_WAIT2 deadlock breaker */ - -#define TCP_ACK_TIME (3*HZ) /* time to delay before sending an ACK */ -#define TCP_WRITE_TIME (30*HZ) /* initial time to wait for an ACK, - * after last transmit */ -#define TCP_TIMEOUT_INIT (3*HZ) /* RFC 1122 initial timeout value */ -#define TCP_SYN_RETRIES 10 /* number of times to retry opening a - * connection (TCP_RETR2-....) */ -#define TCP_PROBEWAIT_LEN (1*HZ)/* time to wait between probes when - * I've got something to write and - * there is no window */ -#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ + +#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ +#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN + /* BSD style FIN_WAIT2 deadlock breaker. + * It used to be 3min, new value is 60sec, + * to combine FIN-WAIT-2 timeout with + * TIME-WAIT timer. + */ + +#define TCP_DELACK_MAX (HZ/2) /* maximal time to delay before sending an ACK */ +#define TCP_DELACK_MIN (2) /* minimal time to delay before sending an ACK, + * 2 scheduler ticks, not depending on HZ */ +#define TCP_ATO_MAX ((TCP_DELACK_MAX*4)/5) /* ATO producing TCP_DELACK_MAX */ +#define TCP_ATO_MIN 2 +#define TCP_RTO_MAX (120*HZ) +#define TCP_RTO_MIN (HZ/5) +#define TCP_TIMEOUT_INIT (3*HZ) /* RFC 1122 initial RTO value */ + +#define TCP_RESOURCE_PROBE_INTERVAL (HZ/2) /* Maximal interval between probes + * for local resources. 
+ */ + +#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) @@ -293,14 +309,39 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define TCP_SYNACK_PERIOD (HZ/2) /* How often to run the synack slow timer */ -#define TCP_QUICK_TRIES 8 /* How often we try to retransmit, until - * we tell the link layer that it is something - * wrong (e.g. that it can expire redirects) */ - /* TIME_WAIT reaping mechanism. */ #define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ -#define TCP_TWKILL_PERIOD ((HZ*60)/TCP_TWKILL_SLOTS) +#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) + +#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ +#define TCP_SYNQ_HSIZE 64 /* Size of SYNACK hash table */ + +#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) +#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated + * after this time. It should be equal + * (or greater than) TCP_TIMEWAIT_LEN + * to provide reliability equal to one + * provided by timewait state. + */ +#define TCP_PAWS_WINDOW 1 /* Replay window for per-host + * timestamps. It must be less than + * minimal timewait lifetime. + */ + +#define TCP_TW_RECYCLE_SLOTS_LOG 5 +#define TCP_TW_RECYCLE_SLOTS (1< 4sec, it is "slow" path, no recycling is required, + so that we select tick to get range about 4 seconds. + */ + +#if HZ == 100 +#define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ == 1024 +#define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG) +#else +#error HZ != 100 && HZ != 1024. 
+#endif /* * TCP option @@ -331,23 +372,40 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 -#define TIME_WRITE 1 /* Not yet used */ -#define TIME_RETRANS 2 /* Retransmit timer */ -#define TIME_DACK 3 /* Delayed ack timer */ -#define TIME_PROBE0 4 -#define TIME_KEEPOPEN 5 +#define TCP_TIME_RETRANS 1 /* Retransmit timer */ +#define TCP_TIME_DACK 2 /* Delayed ack timer */ +#define TCP_TIME_PROBE0 3 /* Zero window probe timer */ +#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */ /* sysctl variables for tcp */ +extern int sysctl_max_syn_backlog; +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; +extern int sysctl_tcp_fin_timeout; +extern int sysctl_tcp_tw_recycle; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_keepalive_intvl; extern int sysctl_tcp_syn_retries; +extern int sysctl_tcp_synack_retries; +extern int sysctl_tcp_retries1; +extern int sysctl_tcp_retries2; +extern int sysctl_tcp_orphan_retries; +extern int sysctl_tcp_syncookies; +extern int sysctl_tcp_retrans_collapse; +extern int sysctl_tcp_stdurg; +extern int sysctl_tcp_rfc1337; +extern int sysctl_tcp_tw_recycle; +extern int sysctl_tcp_abort_on_overflow; +extern int sysctl_tcp_max_orphans; +extern int sysctl_tcp_max_tw_buckets; struct open_request; struct or_calltable { int family; - void (*rtx_syn_ack) (struct sock *sk, struct open_request *req); + int (*rtx_syn_ack) (struct sock *sk, struct open_request *req, struct dst_entry*); void (*send_ack) (struct sk_buff *skb, struct open_request *req); void (*destructor) (struct open_request *req); void (*send_reset) (struct sk_buff *skb); @@ -376,12 +434,14 @@ struct open_request { __u16 rmt_port; __u16 mss; __u8 retrans; - __u8 __pad; - unsigned snd_wscale : 4, + __u8 index; + __u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, - wscale_ok : 1; + wscale_ok : 1, + 
ecn_ok : 1, + acked : 1; /* The following two fields can be easily recomputed I think -AK */ __u32 window_clamp; /* window clamp at creation time */ __u32 rcv_wnd; /* rcv_wnd offered first time */ @@ -400,8 +460,14 @@ struct open_request { /* SLAB cache for open requests. */ extern kmem_cache_t *tcp_openreq_cachep; -#define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC) -#define tcp_openreq_free(req) kmem_cache_free(tcp_openreq_cachep, req) +#define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC) +#define tcp_openreq_fastfree(req) kmem_cache_free(tcp_openreq_cachep, req) + +extern __inline__ void tcp_openreq_free(struct open_request *req) +{ + req->class->destructor(req); + tcp_openreq_fastfree(req); +} #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #define TCP_INET_FAMILY(fam) ((fam) == AF_INET) @@ -441,9 +507,9 @@ struct tcp_func { int (*hash_connecting) (struct sock *sk); - __u16 net_header_len; - + int (*remember_stamp) (struct sock *sk); + __u16 net_header_len; int (*setsockopt) (struct sock *sk, int level, @@ -506,7 +572,11 @@ extern void tcp_shutdown (struct sock *sk, int how); extern int tcp_v4_rcv(struct sk_buff *skb, unsigned short len); -extern int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg); +extern int tcp_v4_remember_stamp(struct sock *sk); + +extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw); + +extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); extern int tcp_ioctl(struct sock *sk, int cmd, @@ -522,6 +592,23 @@ extern int tcp_rcv_established(struct sock *sk, struct tcphdr *th, unsigned len); +static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp) +{ + if (tp->ack.quick && --tp->ack.quick == 0 && !tp->ack.pingpong) { + /* Leaving quickack mode we deflate ATO to give peer + * a time to adapt to new worse(!) RTO. It is not required + * in pingpong mode, when ACKs were delayed in any case. 
+ */ + tp->ack.ato = TCP_ATO_MIN; + } +} + +static __inline__ void tcp_delack_init(struct tcp_opt *tp) +{ + memset(&tp->ack, 0, sizeof(tp->ack)); +} + + enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -530,6 +617,7 @@ enum tcp_tw_status TCP_TW_SYN = 3 }; + extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcphdr *th, @@ -537,7 +625,10 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev); + struct open_request **prev); +extern int tcp_child_process(struct sock *parent, + struct sock *child, + struct sk_buff *skb); extern void tcp_close(struct sock *sk, long timeout); @@ -557,6 +648,8 @@ extern int tcp_recvmsg(struct sock *sk, int len, int nonblock, int flags, int *addr_len); +extern int tcp_listen_start(struct sock *sk); + extern void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy); @@ -614,9 +707,7 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern void tcp_read_wakeup(struct sock *); -extern void tcp_write_xmit(struct sock *); -extern void tcp_time_wait(struct sock *); +extern int tcp_write_xmit(struct sock *); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_fack_retransmit(struct sock *); extern void tcp_xmit_retransmit_queue(struct sock *); @@ -624,46 +715,22 @@ extern void tcp_simple_retransmit(struct sock *); extern void tcp_send_probe0(struct sock *); extern void tcp_send_partial(struct sock *); -extern void tcp_write_wakeup(struct sock *); +extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, int priority); extern int tcp_send_synack(struct sock *); -extern void tcp_transmit_skb(struct sock *, struct sk_buff *); -extern void tcp_send_skb(struct 
sock *, struct sk_buff *, int force_queue); +extern int tcp_transmit_skb(struct sock *, struct sk_buff *); +extern void tcp_send_skb(struct sock *, struct sk_buff *, int force_queue, unsigned mss_now); extern void tcp_send_ack(struct sock *sk); -extern void tcp_send_delayed_ack(struct sock *sk, int max_timeout); +extern void tcp_send_delayed_ack(struct sock *sk); /* tcp_timer.c */ extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long); extern void tcp_init_xmit_timers(struct sock *); extern void tcp_clear_xmit_timers(struct sock *); -extern void tcp_retransmit_timer(unsigned long); -extern void tcp_delack_timer(unsigned long); -extern void tcp_probe_timer(unsigned long); - extern void tcp_delete_keepalive_timer (struct sock *); extern void tcp_reset_keepalive_timer (struct sock *, unsigned long); -extern void tcp_keepalive_timer (unsigned long); - -/* - * TCP slow timer - */ -extern struct timer_list tcp_slow_timer; - -struct tcp_sl_timer { - atomic_t count; - unsigned long period; - unsigned long last; - void (*handler) (unsigned long); -}; - -#define TCP_SLT_SYNACK 0 -#define TCP_SLT_TWKILL 1 -#define TCP_SLT_MAX 2 - -extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX]; - extern int tcp_sync_mss(struct sock *sk, u32 pmtu); /* Compute the current effective MSS, taking SACKs and IP options, @@ -673,7 +740,7 @@ extern int tcp_sync_mss(struct sock *sk, u32 pmtu); static __inline__ unsigned int tcp_current_mss(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct dst_entry *dst = sk->dst_cache; + struct dst_entry *dst = __sk_dst_get(sk); int mss_now = tp->mss_cache; if (dst && dst->pmtu != tp->pmtu_cookie) @@ -682,7 +749,7 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk) if(tp->sack_ok && tp->num_sacks) mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - return mss_now > 8 ? mss_now : 8; + return mss_now; } /* Initialize RCV_MSS value. 
@@ -704,9 +771,24 @@ extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk) else mss = tp->mss_cache; - tp->rcv_mss = max(min(mss, 536), 8); + tp->ack.rcv_mss = max(min(mss, TCP_MIN_RCVMSS), TCP_MIN_MSS); +} + +static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd) +{ + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static __inline__ void tcp_fast_path_on(struct tcp_opt *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale); } + + + /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. @@ -751,23 +833,26 @@ extern __inline__ u16 tcp_select_window(struct sock *sk) } /* RFC1323 scaling applied */ - return new_win >> tp->rcv_wscale; -} - -/* See if we can advertise non-zero, and if so how much we - * can increase our advertisement. If it becomes more than - * twice what we are talking about right now, return true. - */ -extern __inline__ int tcp_raise_window(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 cur_win = tcp_receive_window(tp); - u32 new_win = __tcp_select_window(sk); + new_win >>= tp->rcv_wscale; + +#ifdef TCP_FORMAL_WINDOW + if (new_win == 0) { + /* If we advertise zero window, disable fast path. */ + tp->pred_flags = 0; + } else if (cur_win == 0 && tp->pred_flags == 0 && + skb_queue_len(&tp->out_of_order_queue) == 0 && + !tp->urg_data) { + /* If we open zero window, enable fast path. + Without this it will be open by the first data packet, + it is too late to merge checksumming to copy. + */ + tcp_fast_path_on(tp); + } +#endif - return (new_win && (new_win > (cur_win << 1))); + return new_win; } - /* TCP timestamps are only 32-bits, this causes a slight * complication on 64-bit systems since we store a snapshot * of jiffies in the buffer control blocks below. 
We decidely @@ -804,6 +889,8 @@ struct tcp_skb_cb { #define TCPCB_FLAG_PSH 0x08 #define TCPCB_FLAG_ACK 0x10 #define TCPCB_FLAG_URG 0x20 +#define TCPCB_FLAG_ECE 0x40 +#define TCPCB_FLAG_CWR 0x80 __u8 sacked; /* State flags for SACK/FACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ @@ -860,13 +947,91 @@ extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp) return max(min(FlightSize, tp->snd_cwnd) >> 1, 2); } +/* Set slow start threshould and cwnd not falling to slow start */ +extern __inline__ void __tcp_enter_cong_avoid(struct tcp_opt *tp) +{ + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; +} + +extern __inline__ void tcp_enter_cong_avoid(struct tcp_opt *tp) +{ + if (!tp->high_seq || after(tp->snd_nxt, tp->high_seq)) + __tcp_enter_cong_avoid(tp); +} + + +/* Increase initial CWND conservatively, i.e. only if estimated + RTT is low enough. It is not quite correct, we should use + POWER i.e. RTT*BANDWIDTH, but we still cannot estimate this. + + Numbers are taken from RFC1414. + */ +static __inline__ __u32 tcp_init_cwnd(struct tcp_opt *tp) +{ + __u32 cwnd; + + if (!tp->srtt || tp->srtt > (HZ/50) || tp->mss_cache > 1460) + cwnd = 2; + else if (tp->mss_cache > 1095) + cwnd = 3; + else + cwnd = 4; + + return min(cwnd, tp->snd_cwnd_clamp); +} + + +static __inline__ int tcp_minshall_check(struct tcp_opt *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, int len) +{ + if (len < mss) + tp->snd_sml = tp->snd_nxt; +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + 1. It is full sized. + 2. Or it contains FIN or URG. + 3. Or TCP_NODELAY was set. + 4. Or TCP_CORK is not set, and all sent packets are ACKed. 
+ With Minshall's modification: all sent small packets are ACKed. + */ + +static __inline__ int tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & (TCPCB_FLAG_URG|TCPCB_FLAG_FIN)) && + (tp->nonagle == 2 || + (!tp->nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + /* This checks if the data bearing packet SKB (usually tp->send_head) * should be put on the wire right now. */ -static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) +static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, + unsigned cur_mss, int tail) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int nagle_check = 1; + /* + * Reset CWND after idle period longer RTO to "restart window". + * It is "side" effect of the function, which is _not_ good + * from viewpoint of clarity. But we have to make it before + * checking congestion window below. Alternative is to prepend + * all the calls with this test. + */ + if (tp->packets_out==0 && + (s32)(tcp_time_stamp - tp->lsndtime) > tp->rto) + tp->snd_cwnd = min(tp->snd_cwnd, tcp_init_cwnd(tp)); /* RFC 1122 - section 4.2.3.4 * @@ -876,97 +1041,126 @@ static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) * b) There are packets in flight and we have a small segment * [SWS avoidance and Nagle algorithm] * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) * c) We are retransmiting [Nagle] * d) We have too many packets 'in flight' * * Don't use the nagle rule for urgent data (or * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). 
*/ - if ((sk->nonagle == 2 && (skb->len < tp->mss_cache)) || - (!sk->nonagle && - skb->len < (tp->mss_cache >> 1) && - tp->packets_out && - !(TCP_SKB_CB(skb)->flags & (TCPCB_FLAG_URG|TCPCB_FLAG_FIN)))) - nagle_check = 0; - - /* - * Reset CWND after idle period longer rto. Actually, it would - * be better to save last send time, but VJ in SIGCOMM'88 proposes - * to use keepalive timestamp. Well, it is not good, certainly, - * because SMTP is still broken, but it is better than nothing yet. - */ - if (tp->packets_out==0 && (s32)(tcp_time_stamp - tp->rcv_tstamp) > tp->rto) - tp->snd_cwnd = min(tp->snd_cwnd, 2); /* Don't be strict about the congestion window for the * final FIN frame. -DaveM */ - return (nagle_check && + return ((!tail || !tcp_nagle_check(tp, skb, cur_mss) || + skb_tailroom(skb) < 32) && ((tcp_packets_in_flight(tp) < tp->snd_cwnd) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && tp->retransmits == 0); } +static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp) +{ + if (!tp->packets_out && !tp->probe_timer.prev) + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); +} + +static __inline__ int tcp_skb_is_last(struct sock *sk, struct sk_buff *skb) +{ + return (skb->next == (struct sk_buff*)&sk->write_queue); +} + /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. 
*/ -static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_opt *tp) +static __inline__ void __tcp_push_pending_frames(struct sock *sk, + struct tcp_opt *tp, + unsigned cur_mss) { - if(tp->send_head) { - if(tcp_snd_test(sk, tp->send_head)) - tcp_write_xmit(sk); - else if(tp->packets_out == 0 && !tp->pending) { - /* We held off on this in tcp_send_skb() */ - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + struct sk_buff *skb = tp->send_head; + + if (skb) { + if (!tcp_snd_test(tp, skb, cur_mss, tcp_skb_is_last(sk, skb)) || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } } -/* This tells the input processing path that an ACK should go out - * right now. - */ -#define tcp_enter_quickack_mode(__tp) ((__tp)->ato |= (1<<31)) -#define tcp_exit_quickack_mode(__tp) ((__tp)->ato &= ~(1<<31)) -#define tcp_in_quickack_mode(__tp) (((__tp)->ato & (1 << 31)) != 0) +static __inline__ void tcp_push_pending_frames(struct sock *sk, + struct tcp_opt *tp) +{ + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk)); +} + +extern void tcp_destroy_sock(struct sock *sk); + /* - * List all states of a TCP socket that can be viewed as a "connected" - * state. This now includes TCP_SYN_RECV, although I am not yet fully - * convinced that this is the solution for the 'getpeername(2)' - * problem. Thanks to Stephen A. 
Wood -FvK + * Calculate(/check) TCP checksum */ +static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, + unsigned long base) +{ + return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); +} -extern __inline const int tcp_connected(const int state) +static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) { - return ((1 << state) & - (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_FIN_WAIT2|TCPF_SYN_RECV)); + return (unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)); } -extern __inline const int tcp_established(const int state) +static __inline__ int tcp_checksum_complete(struct sk_buff *skb) { - return ((1 << state) & - (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_FIN_WAIT2)); + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete(skb); } -extern void tcp_destroy_sock(struct sock *sk); +/* Prequeue for VJ style copy to user, combined with checksumming. */ +static __inline__ void tcp_prequeue_init(struct tcp_opt *tp) +{ + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + tp->ucopy.memory = 0; + skb_queue_head_init(&tp->ucopy.prequeue); +} -/* - * Calculate(/check) TCP checksum +/* Packet is added to VJ-style prequeue for processing in process + * context, if a reader task is waiting. Apparently, this exciting + * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) + * failed somewhere. Latency? Burstiness? Well, at least now we will + * see, why it failed. 
8)8) --ANK */ -static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr, - unsigned long base) +static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) { - return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (tp->ucopy.task) { + if ((tp->ucopy.memory += skb->truesize) <= (sk->rcvbuf<<1)) { + __skb_queue_tail(&tp->ucopy.prequeue, skb); + if (skb_queue_len(&tp->ucopy.prequeue) == 1) + wake_up_interruptible(sk->sleep); + } else { + NET_INC_STATS_BH(TCPPrequeueDropped); + tp->ucopy.memory -= skb->truesize; + kfree_skb(skb); + } + return 1; + } + return 0; } + #undef STATE_TRACE #ifdef STATE_TRACE @@ -1007,9 +1201,12 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) static __inline__ void tcp_done(struct sock *sk) { + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) + if (!sk->dead) sk->state_change(sk); else tcp_destroy_sock(sk); @@ -1106,7 +1303,7 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss, * our initial window offering to 32k. There should also * be a sysctl option to stop being nice. 
*/ - (*rcv_wnd) = min(space, MAX_WINDOW); + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); (*rcv_wscale) = 0; if (wscale_ok) { /* See RFC1323 for an explanation of the limit to 14 */ @@ -1123,52 +1320,127 @@ extern __inline__ void tcp_select_initial_window(int space, __u32 mss, extern __inline__ int tcp_space(struct sock *sk) { return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / - WINDOW_ADVERTISE_DIVISOR; + TCP_WINDOW_ADVERTISE_DIVISOR; } extern __inline__ int tcp_full_space( struct sock *sk) { - return sk->rcvbuf / WINDOW_ADVERTISE_DIVISOR; + return sk->rcvbuf / TCP_WINDOW_ADVERTISE_DIVISOR; } -extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, struct open_request *prev) +extern __inline__ void tcp_init_buffer_space(struct sock *sk) { - if(!req->dl_next) - tp->syn_wait_last = (struct open_request **)prev; - prev->dl_next = req->dl_next; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int rcvbuf = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + int sndbuf = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + + if (sk->rcvbuf < 3*rcvbuf) + sk->rcvbuf = min (3*rcvbuf, sysctl_rmem_max); + if (sk->sndbuf < 3*sndbuf) + sk->sndbuf = min (3*sndbuf, sysctl_wmem_max); } -extern __inline__ void tcp_synq_queue(struct tcp_opt *tp, struct open_request *req) -{ - req->dl_next = NULL; - *tp->syn_wait_last = req; - tp->syn_wait_last = &req->dl_next; +extern __inline__ void tcp_acceptq_removed(struct sock *sk) +{ + sk->ack_backlog--; } -extern __inline__ void tcp_synq_init(struct tcp_opt *tp) +extern __inline__ void tcp_acceptq_added(struct sock *sk) { - tp->syn_wait_queue = NULL; - tp->syn_wait_last = &tp->syn_wait_queue; + sk->ack_backlog++; } -extern void __tcp_inc_slow_timer(struct tcp_sl_timer *slt); -extern __inline__ void tcp_inc_slow_timer(int timer) +extern __inline__ int tcp_acceptq_is_full(struct sock *sk) { - struct tcp_sl_timer *slt = &tcp_slt_array[timer]; - - if (atomic_read(&slt->count) == 0) - { - 
__tcp_inc_slow_timer(slt); - } + return sk->ack_backlog > sk->max_ack_backlog; +} + +extern __inline__ void tcp_acceptq_queue(struct sock *sk, struct open_request *req, + struct sock *child) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + req->sk = child; + tcp_acceptq_added(sk); + + req->dl_next = tp->accept_queue; + tp->accept_queue = req; +} + +struct tcp_listen_opt +{ + u8 max_qlen_log; /* log_2 of maximal queued SYNs */ + int qlen; + int qlen_young; + int clock_hand; + struct open_request *syn_table[TCP_SYNQ_HSIZE]; +}; + +extern __inline__ void +tcp_synq_removed(struct sock *sk, struct open_request *req) +{ + struct tcp_listen_opt *lopt = sk->tp_pinfo.af_tcp.listen_opt; + + if (--lopt->qlen == 0) + tcp_delete_keepalive_timer(sk); + if (req->retrans == 0) + lopt->qlen_young--; +} + +extern __inline__ void tcp_synq_added(struct sock *sk) +{ + struct tcp_listen_opt *lopt = sk->tp_pinfo.af_tcp.listen_opt; + + if (lopt->qlen++ == 0) + tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); + lopt->qlen_young++; +} + +extern __inline__ int tcp_synq_len(struct sock *sk) +{ + return sk->tp_pinfo.af_tcp.listen_opt->qlen; +} + +extern __inline__ int tcp_synq_young(struct sock *sk) +{ + return sk->tp_pinfo.af_tcp.listen_opt->qlen_young; +} - atomic_inc(&slt->count); +extern __inline__ int tcp_synq_is_full(struct sock *sk) +{ + return tcp_synq_len(sk)>>sk->tp_pinfo.af_tcp.listen_opt->max_qlen_log; } -extern __inline__ void tcp_dec_slow_timer(int timer) +extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, + struct open_request **prev) { - struct tcp_sl_timer *slt = &tcp_slt_array[timer]; + write_lock(&tp->syn_wait_lock); + *prev = req->dl_next; + write_unlock(&tp->syn_wait_lock); +} - atomic_dec(&slt->count); +extern __inline__ void tcp_synq_drop(struct sock *sk, struct open_request *req, + struct open_request **prev) +{ + tcp_synq_unlink(&sk->tp_pinfo.af_tcp, req, prev); + tcp_synq_removed(sk, req); + tcp_openreq_free(req); +} + +static 
__inline__ void tcp_openreq_init(struct open_request *req, + struct tcp_opt *tp, + struct sk_buff *skb) +{ + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rcv_isn = TCP_SKB_CB(skb)->seq; + req->mss = tp->mss_clamp; + req->ts_recent = tp->saw_tstamp ? tp->rcv_tsval : 0; + req->tstamp_ok = tp->tstamp_ok; + req->sack_ok = tp->sack_ok; + req->snd_wscale = tp->snd_wscale; + req->wscale_ok = tp->wscale_ok; + req->acked = 0; + req->rmt_port = skb->h.th->source; } extern const char timer_bug_msg[]; @@ -1179,13 +1451,14 @@ static inline void tcp_clear_xmit_timer(struct sock *sk, int what) struct timer_list *timer; switch (what) { - case TIME_RETRANS: + case TCP_TIME_RETRANS: timer = &tp->retransmit_timer; break; - case TIME_DACK: + case TCP_TIME_DACK: + tp->ack.blocked = 0; timer = &tp->delack_timer; break; - case TIME_PROBE0: + case TCP_TIME_PROBE0: timer = &tp->probe_timer; break; default: @@ -1199,7 +1472,7 @@ static inline void tcp_clear_xmit_timer(struct sock *sk, int what) spin_unlock_bh(&sk->timer_lock); } -/* This function does not return reliable answer. You is only as advice. +/* This function does not return reliable answer. Use it only as advice. */ static inline int tcp_timer_is_set(struct sock *sk, int what) @@ -1208,13 +1481,13 @@ static inline int tcp_timer_is_set(struct sock *sk, int what) int ret; switch (what) { - case TIME_RETRANS: + case TCP_TIME_RETRANS: ret = tp->retransmit_timer.prev != NULL; break; - case TIME_DACK: + case TCP_TIME_DACK: ret = tp->delack_timer.prev != NULL; break; - case TIME_PROBE0: + case TCP_TIME_PROBE0: ret = tp->probe_timer.prev != NULL; break; default: @@ -1248,18 +1521,46 @@ extern __inline__ void tcp_listen_unlock(void) static inline int keepalive_intvl_when(struct tcp_opt *tp) { - if (tp->keepalive_intvl) - return tp->keepalive_intvl; - else - return sysctl_tcp_keepalive_intvl; + return tp->keepalive_intvl ? 
: sysctl_tcp_keepalive_intvl; } static inline int keepalive_time_when(struct tcp_opt *tp) { - if (tp->keepalive_time) - return tp->keepalive_time; - else - return sysctl_tcp_keepalive_time; + return tp->keepalive_time ? : sysctl_tcp_keepalive_time; } +static inline int tcp_fin_time(struct tcp_opt *tp) +{ + int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout; + + if (fin_timeout < (tp->rto<<2) - (tp->rto>>1)) + fin_timeout = (tp->rto<<2) - (tp->rto>>1); + + return fin_timeout; +} + +#if 0 /* TCP_DEBUG */ +#define TCP_CHECK_TIMER(sk) \ +do { struct tcp_opt *__tp = &sk->tp_pinfo.af_tcp; \ + if (sk->state != TCP_CLOSE) { \ + if (__tp->packets_out) { \ + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS) && !timer_is_running(&__tp->retransmit_timer) && net_ratelimit()) \ + printk(KERN_DEBUG "sk=%p RETRANS" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \ + } else if (__tp->send_head) { \ + if (!tcp_timer_is_set(sk, TCP_TIME_PROBE0) && !timer_is_running(&__tp->probe_timer) && net_ratelimit()) \ + printk(KERN_DEBUG "sk=%p PROBE0" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \ + } \ + if (__tp->ack.pending) { \ + if (!tcp_timer_is_set(sk, TCP_TIME_DACK) && !timer_is_running(&__tp->delack_timer) && net_ratelimit()) \ + printk(KERN_DEBUG "sk=%p DACK" __FUNCTION__ "(%d) %d\n", sk, __LINE__, sk->state); \ + } \ + if (__tp->packets_out > skb_queue_len(&sk->write_queue) || \ + (__tp->send_head && skb_queue_len(&sk->write_queue) == 0)) { \ + printk(KERN_DEBUG "sk=%p QUEUE" __FUNCTION__ "(%d) %d %d %d %p\n", sk, __LINE__, sk->state, __tp->packets_out, skb_queue_len(&sk->write_queue), __tp->send_head); \ + } \ + } } while (0) +#else +#define TCP_CHECK_TIMER(sk) do { } while (0); +#endif + #endif /* _TCP_H */ diff --git a/net/core/datagram.c b/net/core/datagram.c index bb320872982c..bda174519a5e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -60,14 +60,14 @@ static inline int connection_based(struct sock *sk) * Wait for a packet.. 
*/ -static int wait_for_packet(struct sock * sk, int *err) +static int wait_for_packet(struct sock * sk, int *err, long *timeo_p) { int error; DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(sk->sleep, &wait); + __set_current_state(TASK_INTERRUPTIBLE|TASK_EXCLUSIVE); + add_wait_queue_exclusive(sk->sleep, &wait); /* Socket errors? */ error = sock_error(sk); @@ -91,7 +91,7 @@ static int wait_for_packet(struct sock * sk, int *err) if (signal_pending(current)) goto out; - schedule(); + *timeo_p = schedule_timeout(*timeo_p); ready: current->state = TASK_RUNNING; @@ -132,12 +132,15 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, { int error; struct sk_buff *skb; + long timeo; /* Caller is allowed not to check sk->err before skb_recv_datagram() */ error = sock_error(sk); if (error) goto no_packet; + timeo = sock_rcvtimeo(sk, noblock); + do { /* Again only user level code calls this function, so nothing interrupt level will suddenly eat the receive_queue. @@ -162,10 +165,10 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, /* User doesn't want to wait */ error = -EAGAIN; - if (noblock) + if (!timeo) goto no_packet; - } while (wait_for_packet(sk, err) == 0); + } while (wait_for_packet(sk, err, &timeo) == 0); return NULL; @@ -225,11 +228,11 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table * /* exceptional events? */ if (sk->err || !skb_queue_empty(&sk->error_queue)) mask |= POLLERR; - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? 
*/ - if (!skb_queue_empty(&sk->receive_queue)) + if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/core/iovec.c b/net/core/iovec.c index 5ba18150d111..4ebad506e856 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -104,6 +104,11 @@ out: /* Copy and checkum skb to user iovec. Caller _must_ check that skb will fit to this iovec. + + Returns: 0 - success. + -EINVAL - checksum failure. + -EFAULT - fault during copy. Beware, in this case iovec can be + modified! */ int copy_and_csum_toiovec(struct iovec *iov, struct sk_buff *skb, int hlen) @@ -111,7 +116,7 @@ int copy_and_csum_toiovec(struct iovec *iov, struct sk_buff *skb, int hlen) unsigned int csum; int chunk = skb->len - hlen; - /* Skip filled elements. Pretty silly, look at mecpy_toiove, though 8) */ + /* Skip filled elements. Pretty silly, look at memcpy_toiovec, though 8) */ while (iov->iov_len == 0) iov++; @@ -119,7 +124,7 @@ int copy_and_csum_toiovec(struct iovec *iov, struct sk_buff *skb, int hlen) if ((unsigned short)csum_fold(csum_partial(skb->h.raw, chunk+hlen, skb->csum))) goto csum_error; if (memcpy_toiovec(iov, skb->h.raw + hlen, chunk)) - goto csum_error; + goto fault; } else { int err = 0; csum = csum_partial(skb->h.raw, hlen, skb->csum); @@ -133,6 +138,9 @@ int copy_and_csum_toiovec(struct iovec *iov, struct sk_buff *skb, int hlen) return 0; csum_error: + return -EINVAL; + +fault: return -EFAULT; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 12a8f8d72f4a..3528c7510e3d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox * Florian La Roche * - * Version: $Id: skbuff.c,v 1.63 2000/01/02 09:15:17 davem Exp $ + * Version: $Id: skbuff.c,v 1.64 2000/01/16 05:11:03 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. 
@@ -61,10 +61,6 @@ #include #include -#ifdef CONFIG_ATM -#include -#endif - /* * Resource tracking variables */ @@ -165,10 +161,6 @@ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) skb->is_clone = 0; skb->cloned = 0; -#ifdef CONFIG_ATM - ATM_SKB(skb)->iovcnt = 0; -#endif - atomic_set(&skb->users, 1); atomic_set(skb_datarefp(skb), 1); return skb; @@ -205,6 +197,9 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache, #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 0; #endif +#endif +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; #endif memset(skb->cb, 0, sizeof(skb->cb)); skb->priority = 0; @@ -308,6 +303,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->nf_debug=old->nf_debug; #endif #endif +#ifdef CONFIG_NET_SCHED + new->tc_index = old->tc_index; +#endif } /* diff --git a/net/core/sock.c b/net/core/sock.c index e069ca898f47..c5781c6e3b18 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,7 +7,7 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.87 1999/11/23 08:56:59 davem Exp $ + * Version: $Id: sock.c,v 1.89 2000/01/18 08:24:13 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -140,6 +140,23 @@ __u32 sysctl_rmem_default = SK_RMEM_MAX; /* Maximal space eaten by iovec or ancilliary data plus some space */ int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); +static int sock_set_timeout(long *timeo_p, char *optval, int optlen) +{ + struct timeval tv; + + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + + *timeo_p = MAX_SCHEDULE_TIMEOUT; + if (tv.tv_sec == 0 && tv.tv_usec == 0) + return 0; + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) + *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + return 0; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. 
@@ -214,7 +231,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val > sysctl_wmem_max) val = sysctl_wmem_max; - sk->sndbuf = max(val*2,2048); + sk->sndbuf = max(val*2,SOCK_MIN_SNDBUF); /* * Wake up sending tasks if we @@ -233,7 +250,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, val = sysctl_rmem_max; /* FIXME: is this lower bound the right one? */ - sk->rcvbuf = max(val*2,256); + sk->rcvbuf = max(val*2,SOCK_MIN_RCVBUF); break; case SO_KEEPALIVE: @@ -266,16 +283,19 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling,optval,sizeof(ling))) - { + if (copy_from_user(&ling,optval,sizeof(ling))) { ret = -EFAULT; break; } - if(ling.l_onoff==0) + if(ling.l_onoff==0) { sk->linger=0; - else - { - sk->lingertime=ling.l_linger; + } else { +#if (BITS_PER_LONG == 32) + if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->lingertime=MAX_SCHEDULE_TIMEOUT; + else +#endif + sk->lingertime=ling.l_linger*HZ; sk->linger=1; } break; @@ -287,8 +307,21 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_PASSCRED: sock->passcred = valbool; break; - - + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->rcvlowat = val ? 
: 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sndtimeo, optval, optlen); + break; + #ifdef CONFIG_NETDEVICES case SO_BINDTODEVICE: { @@ -446,7 +479,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_LINGER: lv=sizeof(v.ling); v.ling.l_onoff=sk->linger; - v.ling.l_linger=sk->lingertime; + v.ling.l_linger=sk->lingertime/HZ; break; case SO_BSDCOMPAT: @@ -454,13 +487,31 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_RCVTIMEO: + lv=sizeof(struct timeval); + if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->rcvtimeo/HZ; + v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000)/HZ; + } + break; + case SO_SNDTIMEO: lv=sizeof(struct timeval); - v.tm.tv_sec=0; - v.tm.tv_usec=0; + if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sndtimeo/HZ; + v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000)/HZ; + } break; case SO_RCVLOWAT: + v.val = sk->rcvlowat; + break; + case SO_SNDLOWAT: v.val=1; break; @@ -663,7 +714,7 @@ unsigned long sock_rspace(struct sock *sk) /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. 
*/ -static void sock_wait_for_wmem(struct sock * sk) +static long sock_wait_for_wmem(struct sock * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); @@ -679,10 +730,11 @@ static void sock_wait_for_wmem(struct sock * sk) break; if (sk->err) break; - schedule(); + timeo = schedule_timeout(timeo); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); + return timeo; } @@ -695,6 +747,9 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, { int err; struct sk_buff *skb; + long timeo; + + timeo = sock_sndtimeo(sk, noblock); while (1) { unsigned long try_size = size; @@ -736,12 +791,12 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, sk->socket->flags |= SO_NOSPACE; err = -EAGAIN; - if (noblock) + if (!timeo) goto failure; err = -ERESTARTSYS; if (signal_pending(current)) goto failure; - sock_wait_for_wmem(sk); + timeo = sock_wait_for_wmem(sk, timeo); } return skb; @@ -771,13 +826,21 @@ void __lock_sock(struct sock *sk) void __release_sock(struct sock *sk) { struct sk_buff *skb = sk->backlog.head; + do { - struct sk_buff *next = skb->next; - skb->next = NULL; - sk->backlog_rcv(sk, skb); - skb = next; - } while(skb != NULL); - sk->backlog.head = sk->backlog.tail = NULL; + sk->backlog.head = sk->backlog.tail = NULL; + bh_unlock_sock(sk); + + do { + struct sk_buff *next = skb->next; + + skb->next = NULL; + sk->backlog_rcv(sk, skb); + skb = next; + } while (skb != NULL); + + bh_lock_sock(sk); + } while((skb = sk->backlog.head) != NULL); } /* @@ -1004,7 +1067,7 @@ void sock_def_wakeup(struct sock *sk) { read_lock(&sk->callback_lock); if(!sk->dead) - wake_up_interruptible(sk->sleep); + wake_up_interruptible_all(sk->sleep); read_unlock(&sk->callback_lock); } @@ -1087,6 +1150,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->peercred.pid = 0; sk->peercred.uid = -1; sk->peercred.gid = -1; + sk->rcvlowat = 1; + sk->rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sndtimeo = MAX_SCHEDULE_TIMEOUT; 
atomic_set(&sk->refcnt, 1); } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f7972b7dfb8d..a1b402672add 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -207,7 +207,7 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) * seems to set IFF_PROMISC. */ - else if(dev->flags&(IFF_PROMISC/*|IFF_ALLMULTI*/)) + else if(1 /*dev->flags&IFF_PROMISC*/) { if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN)) skb->pkt_type=PACKET_OTHERHOST; @@ -265,7 +265,8 @@ void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev, unsign memcpy(((u8*)hh->hh_data) + 2, haddr, dev->addr_len); } -#ifndef CONFIG_IP_ROUTER +#if 0 /*ndef CONFIG_IP_ROUTER*/ +/* This one is only slowdown with checksumming in user process context. --ANK */ /* * Copy from an ethernet device memory space to an sk_buff while checksumming if IP @@ -298,7 +299,7 @@ void eth_copy_and_sum(struct sk_buff *dest, unsigned char *src, int length, int if ((ip_length <= length) && (ip_length > 7)) length=ip_length; - dest->csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base); + dest->csum=csum_partial_copy_nocheck(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base); dest->ip_summed=1; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 94fb19f92de6..bc2c97779037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.101 2000/01/09 02:19:38 davem Exp $ + * Version: $Id: af_inet.c,v 1.104 2000/01/18 08:24:14 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -117,7 +117,9 @@ struct linux_mib net_statistics[NR_CPUS*2]; +#ifdef INET_REFCNT_DEBUG atomic_t inet_sock_nr; +#endif extern int raw_get_info(char *, char **, off_t, int); extern int snmp_get_info(char *, char **, off_t, int); @@ -159,8 +161,8 @@ void inet_sock_destruct(struct sock *sk) if (sk->protinfo.af_inet.opt) kfree(sk->protinfo.af_inet.opt); dst_release(sk->dst_cache); - atomic_dec(&inet_sock_nr); #ifdef INET_REFCNT_DEBUG + atomic_dec(&inet_sock_nr); printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", sk, atomic_read(&inet_sock_nr)); #endif } @@ -171,32 +173,28 @@ void inet_sock_release(struct sock *sk) sk->prot->destroy(sk); /* Observation: when inet_sock_release is called, processes have - no access to socket. But net still has. - Step one, detach it from networking: - - A. Remove from hash tables. + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. */ sk->prot->unhash(sk); /* In this point socket cannot receive new packets, - but it is possible that some packets are in flight - because some CPU runs receiver and did hash table lookup - before we unhashed socket. They will achieve receive queue - and will be purged by socket destructor. - - Also we still have packets pending on receive - queue and probably, our own packets waiting in device queues. - sock_destroy will drain receive queue, but transmitted - packets will delay socket destruction until the last reference - will be released. + * but it is possible that some packets are in flight + * because some CPU runs receiver and did hash table lookup + * before we unhashed socket. They will achieve receive queue + * and will be purged by socket destructor. + * + * Also we still have packets pending on receive + * queue and probably, our own packets waiting in device queues. 
+ * sock_destroy will drain receive queue, but transmitted + * packets will delay socket destruction until the last reference + * will be released. */ - write_lock_irq(&sk->callback_lock); - sk->dead=1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + sock_orphan(sk); #ifdef INET_REFCNT_DEBUG if (atomic_read(&sk->refcnt) != 1) { @@ -222,8 +220,7 @@ int inet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) { struct sock *sk=sock->sk; - if (sk->prot->setsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->setsockopt(sk,level,optname,optval,optlen); } @@ -239,8 +236,7 @@ int inet_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { struct sock *sk=sock->sk; - if (sk->prot->getsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->getsockopt(sk,level,optname,optval,optlen); } @@ -264,14 +260,6 @@ static int inet_autobind(struct sock *sk) return 0; } -/* Listening INET sockets never sleep to wait for memory, so - * it is completely silly to wake them up on queue space - * available events. So we hook them up to this dummy callback. - */ -static void inet_listen_write_space(struct sock *sk) -{ -} - /* * Move a socket into listening state. 
*/ @@ -282,12 +270,13 @@ int inet_listen(struct socket *sock, int backlog) unsigned char old_state; int err; + lock_sock(sk); + + err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - return -EINVAL; + goto out; - lock_sock(sk); old_state = sk->state; - err = -EINVAL; if (!((1<state = TCP_LISTEN; - sk->ack_backlog = 0; - if (sk->num == 0) { - if (sk->prot->get_port(sk, 0) != 0) { - sk->state = old_state; - err = -EAGAIN; - goto out; - } - sk->sport = htons(sk->num); - } else { - /* Not nice, but the simplest solution however */ - if (sk->prev) - ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; - } - - sk_dst_reset(sk); - sk->prot->hash(sk); - sk->socket->flags |= SO_ACCEPTCON; - sk->write_space = inet_listen_write_space; + err = tcp_listen_start(sk); + if (err) + goto out; } sk->max_ack_backlog = backlog; err = 0; @@ -345,10 +318,6 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; - if (ipv4_config.no_pmtu_disc) - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; - else - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; prot = &tcp_prot; sock->ops = &inet_stream_ops; break; @@ -359,7 +328,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; protocol = IPPROTO_UDP; sk->no_check = UDP_CSUM_DEFAULT; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; break; @@ -370,7 +338,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; prot = &raw_prot; sk->reuse = 1; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; sk->num = protocol; sock->ops = &inet_dgram_ops; if (protocol == IPPROTO_RAW) @@ -380,23 +347,22 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_badtype; } + if (ipv4_config.no_pmtu_disc) + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; + else + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; + 
sock_init_data(sock,sk); sk->destruct = inet_sock_destruct; - sk->zapped=0; -#ifdef CONFIG_TCP_NAGLE_OFF - sk->nonagle = 1; -#endif + sk->zapped = 0; sk->family = PF_INET; sk->protocol = protocol; sk->prot = prot; sk->backlog_rcv = prot->backlog_rcv; - sk->timer.data = (unsigned long)sk; - sk->timer.function = &tcp_keepalive_timer; - sk->protinfo.af_inet.ttl=sysctl_ip_default_ttl; sk->protinfo.af_inet.mc_loop=1; @@ -404,7 +370,9 @@ static int inet_create(struct socket *sock, int protocol) sk->protinfo.af_inet.mc_index=0; sk->protinfo.af_inet.mc_list=NULL; +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif if (sk->num) { /* It assumes that any protocol which allows @@ -469,11 +437,8 @@ int inet_release(struct socket *sock) * linger.. */ timeout = 0; - if (sk->linger && !(current->flags & PF_EXITING)) { - timeout = HZ * sk->lingertime; - if (!timeout) - timeout = MAX_SCHEDULE_TIMEOUT; - } + if (sk->linger && !(current->flags & PF_EXITING)) + timeout = sk->lingertime; sock->sk = NULL; sk->prot->close(sk, timeout); } @@ -496,10 +461,6 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && - chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { - return -EADDRNOTAVAIL; /* Source address MUST be ours! */ - } snum = ntohs(addr->sin_port); if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) @@ -555,25 +516,29 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, return sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } -static void inet_wait_for_connect(struct sock *sk) +static long inet_wait_for_connect(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); + /* Basic assumption: if someone sets sk->err, he _must_ + * change state of the socket from TCP_SYN_*. 
+ * Connect() does not allow to get error notifications + * without closing the socket. + */ while ((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - if (signal_pending(current)) - break; - if (sk->err) - break; release_sock(sk); - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); + if (signal_pending(current) || !timeo) + break; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); + return timeo; } /* @@ -586,16 +551,16 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, { struct sock *sk=sock->sk; int err; + long timeo; + + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { - lock_sock(sk); err = sk->prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - release_sock(sk); - return err; + goto out; } - lock_sock(sk); switch (sock->state) { default: err = -EINVAL; @@ -604,40 +569,58 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - if (tcp_established(sk->state)) { - sock->state = SS_CONNECTED; - err = 0; - goto out; - } - if (sk->err) - goto sock_error; err = -EALREADY; - if (flags & O_NONBLOCK) - goto out; + /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: + err = -EISCONN; + if (sk->state != TCP_CLOSE) + goto out; + + err = -EAGAIN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) + goto out; + sk->sport = htons(sk->num); + } + err = sk->prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; + sock->state = SS_CONNECTING; - } - if (sk->state > TCP_FIN_WAIT2) - goto sock_error; + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. 
+ */ + err = -EINPROGRESS; + break; + } - err = -EINPROGRESS; - if (!tcp_established(sk->state) && (flags & O_NONBLOCK)) - goto out; + timeo = sock_sndtimeo(sk, flags&O_NONBLOCK); if ((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - inet_wait_for_connect(sk); + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + err = -ERESTARTSYS; if (signal_pending(current)) goto out; } - if (sk->err && !tcp_established(sk->state)) - goto sock_error; + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->state == TCP_CLOSE) + goto sock_error; + + /* sk->err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + sock->state = SS_CONNECTED; err = 0; out: @@ -647,11 +630,9 @@ out: sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; - if (sk->prot->disconnect(sk, O_NONBLOCK)) + if (sk->prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; - release_sock(sk); - - return err; + goto out; } /* @@ -671,11 +652,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) BUG_TRAP((1<state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE)); - write_lock_irq(&sk2->callback_lock); - sk2->sleep = &newsock->wait; - newsock->sk = sk2; - sk2->socket = newsock; - write_unlock_irq(&sk2->callback_lock); + sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; release_sock(sk2); @@ -749,7 +726,7 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; - int err; + int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) 
@@ -759,35 +736,45 @@ int inet_shutdown(struct socket *sock, int how) 2->3 */ if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */ return -EINVAL; - if (!sk) - return -ENOTCONN; lock_sock(sk); - if (sock->state == SS_CONNECTING && tcp_established(sk->state)) - sock->state = SS_CONNECTED; - err = -ENOTCONN; - if (!tcp_connected(sk->state)) - goto out; - sk->shutdown |= how; - if (sk->prot->shutdown) - sk->prot->shutdown(sk, how); + if (sock->state == SS_CONNECTING) { + if ((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->state) { + default: + sk->shutdown |= how; + if (sk->prot->shutdown) + sk->prot->shutdown(sk, how); + break; + case TCP_CLOSE: + err = -ENOTCONN; + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + /* Wake up anyone sleeping in poll. */ sk->state_change(sk); - err = 0; -out: release_sock(sk); return err; } -unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait) -{ - struct sock *sk = sock->sk; - - if (sk->prot->poll == NULL) - return(0); - return sk->prot->poll(file, sock, wait); -} - /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. 
Some ioctls @@ -909,7 +896,7 @@ struct proto_ops inet_stream_ops = { sock_no_socketpair, inet_accept, inet_getname, - inet_poll, + tcp_poll, inet_ioctl, inet_listen, inet_shutdown, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 591f3cceba5e..588cdf030870 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.83 1999/12/15 22:39:03 davem Exp $ + * Version: $Id: arp.c,v 1.84 2000/01/18 08:24:14 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -487,7 +487,9 @@ void arp_send(int type, int ptype, u32 dest_ip, /* * Fill the device header for the ARP frame */ - dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len); + if (dev->hard_header && + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + goto out; /* * Fill out the arp protocol part. @@ -552,6 +554,10 @@ void arp_send(int type, int ptype, u32 dest_ip, skb->dev = dev; dev_queue_xmit(skb); + return; + +out: + kfree_skb(skb); } static void parp_redo(struct sk_buff *skb) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 11a8c319b4d2..23389d249fbe 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.44 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: ip_input.c,v 1.45 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -317,13 +317,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { + struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; - write_lock(&ip_rt_acct_lock); - ip_rt_acct[idx&0xFF].o_packets++; - ip_rt_acct[idx&0xFF].o_bytes+=skb->len; - ip_rt_acct[(idx>>16)&0xFF].i_packets++; - ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; - write_unlock(&ip_rt_acct_lock); + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes+=skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes+=skb->len; } #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 59e6ff8658be..2a4e3cf41774 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.77 2000/01/09 02:19:31 davem Exp $ + * Version: $Id: ip_output.c,v 1.78 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -149,8 +149,8 @@ output_maybe_reroute(struct sk_buff *skb) /* * Add an ip header to a skbuff and send it out. */ -void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) { struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; @@ -182,8 +182,8 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, ip_send_check(iph); /* Send it out. 
*/ - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + output_maybe_reroute); } static inline int ip_finish_output2(struct sk_buff *skb) @@ -257,7 +257,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL, + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c618689b2a16..90b74447fb37 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.46 2000/01/09 02:19:32 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $ * * Authors: see ip.c * @@ -415,7 +415,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->family == PF_INET || - ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + (!((1<state)&(TCPF_LISTEN|TCPF_CLOSE)) && sk->daddr != LOOPBACK4_IPV6)) { #endif if (opt) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b3e86f58cb9a..d6a7c57f5d02 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.38 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: proc.c,v 1.41 2000/01/21 23:45:57 davem Exp $ * * Authors: Fred N. van Kempen, * Gerald J. 
Heim, @@ -71,8 +71,9 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) int len = socket_get_info(buffer,start,offset,length); - len += sprintf(buffer+len,"TCP: inuse %d\n", - fold_prot_inuse(&tcp_prot)); + len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d\n", + fold_prot_inuse(&tcp_prot), + atomic_read(&tcp_orphan_count), tcp_tw_count); len += sprintf(buffer+len,"UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); len += sprintf(buffer+len,"RAW: inuse %d\n", @@ -163,7 +164,14 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length) len = sprintf(buffer, "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" " EmbryonicRsts PruneCalled RcvPruned OfoPruned" - " OutOfWindowIcmps LockDroppedIcmps\n" + " OutOfWindowIcmps LockDroppedIcmps" + " TW TWRecycled TWKilled" + " PAWSPassive PAWSActive PAWSEstab" + " DelayedACKs DelayedACKLocked DelayedACKLost" + " ListenOverflows ListenDrops" + " TCPPrequeued TCPDirectCopyFromBacklog" + " TCPDirectCopyFromPrequeue TCPPrequeueDropped" + " TCPHPHits TCPHPHitsToUser\n" "TcpExt:"); for (i=0; i * Fred N. van Kempen, @@ -648,10 +648,6 @@ struct proto raw_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ #ifdef CONFIG_IP_MROUTE ipmr_ioctl, /* ioctl */ #else @@ -669,7 +665,5 @@ struct proto raw_prot = { raw_v4_hash, /* hash */ raw_v4_unhash, /* unhash */ NULL, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "RAW", /* name */ }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index add42730d8a4..bbc6ec111fad 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.78 2000/01/13 00:06:58 davem Exp $ + * Version: $Id: route.c,v 1.80 2000/01/21 06:37:27 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -1178,6 +1178,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1385,6 +1386,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1462,6 +1464,7 @@ local_input: rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1815,6 +1818,7 @@ make_route: goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->key.tos = tos; rth->key.src = saddr; @@ -2208,8 +2212,7 @@ ctl_table ipv4_route_table[] = { #endif #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct ip_rt_acct[256]; -rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; +struct ip_rt_acct *ip_rt_acct; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -2217,14 +2220,34 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, { *start=buffer; - if (offset + length > sizeof(ip_rt_acct)) { - length = sizeof(ip_rt_acct) - offset; + if ((offset&3) || (length&3)) + return -EIO; + + if (offset + length >= sizeof(struct ip_rt_acct)*256) { + length = sizeof(struct ip_rt_acct)*256 - offset; *eof = 1; } if (length > 0) { - read_lock_bh(&ip_rt_acct_lock); - memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - read_unlock_bh(&ip_rt_acct_lock); + u32 *dst = (u32*)buffer; + u32 *src = (u32*)(((u8*)ip_rt_acct) + offset); + + memcpy(dst, src, length); + +#ifdef __SMP__ + if (smp_num_cpus > 1) { + int i; + int cnt = length/4; + + for (i=1; itp_pinfo.af_tcp; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sock *child; - /* Oops! 
It was missing, syn_recv_sock decreases it. */ - tp->syn_backlog++; + child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + if (child) + tcp_acceptq_queue(sk, req, child); + else + tcp_openreq_free(req); - sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst); - if (sk) { - req->sk = sk; - - /* Queue up for accept() */ - tcp_synq_queue(tp, req); - } else { - tp->syn_backlog--; - req->class->destructor(req); - tcp_openreq_free(req); - } - return sk; + return child; } struct sock * @@ -171,9 +164,9 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) } } } - + req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; - req->wscale_ok = 0; + req->wscale_ok = req->sack_ok = 0; req->expires = 0UL; req->retrans = 0; @@ -189,8 +182,8 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos | RTO_CONN, 0)) { - tcp_openreq_free(req); - return NULL; + tcp_openreq_free(req); + return NULL; } /* Try to redo what tcp_v4_send_synack did. */ @@ -198,6 +191,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) tcp_select_initial_window(tcp_full_space(sk),req->mss, &req->rcv_wnd, &req->window_clamp, 0, &rcv_wscale); + /* BTW win scale with syncookies is 0 by definition */ req->rcv_wscale = rcv_wscale; return get_cookie_sock(sk, skb, req, &rt->u.dst); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9465e40214d9..d9416525ba7f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.42 2000/01/09 02:19:37 davem Exp $ + * $Id: sysctl_net_ipv4.c,v 1.43 2000/01/16 05:11:27 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] @@ -41,26 +41,6 @@ extern int sysctl_ipfrag_time; /* From ip_output.c */ extern int sysctl_ip_dynaddr; -/* From ip_masq.c */ -extern int sysctl_ip_masq_debug; - -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_retrans_collapse; -extern int sysctl_tcp_keepalive_time; -extern int sysctl_tcp_keepalive_probes; -extern int sysctl_tcp_retries1; -extern int sysctl_tcp_retries2; -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_syn_retries; -extern int sysctl_tcp_stdurg; -extern int sysctl_tcp_rfc1337; -extern int sysctl_tcp_syn_taildrop; -extern int sysctl_max_syn_backlog; -extern int sysctl_tcp_tw_recycle; - /* From icmp.c */ extern int sysctl_icmp_destunreach_time; extern int sysctl_icmp_timeexceed_time; @@ -142,6 +122,12 @@ ctl_table ipv4_table[] = { &proc_dointvec}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_SYNACK_RETRIES, "tcp_synack_retries", + &sysctl_tcp_synack_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_ORPHANS, "tcp_max_orphans", + &sysctl_tcp_max_orphans, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets", + &sysctl_tcp_max_tw_buckets, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh", @@ -172,10 +158,10 @@ ctl_table ipv4_table[] = { {NET_TCP_SYNCOOKIES, "tcp_syncookies", &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, #endif -#ifdef CONFIG_TCP_TW_RECYCLE {NET_TCP_TW_RECYCLE, "tcp_tw_recycle", &sysctl_tcp_tw_recycle, sizeof(int), 0644, NULL, &proc_dointvec}, -#endif + {NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow", + &sysctl_tcp_abort_on_overflow, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_STDURG, 
"tcp_stdurg", &sysctl_tcp_stdurg, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, @@ -221,6 +207,8 @@ ctl_table ipv4_table[] = { {NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime", &inet_peer_gc_maxtime, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries", + &sysctl_tcp_orphan_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e24e19a4434..479836c28721 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.153 2000/01/09 02:19:33 davem Exp $ + * Version: $Id: tcp.c,v 1.158 2000/01/21 23:45:57 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -202,6 +202,8 @@ * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -432,113 +434,14 @@ kmem_cache_t *tcp_openreq_cachep; kmem_cache_t *tcp_bucket_cachep; kmem_cache_t *tcp_timewait_cachep; -/* - * Find someone to 'accept'. Must be called with - * the listening socket locked. - */ - -static struct open_request *tcp_find_established(struct tcp_opt *tp, - struct open_request **prevp) -{ - struct open_request *req = tp->syn_wait_queue; - struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; - while(req) { - if (req->sk) { - if((1 << req->sk->state) & - ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) - break; - } - prev = req; - req = req->dl_next; - } - *prevp = prev; - return req; -} - -/* - * Walk down the receive queue counting readable data. - * - * Must be called with the socket lock held. 
- */ - -static int tcp_readable(struct sock *sk) -{ - unsigned long counted; - unsigned long amount; - struct sk_buff *skb; - int sum; - - SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); - - skb = skb_peek(&sk->receive_queue); - if (skb == NULL) { - SOCK_DEBUG(sk, "empty\n"); - return(0); - } - - counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ - amount = 0; - - /* Do until a push or until we are out of data. */ - do { - /* Found a hole so stops here. */ - if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ - break; - - /* Length - header but start from where we are up to - * avoid overlaps. - */ - sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); - if (sum >= 0) { - /* Add it up, move on. */ - amount += sum; - counted += sum; - if (skb->h.th->syn) - counted++; - } - - /* Don't count urg data ... but do it in the right place! - * Consider: "old_data (ptr is here) URG PUSH data" - * The old code would stop at the first push because - * it counted the urg (amount==1) and then does amount-- - * *after* the loop. This means tcp_readable() always - * returned zero if any URG PUSH was in the queue, even - * though there was normal data available. If we subtract - * the urg data right here, we even get it to work for more - * than one URG PUSH skb without normal data. - * This means that poll() finally works now with urg data - * in the queue. Note that rlogin was never affected - * because it doesn't use poll(); it uses two processes - * and a blocking read(). And the queue scan in tcp_read() - * was correct. Mike - */ - - /* Don't count urg data. */ - if (skb->h.th->urg) - amount--; -#if 0 - if (amount && skb->h.th->psh) break; -#endif - skb = skb->next; - } while(skb != (struct sk_buff *)&sk->receive_queue); - - SOCK_DEBUG(sk, "got %lu bytes.\n",amount); - return(amount); -} +atomic_t tcp_orphan_count = ATOMIC_INIT(0); /* * LISTEN is a special case for poll.. 
*/ -static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - struct open_request *req, *dummy; - - lock_sock(sk); - req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); - release_sock(sk); - if (req) - return POLLIN | POLLRDNORM; - return 0; + return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0; } /* @@ -585,9 +488,25 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * if you don't tell them that something has hung up! * * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE) mask |= POLLHUP; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; /* Connected? 
*/ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { @@ -605,7 +524,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) } } - if (tp->urg_data & URG_VALID) + if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; } return mask; @@ -631,32 +550,48 @@ void tcp_write_space(struct sock *sk) read_unlock(&sk->callback_lock); } +/* Listening TCP sockets never sleep to wait for memory, so + * it is completely silly to wake them up on queue space + * available events. So we hook them up to this dummy callback. + */ +static void tcp_listen_write_space(struct sock *sk) +{ +} int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int answ; switch(cmd) { - case TIOCINQ: -#ifdef FIXME /* FIXME: */ - case FIONREAD: -#endif + case SIOCINQ: if (sk->state == TCP_LISTEN) return(-EINVAL); + lock_sock(sk); - answ = tcp_readable(sk); + if ((1<state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else if (sk->urginline || !tp->urg_data || + before(tp->urg_seq,tp->copied_seq) || + !before(tp->urg_seq,tp->rcv_nxt)) + answ = tp->rcv_nxt - tp->copied_seq; + else + answ = tp->urg_seq - tp->copied_seq; release_sock(sk); break; case SIOCATMARK: { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); answ = tp->urg_data && tp->urg_seq == tp->copied_seq; break; } - case TIOCOUTQ: + case SIOCOUTQ: if (sk->state == TCP_LISTEN) return(-EINVAL); - answ = sock_wspace(sk); + + if ((1<state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; break; default: return(-ENOIOCTLCMD); @@ -665,12 +600,131 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int *)arg); } + +int tcp_listen_start(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt; + + sk->max_ack_backlog = 0; + sk->ack_backlog = 0; + tp->accept_queue = NULL; + tp->syn_wait_lock = RW_LOCK_UNLOCKED; + + lopt = kmalloc(sizeof(struct tcp_listen_opt), 
GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1<max_qlen_log) >= sysctl_max_syn_backlog) + break; + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + sk->state = TCP_LISTEN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) { + sk->state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EAGAIN; + } + sk->sport = htons(sk->num); + } else { + if (sk->prev) + ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; + } + + sk_dst_reset(sk); + sk->prot->hash(sk); + sk->socket->flags |= SO_ACCEPTCON; + sk->write_space = tcp_listen_write_space; + + return 0; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ + +static void tcp_listen_stop (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt =NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = NULL; + + if (lopt->qlen) { + for (i=0; isyn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. 
--ANK + */ + } + } + } + BUG_TRAP(lopt->qlen == 0); + + kfree(lopt); + + while ((req=acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(child->lock.users==0); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(sk->ack_backlog == 0); +} + /* * Wait for a socket to get into the connected state * * Note: Must be called with the socket locked. */ -static int wait_for_tcp_connect(struct sock * sk, int flags) +static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -684,7 +738,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) send_sig(SIGPIPE, tsk, 0); return -EPIPE; } - if(flags & MSG_DONTWAIT) + if(!*timeo_p) return -EAGAIN; if(signal_pending(tsk)) return -ERESTARTSYS; @@ -694,7 +748,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) sk->tp_pinfo.af_tcp.write_pending++; release_sock(sk); - schedule(); + *timeo_p = schedule_timeout(*timeo_p); lock_sock(sk); __set_task_state(tsk, TASK_RUNNING); @@ -712,7 +766,7 @@ static inline int tcp_memory_free(struct sock *sk) /* * Wait for more memory for a socket */ -static void wait_for_tcp_memory(struct sock * sk) +static long wait_for_tcp_memory(struct sock * sk, long timeo) { if (!tcp_memory_free(sk)) { DECLARE_WAITQUEUE(wait, current); @@ -732,12 +786,13 @@ static void wait_for_tcp_memory(struct sock * sk) break; release_sock(sk); if (!tcp_memory_free(sk)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } + return timeo; } /* When all user supplied data has been queued set the PSH bit */ @@ -746,11 +801,9 
@@ static void wait_for_tcp_memory(struct sock * sk) /* * This routine copies from a user buffer into a socket, * and starts the transmit system. - * - * Note: must be called with the socket locked. */ -int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) { struct iovec *iov; struct tcp_opt *tp; @@ -758,15 +811,22 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) int iovlen, flags; int mss_now; int err, copied; + long timeo; err = 0; tp = &(sk->tp_pinfo.af_tcp); - /* Wait for a connection to finish. */ + lock_sock(sk); + TCP_CHECK_TIMER(sk); + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT); + + /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) - if((err = wait_for_tcp_connect(sk, flags)) != 0) - goto out; + if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0) + goto out_unlock; /* This should be in poll */ sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ @@ -777,7 +837,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iovlen = msg->msg_iovlen; iov = msg->msg_iov; copied = 0; - + while(--iovlen >= 0) { int seglen=iov->iov_len; unsigned char * from=iov->iov_base; @@ -785,7 +845,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iov++; while(seglen > 0) { - int copy, tmp, queue_it, psh; + int copy, tmp, queue_it; if (err) goto do_fault2; @@ -811,8 +871,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * welcome. */ if (skb_tailroom(skb) > 0 && - (mss_now - copy) > 0 && - tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { + (mss_now - copy) > 0) { int last_byte_was_odd = (copy % 4); copy = mss_now - copy; @@ -855,34 +914,17 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) } } - /* We also need to worry about the window. If - * window < 1/2 the maximum window we've seen - * from this host, don't use it. 
This is - * sender side silly window prevention, as - * specified in RFC1122. (Note that this is - * different than earlier versions of SWS - * prevention, e.g. RFC813.). What we - * actually do is use the whole MSS. Since - * the results in the right edge of the packet - * being outside the window, it will be queued - * for later rather than sent. + /* A chunk was here doing something strange + * with psh etc. It is deleted, because it was + * evident non-sense. --ANK */ - psh = 0; - copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if(copy > (tp->max_window >> 1)) { - copy = min(copy, mss_now); - psh = 1; - } else { - copy = mss_now; - } - if(copy > seglen) - copy = seglen; + + copy = min(seglen, mss_now); /* Determine how large of a buffer to allocate. */ - tmp = MAX_HEADER + sk->prot->max_header; - if (copy < min(mss_now, tp->max_window >> 1) && - !(flags & MSG_OOB)) { - tmp += min(mss_now, tp->max_window); + tmp = MAX_TCP_HEADER + 15; + if (copy < mss_now && !(flags & MSG_OOB)) { + tmp += mss_now; /* What is happening here is that we want to * tack on later members of the users iovec @@ -901,7 +943,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { sk->socket->flags |= SO_NOSPACE; - if (flags&MSG_DONTWAIT) { + if (!timeo) { err = -EAGAIN; goto do_interrupted; } @@ -909,8 +951,8 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) err = -ERESTARTSYS; goto do_interrupted; } - tcp_push_pending_frames(sk, tp); - wait_for_tcp_memory(sk); + __tcp_push_pending_frames(sk, tp, mss_now); + timeo = wait_for_tcp_memory(sk, timeo); /* If SACK's were formed or PMTU events happened, * we must find out about it. @@ -923,7 +965,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* Prepare control bits for TCP header creation engine. */ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | - ((PSH_NEEDED || psh) ? + ((PSH_NEEDED) ? 
TCPCB_FLAG_PSH : 0)); TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { @@ -936,7 +978,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * TCP+IP+DEV headers are SKB_PUSH()'d beneath. * Reserve header space and checksum the data. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); @@ -950,7 +992,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; /* This advances tp->write_seq for us. */ - tcp_send_skb(sk, skb, queue_it); + tcp_send_skb(sk, skb, queue_it, mss_now); } } sk->err = 0; @@ -981,63 +1023,40 @@ do_fault: do_fault2: err = -EFAULT; out: + __tcp_push_pending_frames(sk, tp, mss_now); + TCP_CHECK_TIMER(sk); +out_unlock: + release_sock(sk); tcp_push_pending_frames(sk, tp); return err; } #undef PSH_NEEDED -/* - * Send an ack if one is backlogged at this point. Ought to merge - * this with tcp_send_ack(). - * This is called for delayed acks also. - */ - -void tcp_read_wakeup(struct sock *sk) -{ - /* If we're closed, don't send an ack, or we'll get a RST - * from the closed destination. - */ - if (sk->state != TCP_CLOSE) - tcp_send_ack(sk); -} - /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock * sk, int nonblock, +static int tcp_recv_urg(struct sock * sk, long timeo, struct msghdr *msg, int len, int flags, int *addr_len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ - if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) + if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! 
*/ if (sk->done) return -ENOTCONN; - if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { - sk->done = 1; - return 0; - } - - if (tp->urg_data & URG_VALID) { + if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = URG_READ; - - if(msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); - - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->urg_data = TCP_URG_READ; /* Read urgent data. */ msg->msg_flags|=MSG_OOB; @@ -1051,6 +1070,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, return err ? -EFAULT : len; } + /* Do not set sk->done, it is set only by normal data receive */ + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) + return 0; + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the @@ -1069,6 +1092,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { __skb_unlink(skb, &sk->receive_queue); + BUG_TRAP(atomic_read(&skb->users) == 1); + /* Well, if I missed something then punishment will be terrible oops. */ __kfree_skb(skb); } @@ -1080,22 +1105,34 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) */ static void cleanup_rbuf(struct sock *sk, int copied) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; + int time_to_ack; /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ while ((skb=skb_peek(&sk->receive_queue)) != NULL) { - if (!skb->used || atomic_read(&skb->users) > 1) + if (!skb->used) break; tcp_eat_skb(sk, skb); } + /* Delayed ACKs frequently hit locked sockets during bulk receive. 
*/ + time_to_ack = tp->ack.blocked && tp->ack.pending; +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + time_to_ack = 1; +#endif + /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. */ - if(copied > 0) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); __u32 new_window = __tcp_select_window(sk); @@ -1106,16 +1143,20 @@ static void cleanup_rbuf(struct sock *sk, int copied) * which don't advertize a larger window. */ if((new_window && (new_window >= rcv_window_now * 2)) && - ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) - tcp_read_wakeup(sk); + ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp)) + time_to_ack = 1; } + if (time_to_ack) + tcp_send_ack(sk); } /* Now socket state including sk->err is changed only under lock, - hence we should check only pending signals. + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. 
*/ -static void tcp_data_wait(struct sock *sk) +static long tcp_data_wait(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); @@ -1127,17 +1168,39 @@ static void tcp_data_wait(struct sock *sk) release_sock(sk); if (skb_queue_empty(&sk->receive_queue)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); sk->socket->flags &= ~SO_WAITDATA; remove_wait_queue(sk->sleep, &wait); __set_current_state(TASK_RUNNING); + return timeo; +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue); + + /* RX process wants to run with disabled BHs, though it is not necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. */ + tp->ucopy.memory = 0; } /* * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. */ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, @@ -1146,13 +1209,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int copied = 0; u32 peek_seq; - volatile u32 *seq; /* So gcc doesn't overoptimise */ + u32 *seq; unsigned long used; int err; - int target = 1; /* Read at least this many bytes */ + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; lock_sock(sk); + TCP_CHECK_TIMER(sk); + + if (sk->err) goto out_err; @@ -1160,24 +1228,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (sk->state == TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, nonblock); + /* Urgent data needs to be handled specially. 
*/ if (flags & MSG_OOB) goto recv_urg; - /* Copying sequence to update. This is volatile to handle - * the multi-reader case neatly (memcpy_to/fromfs might be - * inline and thus not flush cached variables otherwise). - */ - peek_seq = tp->copied_seq; seq = &tp->copied_seq; - if (flags & MSG_PEEK) + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; seq = &peek_seq; + } - /* Handle the POSIX bogosity MSG_WAITALL. */ - if (flags & MSG_WAITALL) - target=len; + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - /* * BUG BUG BUG * This violates 1003.1g compliance. We must wait for @@ -1200,7 +1264,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (copied) break; copied = -ERESTARTSYS; - if (nonblock) + if (!timeo) copied = -EAGAIN; break; } @@ -1232,47 +1296,98 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, skb = skb->next; } while (skb != (struct sk_buff *)&sk->receive_queue); - if (copied >= target) + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && sk->backlog.tail == NULL) break; - if (sk->err && !(flags&MSG_PEEK)) { - if (!copied) + if (copied) { + if (sk->err || + sk->state == TCP_CLOSE || + (sk->shutdown & RCV_SHUTDOWN) || + !timeo) + break; + } else { + if (sk->err) { copied = sock_error(sk); - break; - } + break; + } - if (sk->shutdown & RCV_SHUTDOWN) { - sk->done = 1; - break; - } + if (sk->done) { + copied = -ENOTCONN; + break; + } - if (sk->state == TCP_CLOSE) { - if (!sk->done) { - sk->done = 1; + if (sk->state == TCP_CLOSE) { + if (!(flags&MSG_PEEK)) + sk->done = 1; break; } - if (!copied) - copied = -ENOTCONN; - break; - } - if (nonblock) { - copied = -EAGAIN; - break; + if (sk->shutdown & RCV_SHUTDOWN) + break; + + if (!timeo) { + copied = -EAGAIN; + break; + } } cleanup_rbuf(sk, copied); - tcp_data_wait(sk); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (user_recv == NULL && !(flags&MSG_PEEK)) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov 
= msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt); + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else { + timeo = tcp_data_wait(sk, timeo); + } + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk; + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + tcp_send_ack(sk); +#endif + } continue; found_ok_skb: - /* Lock the buffer. We can be fairly relaxed as - * an interrupt will never steal a buffer we are - * using unless I've missed something serious in - * tcp_data. - */ - atomic_inc(&skb->users); - /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -1293,36 +1408,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, } } - /* Copy it - We _MUST_ update *seq first so that we - * don't ever double read when we have dual readers - */ - *seq += used; - - /* This memcpy_toiovec can sleep. If it sleeps and we - * do a second read it relies on the skb->users to avoid - * a crash when cleanup_rbuf() gets called. - */ err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); if (err) { /* Exception. Bailout! */ - atomic_dec(&skb->users); - copied = -EFAULT; + if (!copied) + copied = -EFAULT; break; } + *seq += used; copied += used; len -= used; - /* We now will not sleep again until we are finished - * with skb. 
Sorry if you are doing the SMP port - * but you'll just have to fix it neatly ;) - * - * Very funny Alan... -DaveM - */ - atomic_dec(&skb->users); - - if (after(tp->copied_seq,tp->urg_seq)) + if (after(tp->copied_seq,tp->urg_seq)) { tp->urg_data = 0; + if (skb_queue_len(&tp->out_of_order_queue) == 0 +#ifdef TCP_FORMAL_WINDOW + && tcp_receive_window(tp) +#endif + ) { + tcp_fast_path_on(tp); + } + } if (used + offset < skb->len) continue; @@ -1334,8 +1441,30 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (flags & MSG_PEEK) continue; skb->used = 1; - if (atomic_read(&skb->users) == 1) - tcp_eat_skb(sk, skb); + tcp_eat_skb(sk, skb); + +#ifdef CONFIG_TCP_LESS_COARSE_ACKS + /* Possible improvement. When sender is faster than receiver, + * traffic looks like: fill window ... wait for window open ... + * fill window. We lose at least one rtt, because call + * cleanup_rbuf only once. Probably, if "len" was large + * we should insert several intermediate cleanup_rbuf(s). + * + * F.e.: + */ + do { + u32 full_space = min(tp->window_clamp, tcp_full_space(sk)); + + /* Try to ACK, if total buffer length is larger + than maximal window and if rcv_window has + chances to increase twice. It will result + to exponentially decreased ACKing during + read to huge (usually, mmapped) buffer. + */ + if (len >= full_space && tp->rcv_wnd <= full_space/2) + cleanup_rbuf(sk, copied); + } while (0); +#endif continue; found_fin_ok: @@ -1345,19 +1474,36 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* All is done. */ skb->used = 1; - sk->shutdown |= RCV_SHUTDOWN; break; } - if (copied >= 0 && msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? 
len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ /* Clean up data we have read: This will do ACK frames. */ cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1365,23 +1511,15 @@ out_err: err = sock_error(sk); out: + TCP_CHECK_TIMER(sk); release_sock(sk); return err; recv_urg: - err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); goto out; } -/* - * Check whether to renew the timer. - */ -static inline void tcp_check_fin_timer(struct sock *sk) -{ - if (sk->state == TCP_FIN_WAIT2) - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); -} - /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some @@ -1405,24 +1543,13 @@ static unsigned char new_state[16] = { /* TCP_CLOSING */ TCP_CLOSING, }; -static int tcp_close_state(struct sock *sk, int dead) +static int tcp_close_state(struct sock *sk) { int next = (int) new_state[sk->state]; int ns = (next & TCP_STATE_MASK); tcp_set_state(sk, ns); - /* This is a (useful) BSD violating of the RFC. There is a - * problem with TCP as specified in that the other end could - * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill - * our end. If they send after that then tough - BUT: long enough - * that we won't make the old 4*rto = almost no time - whoops - * reset mistake. 
- */ - if (dead) - tcp_check_fin_timer(sk); - return (next & TCP_ACTION_FIN); } @@ -1443,9 +1570,8 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { - /* Clear out any half completed packets. FIN if needed. */ - if (tcp_close_state(sk,0)) + if (tcp_close_state(sk)) tcp_send_fin(sk); } } @@ -1460,40 +1586,6 @@ static inline int closing(struct sock * sk) return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. Currently it is only called by - * tcp_close. - */ - -static void tcp_close_pending (struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *req = tp->syn_wait_queue; - - while(req) { - struct open_request *iter; - - if (req->sk) - tcp_close(req->sk, 0); - - iter = req; - req = req->dl_next; - - if (iter->sk) { - sk->ack_backlog--; - } else { - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - } - (*iter->class->destructor)(iter); - tcp_openreq_free(iter); - } - BUG_TRAP(tp->syn_backlog == 0); - BUG_TRAP(sk->ack_backlog == 0); - tcp_synq_init(tp); -} - static __inline__ void tcp_kill_sk_queues(struct sock *sk) { /* First the read buffer. 
*/ @@ -1528,6 +1620,14 @@ void tcp_destroy_sock(struct sock *sk) /* It it has not 0 sk->num, it must be bound */ BUG_TRAP(!sk->num || sk->prev!=NULL); +#ifdef TCP_DEBUG + if (sk->zapped) { + printk("TCP: double destroy sk=%p\n", sk); + sock_hold(sk); + } + sk->zapped = 1; +#endif + sk->prot->destroy(sk); tcp_kill_sk_queues(sk); @@ -1538,6 +1638,7 @@ void tcp_destroy_sock(struct sock *sk) } #endif + atomic_dec(&tcp_orphan_count); sock_put(sk); } @@ -1547,17 +1648,17 @@ void tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; lock_sock(sk); + sk->shutdown = SHUTDOWN_MASK; + if(sk->state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_close_pending(sk); + tcp_listen_stop(sk); goto adjudge_to_death; } - sk->shutdown = SHUTDOWN_MASK; - /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! @@ -1581,10 +1682,35 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); - } else if (tcp_close_state(sk,1)) { + } else if (sk->linger && sk->lingertime==0) { + /* Check zero linger _after_ checking for unread data. */ + sk->prot->disconnect(sk, 0); + } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. 
+ * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ tcp_send_fin(sk); } @@ -1594,26 +1720,19 @@ void tcp_close(struct sock *sk, long timeout) add_wait_queue(sk->sleep, &wait); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (!closing(sk)) break; release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); - if (!signal_pending(tsk) || timeout) - break; - } + } while (!signal_pending(tsk) && timeout); tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } - /* Now that the socket is dead, if we are in the FIN_WAIT2 state - * we may need to set up a timer. - */ - tcp_check_fin_timer(sk); - adjudge_to_death: /* It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); @@ -1627,23 +1746,67 @@ adjudge_to_death: BUG_TRAP(sk->lock.users==0); sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. 
--ANK + */ + + if (sk->state == TCP_FIN_WAIT2) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } else { + int tmo = tcp_fin_time(tp); - /* Announce socket dead, detach it from wait queue and inode. */ - write_lock_irq(&sk->callback_lock); - sk->dead = 1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->state != TCP_CLOSE && + atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } + atomic_inc(&tcp_orphan_count); if (sk->state == TCP_CLOSE) tcp_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. 
*/ +out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } +/* These states need RST on ABORT according to RFC793 */ + +extern __inline__ int tcp_need_reset(int state) +{ + return ((1 << state) & + (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_FIN_WAIT2|TCPF_SYN_RECV)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -1656,9 +1819,14 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_close_pending(sk); - } else if (tcp_connected(old_state)) { - tcp_send_active_reset(sk, GFP_KERNEL); + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1<err = ECONNRESET; } else if (old_state == TCP_SYN_SENT) sk->err = ECONNRESET; @@ -1677,26 +1845,25 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16); #endif - sk->zapped = 0; sk->shutdown = 0; sk->done = 0; sk->write_space = tcp_write_space; tp->srtt = 0; -#ifdef CONFIG_TCP_TW_RECYCLE - if ((tp->write_seq += 2) == 0) - tp->write_seq = 1; -#else - tp->write_seq = 0; -#endif - tp->ato = 0; + if (sysctl_tcp_tw_recycle) { + if ((tp->write_seq += 2) == 0) + tp->write_seq = 1; + } else { + tp->write_seq = 0; + } tp->backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; + tp->packets_out = 0; tp->high_seq = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->dup_acks = 0; - tp->delayed_acks = 0; + tcp_delack_init(tp); tp->send_head = tp->retrans_head = NULL; tp->saw_tstamp = 0; __sk_dst_reset(sk); @@ -1712,11 +1879,10 @@ int tcp_disconnect(struct sock *sk, int flags) * conditions. This must be called with the socket locked, * and without the kernel lock held. 
*/ -static struct open_request * wait_for_connect(struct sock * sk, - struct open_request **pprev) +static int wait_for_connect(struct sock * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); - struct open_request *req; + int err; /* * True wake-one mechanism for incoming connections: only @@ -1736,17 +1902,25 @@ static struct open_request * wait_for_connect(struct sock * sk, for (;;) { current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE; release_sock(sk); - schedule(); + if (sk->tp_pinfo.af_tcp.accept_queue == NULL) + timeo = schedule_timeout(timeo); lock_sock(sk); - req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); - if (req) + err = 0; + if (sk->tp_pinfo.af_tcp.accept_queue) + break; + err = -EINVAL; + if (sk->state != TCP_LISTEN) break; + err = -ERESTARTSYS; if (signal_pending(current)) break; + err = -EAGAIN; + if (!timeo) + break; } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); - return req; + return err; } /* @@ -1758,9 +1932,10 @@ static struct open_request * wait_for_connect(struct sock * sk, struct sock *tcp_accept(struct sock *sk, int flags, int *err) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct open_request *req, *prev; + struct open_request *req; struct sock *newsk; int error; + long timeo; lock_sock(sk); @@ -1771,25 +1946,27 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) if (sk->state != TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + /* Find already established connection */ - req = tcp_find_established(tp, &prev); - if (!req) { + if (!tp->accept_queue) { /* If this is a non blocking socket don't sleep */ error = -EAGAIN; - if (flags & O_NONBLOCK) + if (!timeo) goto out; - error = -ERESTARTSYS; - req = wait_for_connect(sk, &prev); - if (!req) + error = wait_for_connect(sk, timeo); + if (error) goto out; } - tcp_synq_unlink(tp, req, prev); - newsk = req->sk; - req->class->destructor(req); - tcp_openreq_free(req); - sk->ack_backlog--; + req = tp->accept_queue; + 
tp->accept_queue = req->dl_next; + + newsk = req->sk; + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->state != TCP_SYN_RECV); release_sock(sk); return newsk; @@ -1828,7 +2005,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * the point when this call is done we typically don't yet know * which interface is going to be used */ - if(val < 1 || val > MAX_WINDOW) { + if(val < 8 || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } @@ -1839,11 +2016,11 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, /* You cannot try to use this and TCP_CORK in * tandem, so let the user know. */ - if (sk->nonagle == 2) { + if (tp->nonagle == 2) { err = -EINVAL; break; } - sk->nonagle = (val == 0) ? 0 : 1; + tp->nonagle = (val == 0) ? 0 : 1; break; case TCP_CORK: @@ -1858,14 +2035,14 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * You cannot try to use TCP_NODELAY and this mechanism * at the same time, so let the user know. 
*/ - if (sk->nonagle == 1) { + if (tp->nonagle == 1) { err = -EINVAL; break; } if (val != 0) { - sk->nonagle = 2; + tp->nonagle = 2; } else { - sk->nonagle = 0; + tp->nonagle = 0; tcp_push_pending_frames(sk, tp); } @@ -1905,6 +2082,38 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->syn_retries = val; break; + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout/HZ) + tp->linger2 = 0; + else + tp->linger2 = val*HZ; + break; + + case TCP_DEFER_ACCEPT: + tp->defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of retransmits */ + while (val > ((TCP_TIMEOUT_INIT/HZ)<defer_accept)) + tp->defer_accept++; + tp->defer_accept++; + } + break; + + case TCP_WINDOW_CLAMP: + if (val==0) { + if (sk->state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else { + tp->window_clamp = valuser_mss; + val = tp->mss_cache; + if (val == 0 && ((1<state)&(TCPF_CLOSE|TCPF_LISTEN))) + val = tp->user_mss; break; case TCP_NODELAY: - val = (sk->nonagle == 1); + val = (tp->nonagle == 1); break; case TCP_CORK: - val = (sk->nonagle == 2); + val = (tp->nonagle == 2); break; case TCP_KEEPIDLE: - if (tp->keepalive_time) - val = tp->keepalive_time / HZ; - else - val = sysctl_tcp_keepalive_time / HZ; + val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ; break; case TCP_KEEPINTVL: - if (tp->keepalive_intvl) - val = tp->keepalive_intvl / HZ; - else - val = sysctl_tcp_keepalive_intvl / HZ; + val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ; break; case TCP_KEEPCNT: - if (tp->keepalive_probes) - val = tp->keepalive_probes; - else - val = sysctl_tcp_keepalive_probes; + val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - if (tp->syn_retries) - val = tp->syn_retries; - else - val = sysctl_tcp_syn_retries; + val = tp->syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val > 0) + val = (val ? 
: sysctl_tcp_fin_timeout)/HZ; + break; + case TCP_DEFER_ACCEPT: + val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; break; default: return -ENOPROTOOPT; @@ -2049,11 +2259,20 @@ void __init tcp_init(void) tcp_bhash[i].chain = NULL; } + /* Try to be a bit smarter and adjust defaults depending + * on available memory. + */ if (order > 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; + sysctl_tcp_max_tw_buckets = 180000; + sysctl_tcp_max_orphans = 4096<<(order-4); + sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024*(3-order); + sysctl_tcp_max_tw_buckets >>= (3-order); + sysctl_tcp_max_orphans >>= (3-order); + sysctl_max_syn_backlog = 128; } tcp_port_rover = sysctl_local_port_range[0] - 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b4ae64a2b36..fd869075d17e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $ + * Version: $Id: tcp_input.c,v 1.182 2000/01/21 23:45:59 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -70,9 +70,6 @@ #define SYNC_INIT 1 #endif -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; - /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM */ @@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle; +int sysctl_tcp_tw_recycle = 1; +int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; static int prune_queue(struct sock *sk); +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. 
+ * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + if (len == lss) + tp->ack.rcv_mss = len; + tp->ack.last_seg_size = len; + } + +#if 0 + /* Tiny-grams with PSH set artifically deflate our + * ato measurement. + * + * Mmm... I copied this test from tcp_remember_ack(), but + * I did not understand this. Is it to speedup nagling sender? + * It does not because classic (non-Minshall) sender nagles + * guided by not-acked frames not depending on size. + * And it does not help NODELAY sender, because latency + * is too high in any case. The only result is timer trashing + * and redundant ACKs. Grr... 
Seems, I missed something. --ANK + * + * Let me to comment out this yet... TCP should work + * perfectly without this. --ANK + */ + if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh) + tp->ack.ato = TCP_ATO_MIN; +#endif + } +} + + +static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + + tp->ack.quick = max(min(quickacks, 127), 1); + + if (!tp->tstamp_ok && tp->ack.quick>2) { + /* Quick ACKs are _dangerous_, if RTTM is not used. + * See comment in tcp_init_metrics(). We still help + * them to overcome the most difficult, initial + * phase of slow start. + */ + tp->ack.quick = 2; + } +} + +/* Send ACKs quickly, if "quick" count is not ehausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk); * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_delack_estimator(struct tcp_opt *tp) +static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) { - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; + u32 now; + + tcp_measure_rcv_mss(tp, skb); + + tp->ack.pending = 1; - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. */ - tp->ato = 1; + + /* Help sender leave slow start quickly. 
*/ tcp_enter_quickack_mode(tp); + + /* Pingpong is off, session is not interactive by default */ + tp->ack.pingpong = 0; + + /* ATO is minimal */ + tp->ack.ato = TCP_ATO_MIN; } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. + int m = now - tp->ack.lrcvtime; + + if (m > TCP_ATO_MAX/2) { + /* Do not touch ATO, if interval is out of bounds. + * It will be deflated by delack timer, if our peer + * really sends too rarely. */ - tp->ato = ((tp->ato << 1) >> 2) + m; + if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_enter_quickack_mode(tp); + } + } else { + if (m <= 0) + m = TCP_ATO_MIN/2; + tp->ack.ato = (tp->ack.ato >> 1) + m; } } + tp->ack.lrcvtime = now; } -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; + if (tp->rto < TCP_RTO_MIN) + tp->rto = TCP_RTO_MIN; + else if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. 
@@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst) { + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { @@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk) return; } - dst_confirm(dst); - m = dst->rtt - tp->srtt; /* If newly calculated rtt larger than stored one, @@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk) dst_confirm(dst); + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst->rtt == 0) goto reset; - if (!tp->srtt || !tp->saw_tstamp) + if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst->rttvar; tcp_set_rto(tp); tcp_bound_rto(tp); - - if (dst->mxlock&(1<<RTAX_CWND)) - tp->snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } + if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp); return; @@ -357,9 +454,6 @@ reset: } } -#define PAWS_24DAYS (60 * 60 * 24 * 24) - - /* WARNING: this must not be called if tp->saw_tstamp was false.
*/ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) @@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) */ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = xtime.tv_sec; } @@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif - if (tp->rcv_wnd && + if (rcv_wnd && after(end_seq, tp->rcv_nxt) && before(seq, end_window)) return 1; @@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* This functions checks to see if the tcp header is actually acceptable. */ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); + return (rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } @@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { - sk->zapped = 1; - /* We want the right error as BSD sees it (and indeed as we do). 
*/ switch (sk->state) { case TCP_SYN_SENT: @@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk) return; default: sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); + } + tcp_done(sk); } @@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; + __tcp_enter_cong_avoid(tp); + /* ... and account for 3 ACKs, which are + * already received to this time. + */ + tp->snd_cwnd += 3; + if(!tp->fackets_out) tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else if (++tp->dup_acks > 3) { /* 2. Each time another duplicate ACK arrives, increment @@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else { /* FACK style, fill any remaining holes in @@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd @@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Our probe was answered. */ - tp->probes_out = 0; - /* Was it a usable window open? 
*/ - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (tp->send_head != NULL) { + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* If packets_out==0, socket must be waked up by + * subsequent tcp_data_snd_check(). This function is + * not for random using! + */ + } else if (!tp->packets_out) { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } } } - + /* Should we open up the congestion window? */ static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) { @@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); +#ifdef TCP_DEBUG + /* It occured in 2.3, because of racy timers. Namely, + * retransmit timer did not check packets_out and retransmitted + * send_head sometimes and, hence, messed all the write_queue. + * Now it is impossible, I bet. --ANK + */ + if (skb == NULL) { + printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); + return; + } +#endif + /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. 
*/ if (tp->retransmits) { tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } else { __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); if ((__s32)when < 0) when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); } } @@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 seq = 0; u32 seq_rtt = 0; - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; + if(sk->state == TCP_CLOSE) + return 1; /* Dead, can't ack any more so why bother */ /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, goto uninteresting_ack; /* If there is data set flag 1 */ - if (len != th->doff*4) { + if (len != th->doff*4) flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } /* Update our send window. */ @@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; } } + /* BEWARE! From this place and until return from this function + * snd_nxt and snd_wnd are out of sync. 
All the routines, called + * from here must get "ack" as argument or they should not depend + * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK + */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; + tp->probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tp->pending == TIME_PROBE0) + if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) tcp_ack_probe(sk, ack); - /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ @@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); @@ -1074,9 +1204,42 @@ uninteresting_ack: return 0; } +int tcp_paws_check(struct tcp_opt *tp, int rst) +{ + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. 
+ Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + /* New-style handling of TIME_WAIT sockets. */ -/* Must be called only from BH context. */ +/* Must be called with locally disabled BHs. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { struct tcp_ehash_bucket *ehead; @@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_tw_put(tw); } -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets until none show up for the - * entire timeout period. The only special cases are for BSD TIME_WAIT - * reconnects and SYN/RST bits being set in the TCP header. - */ - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. 
--ANK */ enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, @@ -1157,7 +1319,71 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcp_opt tp; int paws_reject = 0; - /* RFC 1122: + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + paws_reject = tp.saw_tstamp && tcp_paws_check(&tp, th->rst); + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. 
+ * + * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1171,47 +1397,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * to be an old duplicate". */ - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - paws_reject = tp.saw_tstamp && - ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && - xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); - } - if (!paws_reject && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { -#ifdef CONFIG_TCP_TW_RECYCLE - /* When recycling, always follow rfc1337, - * but mark bucket as ready to recycling immediately. - */ - if (sysctl_tcp_tw_recycle) { - /* May kill it now. */ - tw->rto = 0; - tw->ttd = jiffies; - } else -#endif /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if(sysctl_tcp_rfc1337 == 0) { + if (sysctl_tcp_rfc1337 == 0) { +kill: tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } - } else { - tcp_tw_reschedule(tw); } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tp.saw_tstamp) { tw->ts_recent = tp.rcv_tsval; tw->ts_recent_stamp = xtime.tv_sec; } + tcp_tw_put(tw); return TCP_TW_SUCCESS; } @@ -1235,7 +1445,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { u32 isn = tw->snd_nxt + 2; if (isn == 0) isn++; @@ -1243,20 +1453,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SYN; } + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + if(!th->rst) { /* In this case we must reset the TIMEWAIT timer. - - If it is ACKless SYN it may be both old duplicate - and new good SYN with random sequence number <rcv_nxt. - Do not reschedule in the last case. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. */ - if (paws_reject || th->ack) { - tcp_tw_reschedule(tw); -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = min(120*HZ, tw->rto<<1); - tw->ttd = jiffies + tw->rto; -#endif - } + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -1267,8 +1475,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage.
*/ @@ -1286,6 +1494,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + sock_prot_dec_use(sk->prot); } /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ @@ -1312,41 +1521,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) tw->tb->owners = (struct sock*)tw; tw->bind_pprev = &tw->tb->owners; spin_unlock(&bhead->lock); - - /* Step 4: Un-charge protocol socket in-use count. */ - sock_prot_dec_use(sk->prot); } /* - * Move a socket to time-wait. + * Move a socket to time-wait or dead fin-wait-2 state. */ -void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw; + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + /* Give us an identity. 
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->substate = state; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; - tw->hashent = sk->hashent; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; - tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = sk->tp_pinfo.af_tcp.rto; - tw->ttd = jiffies + 2*tw->rto; -#endif + tw->rcv_wscale = tp->rcv_wscale; atomic_set(&tw->refcnt, 0); + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { memcpy(&tw->v6_daddr, @@ -1361,22 +1578,28 @@ void tcp_time_wait(struct sock *sk) __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics[smp_processor_id()*2].TcpCurrEstab--; - sk->state = TCP_CLOSE; + tcp_tw_schedule(tw, timeo); } else { - /* Sorry, we're out of memory, just CLOSE this + /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ - tcp_set_state(sk, TCP_CLOSE); + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); } tcp_update_metrics(sk); - tcp_clear_xmit_timers(sk); tcp_done(sk); } @@ -1397,10 +1620,13 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); + sk->shutdown |= RCV_SHUTDOWN; + switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: @@ -1427,7 +1653,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -1435,9 +1661,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; - } + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tp->num_sacks = 0; + if (!sk->dead) { - wake_up_interruptible(sk->sleep); + sk->state_change(sk); sock_wake_async(sk->socket, 1, POLL_HUP); } } @@ -1622,6 +1856,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -1658,6 +1893,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. 
@@ -1665,33 +1901,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ - queue_and_out: + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min(skb->len, tp->ucopy.len); + + local_bh_enable(); + if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) { + sk->err = EFAULT; + sk->error_report(sk); + } + local_bh_disable(); + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !skb->h.th->fin); + } + + if (!eaten) { +queue_and_out: + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { + if(skb->len) + tcp_event_data_recv(tp, skb); + if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } + /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - ntohl(TCP_FLAG_ACK) | - tp->snd_wnd); + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (eaten) + kfree_skb(skb); + + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1, POLL_IN); + } return; } - + /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + /* A retransmit, 2nd most common case. Force an imediate ack. + * + * It is impossible, seq is checked by top level. 
+ */ + NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; kfree_skb(skb); return; } @@ -1706,15 +1977,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; /* Disable header prediction. */ tp->pred_flags = 0; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + skb_set_owner_r(skb, sk); + if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { @@ -1758,6 +2031,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } } } + return; } @@ -1767,7 +2041,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * room, then we will just have to discard the packet. */ -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1777,11 +2051,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - return(0); + goto drop; /* * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise + * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { @@ -1789,7 +2063,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) /* Still not enough room. That can happen when * skb->true_size differs significantly from skb->len. 
*/ - return 0; + goto drop; } } @@ -1799,29 +2073,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } + return; - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1, POLL_IN); - } - return(1); +drop: + kfree_skb(skb); } static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } static __inline__ void tcp_data_snd_check(struct sock *sk) @@ -1832,57 +2097,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) __tcp_data_snd_check(sk, skb); } -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. 
- * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len, lss; - - lss = tp->last_seg_size; - tp->last_seg_size = 0; - - /* skb->len may jitter because of SACKs, even if peer - * sends good full-sized frames. - */ - len = skb->len; - if (len >= tp->rcv_mss) { - tp->rcv_mss = len; - } else { - /* Otherwise, we make more careful check taking into account, - * that SACKs block is variable. - * - * "len" is invariant segment length, including TCP header. - */ - len = skb->tail - skb->h.raw; - if (len >= 536 + sizeof(struct tcphdr)) { - /* Subtract also invariant (if peer is RFC compliant), - * tcp header plus fixed timestamp option length. - * Resulting "len" is MSS free of SACK jitter. - */ - len -= tp->tcp_header_len; - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } - } -} - /* * Check if sending an ack is needed. */ @@ -1904,26 +2118,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * start in an expediant manner. */ - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ + /* More than one full frame received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { + /* We have out of order data or */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. 
*/ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(sk); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { + if (tp->ack.pending == 0) { /* We sent a data segment already. */ return; } @@ -1975,7 +2188,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; + tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ @@ -1992,12 +2205,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { + if (tp->urg_data == TCP_URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -2014,7 +2227,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; + struct sk_buff *skb; + int pruned = 0; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); @@ -2024,7 +2238,9 @@ static int prune_queue(struct sock *sk) skb = __skb_dequeue_tail(&tp->out_of_order_queue); if(skb != NULL) { /* Free it all. */ - do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len; + do { + pruned += skb->len; + net_statistics[smp_processor_id()*2].OfoPruned += skb->len; kfree_skb(skb); skb = __skb_dequeue_tail(&tp->out_of_order_queue); } while(skb != NULL); @@ -2059,13 +2275,47 @@ static int prune_queue(struct sock *sk) * if we are really having our buffer space abused we stop accepting * new receive data. 
* + * 8) The arguments are interesting, but I even cannot imagine + * what kind of arguments could force us to drop NICE, ALREADY + * RECEIVED DATA only to get one more packet? --ANK + * * FIXME: it should recompute SACK state and only remove enough * buffers to get into bounds again. The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK + * + * Mmm... Why not to scale it separately then? Just replace + * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale + * and adjust it dynamically, when TCP window flow control + * fails? -ANK */ + + /* F.e. one possible tactics is: */ + do { + u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned; + + /* This guy is not a good guy. I bet, he martyrized cats, + * when was child and grew up to finished sadist. Clamp him! + */ + if (new_clamp > 3*tp->ack.rcv_mss) + new_clamp -= tp->ack.rcv_mss; + else + new_clamp = 2*tp->ack.rcv_mss; + tp->window_clamp = min(tp->window_clamp, new_clamp); + } while (0); + /* Though it should be made earlier, when we are still not + * congested. This header prediction logic sucks + * without true implementation of VJ algorithm. + * I am really anxious. How was it possible to combine + * header prediction and sending ACKs outside of recvmsg() context? + * They _are_ incompatible. We should not advance window so + * brainlessly and we should not advertise so huge window from the very + * beginning. BTW window "prediction" does not speedup anything! + * Silly, silly, silly.
+ */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -2073,6 +2323,57 @@ static int prune_queue(struct sock *sk) return -1; } +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk); + else + err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen); + + if (!err) { +update: + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + local_bh_disable(); + return 0; + } + + if (err == -EFAULT) { + sk->err = EFAULT; + sk->error_report(sk); + goto update; + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + /* * TCP receive function for the ESTABLISHED state. * @@ -2080,7 +2381,33 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * [ NOTE: actually, it was made incorrectly and nobody ever noticed + * this! Reason is clear: 1. Correct senders do not send + * to zero window. 2. Even if a sender sends to zero window, + * nothing terrible occurs. + * + * For now I cleaned this and fast path is really always disabled, + * when window is zero, but I would be more happy to remove these + * checks. Code will be only cleaner and _faster_. --ANK + * + * Later note. 
I've just found that slow path also accepts + * out of window segments, look at tcp_sequence(). So... + * it is the last argument: I repair all and comment out + * repaired code by TCP_FORMAL_WINDOW. + * [ I remember one rhyme from a children's book. (I apologize, + * the translation is not rhymed 8)): people in one (jewish) village + * decided to build sauna, but divided to two parties. + * The first one insisted that battens should not be dubbed, + * another objected that foots will suffer of splinters, + * the first fended that dubbed wet battens are too slippy + * and people will fall and it is much more serious! + * Certainly, all they went to rabbi. + * After some thinking, he judged: "Do not be lazy! + * Certainly, dub the battens! But put them by dubbed surface down." + * ] + * ] + * + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -2088,7 +2415,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) - * - Unexpected TCP option. + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -2116,7 +2443,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments * cannot be good idea... */ @@ -2133,13 +2459,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - int tcp_header_len = th->doff*4; + int tcp_header_len = tp->tcp_header_len; - /* Timestamp header prediction */ - - /* Non-standard header f.e.
SACKs -> slow path */ - if (tcp_header_len != tp->tcp_header_len) - goto slow_path; + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { @@ -2161,8 +2486,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto slow_path; /* Predicted packet is in window by definition. - seq == rcv_nxt and last_ack_sent <= rcv_nxt. - Hence, check seq<=last_ack_sent reduces to: + * seq == rcv_nxt and last_ack_sent <= rcv_nxt. + * Hence, check seq<=last_ack_sent reduces to: */ if (tp->rcv_nxt == tp->last_ack_sent) { tp->ts_recent = tp->rcv_tsval; @@ -2173,6 +2498,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { + /* We know that such packets are checksummed + * on entry. + */ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -2182,19 +2510,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + int eaten = 0; - /* Is it possible to simplify this? 
*/ - tcp_measure_rcv_mss(sk, skb); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + eaten = 1; + + NET_INC_STATS_BH(TCPHPHitsToUser); + + if (tcp_copy_to_iovec(sk, skb, tcp_header_len)) + goto csum_error; + + __skb_pull(skb,tcp_header_len); + } else { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK + * Nowhere. 8) --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + skb_set_owner_r(skb, sk); + } - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - */ - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in @@ -2202,27 +2553,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1, POLL_IN); - tcp_delack_estimator(tp); - tcp_remember_ack(tp, th, skb); + tcp_event_data_recv(tp, skb); +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else +#endif __tcp_ack_snd_check(sk, 0); + + if (eaten) + kfree_skb(skb); return 0; } /* Packet is in sequence, flags are trivial; - * only ACK is strange or we are tough on memory. - * Jump to step 5. + * only ACK is strange. Jump to step 5. */ + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; goto step5; } slow_path: + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + /* * RFC1323: H1. Apply PAWS check first. 
*/ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); tcp_send_ack(sk); goto discard; } @@ -2251,7 +2618,9 @@ slow_path: TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKLost); goto discard; } @@ -2279,11 +2648,8 @@ step5: /* Process urgent data. */ tcp_urg(sk, th, len); - { /* step 7: process the segment text */ - int queued = tcp_data(skb, sk, len); - - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); /* Be careful, tcp_data() may have put this into TIME_WAIT. */ if(sk->state != TCP_CLOSE) { @@ -2291,12 +2657,13 @@ step5: tcp_ack_snd_check(sk); } - if (!queued) { - discard: - kfree_skb(skb); - } - } + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); +discard: + kfree_skb(skb); return 0; } @@ -2328,6 +2695,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->dport = req->rmt_port; sock_lock_init(newsk); + bh_lock_sock(newsk); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); @@ -2351,22 +2719,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->rcv_nxt = req->rcv_isn + 1; newtp->snd_nxt = req->snt_isn + 1; newtp->snd_una = req->snt_isn + 1; - newtp->srtt = 0; - newtp->ato = 0; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + if (skb->len >= 536) + newtp->ack.last_seg_size = skb->len; + + tcp_prequeue_init(newtp); + newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. 
- */ - newtp->snd_wnd = ntohs(skb->h.th->window); - - newtp->max_window = newtp->snd_wnd; - newtp->pending = 0; newtp->retransmits = 0; - newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; + newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2374,22 +2747,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, * efficiently to them. -DaveM */ newtp->snd_cwnd = 2; - - newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->high_seq = 0; - newtp->snd_ssthresh = 0x7fffffff; newtp->snd_cwnd_cnt = 0; + newtp->high_seq = 0; + newtp->dup_acks = 0; - newtp->delayed_acks = 0; - init_timer(&newtp->retransmit_timer); - newtp->retransmit_timer.function = &tcp_retransmit_timer; - newtp->retransmit_timer.data = (unsigned long) newsk; - init_timer(&newtp->delack_timer); - newtp->delack_timer.function = &tcp_delack_timer; - newtp->delack_timer.data = (unsigned long) newsk; + tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = newtp->retrans_head = NULL; newtp->rcv_wup = req->rcv_isn + 1; @@ -2397,31 +2759,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; + newtp->last_ack_sent = req->rcv_isn + 1; - init_timer(&newtp->probe_timer); - newtp->probe_timer.function = &tcp_probe_timer; - newtp->probe_timer.data = (unsigned long) newsk; newtp->probes_out = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; - tcp_synq_init(newtp); - newtp->syn_backlog = 0; - if (skb->len >= 536) - newtp->last_seg_size = skb->len; + newtp->listen_opt = 
NULL; + newtp->accept_queue = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); /* Back to base struct sock members. */ newsk->err = 0; - newsk->ack_backlog = 0; - newsk->max_ack_backlog = SOMAXCONN; newsk->priority = 0; atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif - spin_lock_init(&sk->timer_lock); - init_timer(&newsk->timer); - newsk->timer.function = &tcp_keepalive_timer; - newsk->timer.data = (unsigned long) newsk; if (newsk->keepopen) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; @@ -2440,6 +2796,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_wscale = newtp->rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp,65535); } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = xtime.tv_sec; @@ -2453,16 +2812,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} - - /* * Process an incoming packet for SYN_RECV sockets represented * as an open_request. 
@@ -2470,30 +2819,20 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev) + struct open_request **prev) { struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_opt ttp; - - /* If socket has already been created, process - packet in its context. - - We fall here only due to race, when packets were enqueued - to backlog of listening socket. - */ - if (req->sk) - return req->sk; + struct sock *child; ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - paws_reject = ttp.saw_tstamp && - (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + paws_reject = ttp.saw_tstamp && tcp_paws_check(&ttp, th->rst); } /* Check for pure retransmited SYN. */ @@ -2517,7 +2856,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->class->rtx_syn_ack(sk, req); + req->class->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -2544,6 +2883,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); return NULL; } @@ -2572,35 +2913,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Invalid ACK: reset will be sent by listening socket */ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) return sk; - - /* OK, ACK is valid, create big socket and - feed this segment to it. It will repeat all - the tests. THIS SEGMENT MUST MOVE SOCKET TO - ESTABLISHED STATE. If it will be dropped after - socket is created, wait for troubles. 
+ /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. */ - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (sk == NULL) + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; return NULL; + } - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->sk = sk; - return sk; + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; -embryonic_reset: tcp_synq_unlink(tp, req, prev); - tp->syn_backlog--; - tcp_dec_slow_timer(TCP_SLT_SYNACK); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; + +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } +embryonic_reset: NET_INC_STATS_BH(EmbryonicRsts); if (!(flg & TCP_FLAG_RST)) req->class->send_reset(skb); - req->class->destructor(req); - tcp_openreq_free(req); + tcp_synq_drop(sk, req, prev); return NULL; } +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. 
+ */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { @@ -2608,25 +2992,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sk, th, tp, 0); -#ifdef CONFIG_TCP_TW_RECYCLE - if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && - (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { - /* Old duplicate segment. We remember last - ts_recent from this host in timewait bucket. - - Actually, we could implement per host cache - to truncate timewait state after RTO. Paranoidal arguments - of rfc1337 are not enough to close this nice possibility. - */ - if (net_ratelimit()) - printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); - if (th->ack) - return 1; - goto discard; - } -#endif - if (th->ack) { /* rfc793: * "If the state is SYN-SENT then @@ -2646,10 +3011,22 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * We do not send data with SYN, so that RFC-correct * test reduces to: */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; + /* Check not from any RFC, but it is evident consequence + * of combining PAWS and usual SYN-SENT logic: ACK _is_ + * checked in SYN-SENT unlike another states, hence + * echoed tstamp must be checked too. + */ + if (tp->saw_tstamp && + ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || + (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0)) { + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + NET_INC_STATS_BH(PAWSActiveRejected); + return 1; + } + /* Now ACK is acceptable. 
* * "If the RST bit is set @@ -2689,18 +3066,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * because tcp_ack check is too weak for SYN-SENT) * causes moving socket to invalid semi-SYN-SENT, * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * * --ANK (990514) * - * I was wrong, I apologize. Bare ACK is valid. + * Bare ACK is valid, however. * Actually, RFC793 requires to send such ACK * in reply to any out of window packet. - * It is wrong, but Linux also does it sometimes. + * It is wrong, but Linux also send such + * useless ACKs sometimes. * --ANK (990724) */ @@ -2717,7 +3089,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; @@ -2742,26 +3114,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + tp->copied_seq = tp->rcv_nxt; + __tcp_fast_path_on(tp, tp->snd_wnd); + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0, POLL_OUT); + } + if (tp->write_pending) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * - * How to make this correctly? 
+ * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK */ - tp->delayed_acks++; - if (tp->ato == 0) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, tp->rto); + tp->ack.pending = 1; + tp->ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(tp); + tp->ack.pingpong = 1; + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + goto discard; } else { tcp_send_ack(sk); } - - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 0, POLL_OUT); - } return -1; } @@ -2777,6 +3158,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard; + if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -2800,8 +3185,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -2960,6 +3346,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, #endif ) { if (!th->rst) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); } goto discard; @@ -3011,28 +3399,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->copied_seq = tp->rcv_nxt; /* Note, that this wakeup is only for marginal - crossed SYN case. Passively open sockets - are not waked up, because sk->sleep == NULL - and sk->socket == NULL. + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. 
*/ - if (!sk->dead && sk->sleep) { - wake_up_interruptible(sk->sleep); + if (!sk->dead) { + sk->state_change(sk); sock_wake_async(sk->socket,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. It is wrong. + * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); tcp_init_metrics(sk); + tcp_fast_path_on(tp); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -3041,26 +3430,42 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + sk->shutdown |= SEND_SHUTDOWN; dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_done(sk); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } } break; - case TCP_CLOSING: + case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk,TCP_CLOSE); tcp_update_metrics(sk); tcp_done(sk); goto discard; @@ -3080,27 +3485,22 @@ step6: case TCP_CLOSING: if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; - case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these 
states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. */ - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (sk->shutdown & RCV_SHUTDOWN) { if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk); return 1; } } - + /* Fall through */ case TCP_ESTABLISHED: - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - */ - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); + queued = 1; break; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 22c35a191416..7420e268f406 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.194 2000/01/09 02:19:41 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $ * * IPv4 specific functions * @@ -52,7 +52,6 @@ #include #include #include -#include #include #include @@ -61,15 +60,9 @@ #include #include +#include -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_tw_recycle; extern int sysctl_ip_dynaddr; -extern __u32 sysctl_wmem_max; -extern __u32 sysctl_rmem_max; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 @@ -319,89 +312,13 @@ void tcp_put_port(struct sock *sk) local_bh_enable(); } -#ifdef CONFIG_TCP_TW_RECYCLE -/* - Very stupid pseudo-"algoritm". If the approach will be successful - (and it will!), we have to make it more reasonable. - Now it eats lots of CPU, when we are tough on ports. - - Apparently, it should be hash table indexed by daddr/dport. - - How does it work? We allow to truncate time-wait state, if: - 1. PAWS works on it. - 2. timewait bucket did not receive data for timeout: - - initially timeout := 2*RTO, so that if our ACK to first - transmitted peer's FIN is lost, we will see first retransmit. 
- - if we receive anything, the timout is increased exponentially - to follow normal TCP backoff pattern. - It is important that minimal RTO (HZ/5) > minimal timestamp - step (1ms). - 3. When creating new socket, we inherit sequence number - and ts_recent of time-wait bucket, increasinf them a bit. - - These two conditions guarantee, that data will not be corrupted - both by retransmitted and by delayed segments. They do not guarantee - that peer will leave LAST-ACK/CLOSING state gracefully, it will be - reset sometimes, namely, when more than two our ACKs to its FINs are lost. - This reset is harmless and even good. +/* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. */ -int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport) -{ - static int tw_rover; - - struct tcp_tw_bucket *tw; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; - - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - unsigned long now = jiffies; - int i, rover; - - rover = tw_rover; - - local_bh_disable(); - for (i=0; ilock); - for (tb = head->chain; tb; tb = tb->next) { - tw = (struct tcp_tw_bucket*)tb->owners; - - if (tw->state != TCP_TIME_WAIT || - tw->dport != dport || - tw->daddr != daddr || - tw->rcv_saddr != sk->rcv_saddr || - tb->port < low || - tb->port >= high || - !TCP_INET_FAMILY(tw->family) || - tw->ts_recent_stamp == 0 || - (long)(now - tw->ttd) <= 0) - continue; - tw_rover = rover; - goto hit; - } - spin_unlock(&head->lock); - } - local_bh_enable(); - tw_rover = rover; - return -EAGAIN; - -hit: - sk->num = tw->num; - if ((sk->bind_next = tb->owners) != NULL) - tb->owners->bind_pprev = &sk->bind_next; - tb->owners = sk; - 
sk->bind_pprev = &tb->owners; - sk->prev = (struct sock *) tb; - spin_unlock_bh(&head->lock); - return 0; -} -#endif - - void tcp_listen_wlock(void) { write_lock(&tcp_lhash_lock); @@ -409,9 +326,9 @@ void tcp_listen_wlock(void) if (atomic_read(&tcp_lhash_users)) { DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&tcp_lhash_wait, &wait); + add_wait_queue_exclusive(&tcp_lhash_wait, &wait); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE); if (atomic_read(&tcp_lhash_users) == 0) break; write_unlock_bh(&tcp_lhash_lock); @@ -445,6 +362,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk) sk->pprev = skp; sock_prot_inc_use(sk->prot); write_unlock(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } static void tcp_v4_hash(struct sock *sk) @@ -478,6 +397,8 @@ void tcp_unhash(struct sock *sk) sock_prot_dec_use(sk->prot); } write_unlock_bh(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } /* Don't inline this cruft. Here are some nice properties to @@ -546,8 +467,9 @@ sherry_cache: * * Local BH must be disabled here. 
*/ -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) { struct tcp_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -572,7 +494,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, goto hit; read_unlock(&head->lock); - return tcp_v4_lookup_listener(daddr, hnum, dif); + return NULL; hit: sock_hold(sk); @@ -580,6 +502,19 @@ hit: return sk; } +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ + struct sock *sk; + + sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v4_lookup_listener(daddr, hnum, dif); +} + __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk; @@ -609,21 +544,16 @@ static int tcp_v4_check_established(struct sock *sk) int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2, **skp; -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_tw_bucket *tw; -#endif write_lock_bh(&head->lock); /* Check TIME-WAIT sockets first. */ for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; skp = &sk2->next) { -#ifdef CONFIG_TCP_TW_RECYCLE tw = (struct tcp_tw_bucket*)sk2; -#endif if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* With PAWS, it is safe from the viewpoint @@ -631,12 +561,17 @@ static int tcp_v4_check_established(struct sock *sk) is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. - Actually, the idea is close to VJ's (rfc1332) - one, only timestamp cache is held not per host, + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. 
+ + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. */ - if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { + if (tw->substate == TCP_TIME_WAIT && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { if ((tp->write_seq = tw->snd_nxt + 2) == 0) tp->write_seq = 1; tp->ts_recent = tw->ts_recent; @@ -645,13 +580,10 @@ static int tcp_v4_check_established(struct sock *sk) skp = &head->chain; goto unique; } else -#endif - goto not_unique; + goto not_unique; } } -#ifdef CONFIG_TCP_TW_RECYCLE tw = NULL; -#endif /* And established part... */ for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) { @@ -659,9 +591,7 @@ static int tcp_v4_check_established(struct sock *sk) goto not_unique; } -#ifdef CONFIG_TCP_TW_RECYCLE unique: -#endif BUG_TRAP(sk->pprev==NULL); if ((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -671,17 +601,17 @@ unique: sock_prot_inc_use(sk->prot); write_unlock_bh(&head->lock); -#ifdef CONFIG_TCP_TW_RECYCLE if (tw) { /* Silly. Should hash-dance instead... 
*/ local_bh_disable(); tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + NET_INC_STATS_BH(TimeWaitRecycled); local_bh_enable(); tcp_tw_put(tw); } -#endif + return 0; not_unique: @@ -727,9 +657,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int tmp; int err; - if (sk->state != TCP_CLOSE) - return(-EISCONN); - if (addr_len < sizeof(struct sockaddr_in)) return(-EINVAL); @@ -759,8 +686,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = rt->rt_dst; err = -ENOBUFS; - buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), - 0, GFP_KERNEL); + buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL); if (buff == NULL) goto failure; @@ -769,27 +695,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (!sk->num) { - if (sk->prot->get_port(sk, 0) -#ifdef CONFIG_TCP_TW_RECYCLE - && (!sysctl_tcp_tw_recycle || - tcp_v4_tw_recycle(sk, daddr, usin->sin_port)) -#endif - ) { - kfree_skb(buff); - err = -EAGAIN; - goto failure; - } - sk->sport = htons(sk->num); - } -#ifdef CONFIG_TCP_TW_RECYCLE - else if (tp->ts_recent_stamp && sk->daddr != daddr) { + if (tp->ts_recent_stamp && sk->daddr != daddr) { /* Reset inherited state */ tp->ts_recent = 0; tp->ts_recent_stamp = 0; tp->write_seq = 0; } -#endif + + if (sysctl_tcp_tw_recycle && + !tp->ts_recent_stamp && + rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize ts_recent from it, when trying new connection. 
+ */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->ts_recent_stamp = peer->tcp_ts_stamp; + tp->ts_recent = peer->tcp_ts; + } + } sk->dport = usin->sin_port; sk->daddr = daddr; @@ -814,85 +741,62 @@ failure: return err; } -static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) +static __inline__ int tcp_v4_iif(struct sk_buff *skb) { - int retval = -EINVAL; - - lock_sock(sk); - - /* Do sanity checking for sendmsg/sendto/send. */ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out; - if (msg->msg_name) { - struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; - - if (msg->msg_namelen < sizeof(*addr)) - goto out; - if (addr->sin_family && addr->sin_family != AF_INET) - goto out; - retval = -ENOTCONN; - if(sk->state == TCP_CLOSE) - goto out; - retval = -EISCONN; - if (addr->sin_port != sk->dport) - goto out; - if (addr->sin_addr.s_addr != sk->daddr) - goto out; - } - retval = tcp_do_sendmsg(sk, msg); - -out: - release_sock(sk); - return retval; + return ((struct rtable*)skb->dst)->rt_iif; } +static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport) +{ + unsigned h = raddr ^ rport; + h ^= h>>16; + h ^= h>>8; + return h&(TCP_SYNQ_HSIZE-1); +} -/* - * Do a linear search in the socket open_request list. - * This should be replaced with a global hash table. - */ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, struct iphdr *iph, struct tcphdr *th, - struct open_request **prevp) + struct open_request ***prevp) { - struct open_request *req, *prev; - __u16 rport = th->source; - - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. 
- */ - prev = (struct open_request *) (&tp->syn_wait_queue); - for (req = prev->dl_next; req; req = req->dl_next) { - if (req->af.v4_req.rmt_addr == iph->saddr && + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + __u16 rport = th->source; + __u32 raddr = iph->saddr; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && req->af.v4_req.loc_addr == iph->daddr && - req->rmt_port == rport && TCP_INET_FAMILY(req->class->family)) { - if (req->sk) { - /* Weird case: connection was established - and then killed by RST before user accepted - it. This connection is dead, but we cannot - kill openreq to avoid blocking in accept(). - - accept() will collect this garbage, - but such reqs must be ignored, when talking - to network. - */ - bh_lock_sock(req->sk); - BUG_TRAP(req->sk->lock.users==0); - if (req->sk->state == TCP_CLOSE) { - bh_unlock_sock(req->sk); - prev = req; - continue; - } - } + BUG_TRAP(req->sk == NULL); *prevp = prev; return req; } - prev = req; } - return NULL; + + return NULL; +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_listen_opt *lopt = tp->listen_opt; + unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->index = h; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); } @@ -984,7 +888,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) th = (struct tcphdr*)(dp+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb)); if (sk == NULL) { 
ICMP_INC_STATS_BH(IcmpInErrors); return; @@ -1001,6 +905,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users != 0) NET_INC_STATS_BH(LockDroppedIcmps); + if (sk->state == TCP_CLOSE) + goto out; + tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { @@ -1010,14 +917,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (type) { case ICMP_SOURCE_QUENCH: -#ifndef OLD_SOURCE_QUENCH /* This is deprecated */ - if (sk->lock.users == 0) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - } -#endif + /* This is deprecated, but if someone generated it, + * we have no reasons to ignore it. + */ + if (sk->lock.users == 0) + tcp_enter_cong_avoid(tp); goto out; case ICMP_PARAMETERPROB: err = EPROTO; @@ -1042,7 +946,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } switch (sk->state) { - struct open_request *req, *prev; + struct open_request *req, **prev; case TCP_LISTEN: if (sk->lock.users != 0) goto out; @@ -1060,47 +964,25 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!req) goto out; - if (req->sk) { - struct sock *nsk = req->sk; - - /* - * Already in ESTABLISHED and a big socket is created, - * set error code there. - * The error will _not_ be reported in the accept(), - * but only with the next operation on the socket after - * accept. - */ - sock_hold(nsk); - bh_unlock_sock(sk); - sock_put(sk); - sk = nsk; - - BUG_TRAP(sk->lock.users == 0); - tp = &sk->tp_pinfo.af_tcp; - if (!between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } - } else { - if (seq != req->snt_isn) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } + /* ICMPs are not backlogged, hence we cannot get + an established socket here. 
+ */ + BUG_TRAP(req->sk == NULL); - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - tp->syn_backlog--; - tcp_synq_unlink(tp, req, prev); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->class->destructor(req); - tcp_openreq_free(req); + if (seq != req->snt_isn) { + NET_INC_STATS_BH(OutOfWindowIcmps); goto out; } - break; + + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. @@ -1110,10 +992,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users == 0) { TCP_INC_STATS_BH(TcpAttemptFails); sk->err = err; - /* Wake people up to see the error (see connect in sock.c) */ + sk->error_report(sk); - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } else { sk->err_soft = err; @@ -1270,28 +1151,23 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; - tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent); + tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, + tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); tcp_tw_put(tw); } static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) { - tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); + tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, + req->ts_recent); } -/* - * Send a SYN-ACK after having received an ACK. - * This still operates on a open_request only, not on a big - * socket. 
- */ -static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) +static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) { struct rtable *rt; struct ip_options *opt; - struct sk_buff * skb; - /* First, grab a route. */ opt = req->af.v4_req.opt; if(ip_route_output(&rt, ((opt && opt->srr) ? opt->faddr : @@ -1300,15 +1176,33 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, sk->bound_dev_if)) { IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } - if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } + return &rt->u.dst; +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff * skb; - skb = tcp_make_synack(sk, &rt->u.dst, req); + /* First, grab a route. 
*/ + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); if (skb) { struct tcphdr *th = skb->h.th; @@ -1317,10 +1211,15 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, csum_partial((char *)th, skb->len, skb->csum)); - ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, - req->af.v4_req.rmt_addr, req->af.v4_req.opt); + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; } - ip_rt_put(rt); + +out: + dst_release(dst); + return err; } /* @@ -1328,7 +1227,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) */ static void tcp_v4_or_free(struct open_request *req) { - if(!req->sk && req->af.v4_req.opt) + if (req->af.v4_req.opt) kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt)); } @@ -1372,8 +1271,14 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. */ -int sysctl_max_syn_backlog = 128; +int sysctl_max_syn_backlog = 256; struct or_calltable or_ipv4 = { PF_INET, @@ -1383,9 +1288,6 @@ struct or_calltable or_ipv4 = { tcp_v4_send_reset }; -#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! 
*/ -#define BACKLOGMAX(sk) sysctl_max_syn_backlog - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_opt tp; @@ -1394,6 +1296,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __u32 saddr = skb->nh.iph->saddr; __u32 daddr = skb->nh.iph->daddr; __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1405,84 +1308,108 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (RTCF_BROADCAST|RTCF_MULTICAST)) goto drop; - /* XXX: Check against a global syn pool counter. */ - if (BACKLOG(sk) > BACKLOGMAX(sk)) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies && !isn) { - syn_flood_warning(skb); + if (sysctl_tcp_syncookies) { want_cookie = 1; } else #endif goto drop; - } else { - if (isn == 0) - isn = tcp_v4_init_sequence(sk, skb); - BACKLOG(sk)++; } - req = tcp_openreq_alloc(); - if (req == NULL) { - goto dropbacklog; - } + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; - req->rcv_isn = TCP_SKB_CB(skb)->seq; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.mss_clamp = 536; tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; tcp_parse_options(NULL, th, &tp, want_cookie); - req->mss = tp.mss_clamp; - req->ts_recent = tp.saw_tstamp ? 
tp.rcv_tsval : 0; - req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; - req->snd_wscale = tp.snd_wscale; - req->wscale_ok = tp.wscale_ok; - req->rmt_port = th->source; + tcp_openreq_init(req, &tp, skb); + req->af.v4_req.loc_addr = daddr; req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; - /* Note that we ignore the isn passed from the TIME_WAIT - * state here. That's the price we pay for cookies. - * - * RED-PEN. The price is high... Then we cannot kill TIME-WAIT - * and should reject connection attempt, duplicates with random - * sequence number can corrupt data. Right? - * I disabled sending cookie to request matching to a timewait - * bucket. - */ - if (want_cookie) + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (isn == 0) { + struct inet_peer *peer = NULL; - req->snt_isn = isn; - - req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tp.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable*)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { + NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source))); + NET_INC_STATS_BH(PAWSPassiveRejected); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. 
*/ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) + < (sysctl_max_syn_backlog>>2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst->rtt)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source))); + TCP_INC_STATS_BH(TcpAttemptFails); + dst_release(dst); + goto drop_and_free; + } - req->class = &or_ipv4; - req->retrans = 0; - req->sk = NULL; + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; - tcp_v4_send_synack(sk, req); + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; if (want_cookie) { - if (req->af.v4_req.opt) - kfree(req->af.v4_req.opt); - tcp_v4_or_free(req); tcp_openreq_free(req); } else { - req->expires = jiffies + TCP_TIMEOUT_INIT; - tcp_inc_slow_timer(TCP_SLT_SYNACK); - tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + tcp_v4_synq_add(sk, req); } - return 0; -dropbacklog: - if (!want_cookie) - BACKLOG(sk)--; +drop_and_free: + tcp_openreq_free(req); drop: TCP_INC_STATS_BH(TcpAttemptFails); return 0; @@ -1497,29 +1424,20 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { - struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; - if (sk->ack_backlog > sk->max_ack_backlog) - goto exit; /* head drop */ - if (dst == NULL) { - struct rtable *rt; - - if (ip_route_output(&rt, - opt && opt->srr ? 
opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0)) - return NULL; - dst = &rt->u.dst; - } + if (tcp_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; - sk->tp_pinfo.af_tcp.syn_backlog--; - sk->ack_backlog++; - newsk->dst_cache = dst; newtp = &(newsk->tp_pinfo.af_tcp); @@ -1527,7 +1445,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->saddr = req->af.v4_req.loc_addr; newsk->rcv_saddr = req->af.v4_req.loc_addr; newsk->protinfo.af_inet.opt = req->af.v4_req.opt; - newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif; + req->af.v4_req.opt = NULL; + newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb); newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newsk->protinfo.af_inet.opt) @@ -1535,28 +1454,26 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(newsk, dst->pmtu); tcp_initialize_rcv_mss(newsk); + newtp->advmss = dst->advmss; - if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15))) - newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max); - if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15))) - newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max); + tcp_init_buffer_space(newsk); - bh_lock_sock(newsk); - __tcp_v4_hash(newsk); __tcp_inherit_port(sk, newsk); return newsk; +exit_overflow: + NET_INC_STATS_BH(ListenOverflows); exit: + NET_INC_STATS_BH(ListenDrops); dst_release(dst); return NULL; } - static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) { - struct open_request *req, *prev; + struct open_request *req, **prev; struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1565,6 +1482,25 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct 
sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); + if (tp->accept_queue) { + struct sock *nsk; + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)sk); + return NULL; + } + } + #ifdef CONFIG_SYN_COOKIES if (!th->rst && (th->syn || th->ack)) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); @@ -1572,27 +1508,26 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static int tcp_csum_verify(struct sk_buff *skb) +static int tcp_v4_checksum_init(struct sk_buff *skb) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)skb->h.th, skb->len, 0); - case CHECKSUM_HW: - if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { - NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum " - "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " - "len=%d/%d\n", - NIPQUAD(skb->nh.iph->saddr), - ntohs(skb->h.th->source), - NIPQUAD(skb->nh.iph->daddr), - ntohs(skb->h.th->dest), - skb->len, - ntohs(skb->nh.iph->tot_len))); - return 1; + if (skb->ip_summed == CHECKSUM_HW) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,skb->csum)) { + NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + return -1; } skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - /* CHECKSUM_UNNECESSARY */ + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->len <= 68) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr, + csum_partial((char *)skb->h.th, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,0); + } } return 0; } @@ -1614,66 +1549,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; 
#endif /* CONFIG_FILTER */ - /* - * This doesn't check if the socket has enough room for the packet. - * Either process the packet _without_ queueing it and then free it, - * or do the check later. - */ - skb_set_owner_r(skb, sk); + IP_INC_STATS_BH(IpInDelivers); if (sk->state == TCP_ESTABLISHED) { /* Fast path */ - /* Ready to move deeper ... */ - if (tcp_csum_verify(skb)) - goto csum_err; + TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; - } + } - if (tcp_csum_verify(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->state == TCP_LISTEN) { - struct sock *nsk; - - nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { - int ret; - int state = nsk->state; - - skb_orphan(skb); - - BUG_TRAP(nsk->lock.users == 0); - skb_set_owner_r(skb, nsk); - ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len); - - /* Wakeup parent, send SIGIO, if this packet changed - socket state from SYN-RECV. - - It still looks ugly, however it is much better - than miracleous double wakeup in syn_recv_sock() - and tcp_rcv_state_process(). 
- */ - if (state == TCP_SYN_RECV && nsk->state != state) - sk->data_ready(sk, 0); - - bh_unlock_sock(nsk); - if (ret) + if (tcp_child_process(sk, nsk, skb)) goto reset; return 0; } } - + + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; reset: @@ -1716,6 +1620,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (len < sizeof(struct tcphdr)) goto bad_packet; + if (tcp_v4_checksum_init(skb) < 0) + goto bad_packet; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + len - th->doff*4); @@ -1724,7 +1631,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->used = 0; sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1738,9 +1645,10 @@ process: bh_lock_sock(sk); ret = 0; - if (!sk->lock.users) - ret = tcp_v4_do_rcv(sk, skb); - else + if (!sk->lock.users) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1749,7 +1657,7 @@ process: return ret; no_tcp_socket: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); } else { @@ -1766,7 +1674,7 @@ discard_and_relse: goto discard_it; do_time_wait: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); goto discard_and_relse; } @@ -1776,7 +1684,7 @@ do_time_wait: { struct sock *sk2; - sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (sk2 != NULL) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_timewait_kill((struct tcp_tw_bucket *)sk); @@ -1796,36 +1704,39 @@ do_time_wait: goto discard_it; } +/* With per-bucket locks this operation is 
not-atomic, so that + * this version is not worse. + */ static void __tcp_v4_rehash(struct sock *sk) { - struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent]; - struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; - struct sock **skp = &head->chain; - - write_lock_bh(&oldhead->lock); - if(sk->pprev) { - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - } - write_unlock(&oldhead->lock); - write_lock(&head->lock); - if((sk->next = *skp) != NULL) - (*skp)->pprev = &sk->next; - *skp = sk; - sk->pprev = skp; - write_unlock_bh(&head->lock); + sk->prot->unhash(sk); + sk->prot->hash(sk); } int tcp_v4_rebuild_header(struct sock *sk) { - struct rtable *rt = (struct rtable *)__sk_dst_get(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __u32 new_saddr; int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; - if(rt == NULL) - return 0; + if (rt == NULL) { + int err; + + u32 daddr = sk->daddr; + + if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) + daddr = sk->protinfo.af_inet.opt->faddr; + + err = ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if); + if (err) { + sk->err_soft=-err; + sk->error_report(sk); + return -1; + } + __sk_dst_set(sk, &rt->u.dst); + } /* Force route checking if want_rewrite. * The idea is good, the implementation is disguisting. 
@@ -1855,16 +1766,6 @@ int tcp_v4_rebuild_header(struct sock *sk) dst_release(&new_rt->u.dst); } } - if (rt->u.dst.obsolete) { - int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); - if (err) { - sk->err_soft=-err; - sk->error_report(sk); - return -1; - } - __sk_dst_set(sk, &rt->u.dst); - } return 0; @@ -1877,7 +1778,7 @@ do_rewrite: "saddr=%08X rcv_saddr=%08X\n", ntohl(sk->saddr), ntohl(sk->rcv_saddr)); - return 0; + return -1; } if (new_saddr != sk->saddr) { @@ -1895,7 +1796,7 @@ do_rewrite: * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * - * Besides that, it does not check for connetion + * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ __tcp_v4_rehash(sk); @@ -1913,6 +1814,63 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin->sin_port = sk->dport; } +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. 
+ */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct rtable *rt = (struct rtable*)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (rt == NULL || rt->rt_dst != sk->daddr) { + peer = inet_getpeer(sk->daddr, 1); + release_it = 1; + } else { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->ts_recent_stamp; + peer->tcp_ts = tp->ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->ts_recent_stamp; + peer->tcp_ts = tw->ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +} + struct tcp_func ipv4_specific = { ip_queue_xmit, tcp_v4_send_check, @@ -1920,6 +1878,7 @@ struct tcp_func ipv4_specific = { tcp_v4_conn_request, tcp_v4_syn_recv_sock, tcp_v4_hash_connecting, + tcp_v4_remember_stamp, sizeof(struct iphdr), ip_setsockopt, @@ -1937,6 +1896,7 @@ static int tcp_v4_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); tp->rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -1951,19 +1911,14 @@ static int tcp_v4_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. 
*/ - tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; sk->write_space = tcp_write_space; - /* Init SYN queue. */ - tcp_synq_init(tp); - sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; return 0; @@ -1981,9 +1936,10 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefuly empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); - /* Clean up a referenced TCP bind bucket, this only happens if a - * port is allocated for a socket, but it never fully connects. - */ + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ if(sk->prev != NULL) tcp_put_port(sk); @@ -1993,17 +1949,19 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Proc filesystem TCP sock list dumping. */ static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i) { - sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", + int ttd = req->expires - jiffies; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p", i, - (long unsigned int)req->af.v4_req.loc_addr, + req->af.v4_req.loc_addr, ntohs(sk->sport), - (long unsigned int)req->af.v4_req.rmt_addr, + req->af.v4_req.rmt_addr, ntohs(req->rmt_port), TCP_SYN_RECV, 0,0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ - (unsigned long)(req->expires - jiffies), + ttd, req->retrans, sk->socket ? 
sk->socket->inode->i_uid : 0, 0, /* non standard timer */ @@ -2017,7 +1975,7 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int timer_active, timer_active1, timer_active2; + int timer_active; unsigned long timer_expires; struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; @@ -2025,15 +1983,16 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) src = sp->rcv_saddr; destp = ntohs(sp->dport); srcp = ntohs(sp->sport); - timer_active1 = tp->retransmit_timer.prev != NULL; - timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; - if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { + if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) { timer_active = 1; timer_expires = tp->retransmit_timer.expires; + } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) { + timer_active = 4; + timer_expires = tp->probe_timer.expires; } - if (timer_active2 && sp->timer.expires < timer_expires) { + if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) { timer_active = 2; timer_expires = sp->timer.expires; } @@ -2041,38 +2000,37 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) timer_expires = jiffies; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u", i, src, srcp, dest, destp, sp->state, tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? sp->socket->inode->i_uid : 0, - 0, + tp->probes_out, sp->socket ? 
sp->socket->inode->i_ino : 0, - atomic_read(&sp->refcnt), sp); + atomic_read(&sp->refcnt), sp, + tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong + ); } static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int slot_dist; + int ttd = tw->ttd - jiffies; + + if (ttd < 0) + ttd = 0; dest = tw->daddr; src = tw->rcv_saddr; destp = ntohs(tw->dport); srcp = ntohs(tw->sport); - slot_dist = tw->death_slot; - if(slot_dist > tcp_tw_death_row_slot) - slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; - else - slot_dist = tcp_tw_death_row_slot - slot_dist; - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", - i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0, - 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0, + i, src, srcp, dest, destp, tw->substate, 0, 0, + 3, ttd, 0, 0, 0, 0, atomic_read(&tw->refcnt), tw); } @@ -2093,6 +2051,8 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) tcp_listen_lock(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { struct sock *sk = tcp_listening_hash[i]; + struct tcp_listen_opt *lopt; + int k; for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { struct open_request *req; @@ -2112,25 +2072,30 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) } skip_listen: - lock_sock(sk); - for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) { - if (req->sk) - continue; - if (!TCP_INET_FAMILY(req->class->family)) - continue; - - pos += 128; - if (pos < offset) - continue; - get_openreq(sk, req, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); - if(len >= length) { - tcp_listen_unlock(); - release_sock(sk); - goto out_no_bh; + read_lock_bh(&tp->syn_wait_lock); + lopt = tp->listen_opt; + if (lopt && lopt->qlen != 0) { + for (k=0; ksyn_table[k]; req; req = req->dl_next, num++) { + if (!TCP_INET_FAMILY(req->class->family)) + continue; + + pos += 128; + if (pos < 
offset) + continue; + get_openreq(sk, req, tmpbuf, num); + len += sprintf(buffer+len, "%-127s\n", tmpbuf); + if(len >= length) { + read_unlock_bh(&tp->syn_wait_lock); + tcp_listen_unlock(); + goto out_no_bh; + } + } } } - release_sock(sk); + read_unlock_bh(&tp->syn_wait_lock); + + /* Completed requests are in normal socket hash table */ } } tcp_listen_unlock(); @@ -2194,28 +2159,24 @@ struct proto tcp_prot = { tcp_v4_connect, /* connect */ tcp_disconnect, /* disconnect */ tcp_accept, /* accept */ - NULL, /* retransmit */ - tcp_write_wakeup, /* write_wakeup */ - tcp_read_wakeup, /* read_wakeup */ - tcp_poll, /* poll */ tcp_ioctl, /* ioctl */ tcp_v4_init_sock, /* init */ tcp_v4_destroy_sock, /* destroy */ tcp_shutdown, /* shutdown */ tcp_setsockopt, /* setsockopt */ tcp_getsockopt, /* getsockopt */ - tcp_v4_sendmsg, /* sendmsg */ + tcp_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ tcp_v4_do_rcv, /* backlog_rcv */ tcp_v4_hash, /* hash */ tcp_unhash, /* unhash */ tcp_v4_get_port, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "TCP", /* name */ }; + + void __init tcp_v4_init(struct net_proto_family *ops) { int err; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3d884dda316..d6bc8a2058c5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $ + * Version: $Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -31,6 +31,7 @@ * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 * */ @@ -38,75 +39,65 @@ #include -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; - /* People can turn this off for buggy TCP's found in printers etc. 
*/ int sysctl_tcp_retrans_collapse = 1; -/* Get rid of any delayed acks, we sent one already.. */ -static __inline__ void clear_delayed_acks(struct sock * sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - tp->delayed_acks = 0; - if(tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - tcp_clear_xmit_timer(sk, TIME_DACK); -} - static __inline__ void update_send_head(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - + tp->send_head = tp->send_head->next; if (tp->send_head == (struct sk_buff *) &sk->write_queue) tp->send_head = NULL; } /* Calculate mss to advertise in SYN segment. - RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: - - 1. It is independent of path mtu. - 2. Ideally, it is maximal possible segment size i.e. 65535-40. - 3. For IPv4 it is reasonable to calculate it from maximal MTU of - attached devices, because some buggy hosts are confused by - large MSS. - 4. We do not make 3, we advertise MSS, calculated from first - hop device mtu, but allow to raise it to ip_rt_min_advmss. - This may be overriden via information stored in routing table. - 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, - probably even Jumbo". + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overriden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". 
*/ static __u16 tcp_advertise_mss(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - int mss; + int mss = tp->advmss; - if (dst) { + if (dst && dst->advmss < mss) { mss = dst->advmss; - } else { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->advmss = mss; + } - /* No dst. It is bad. Guess some reasonable value. - * Actually, this case should not be possible. - * SANITY. - */ - BUG_TRAP(dst!=NULL); + return (__u16)mss; +} - mss = tp->mss_cache; - mss += (tp->tcp_header_len - sizeof(struct tcphdr)) + - tp->ext_header_len; +static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) +{ + /* If we had a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; - /* Minimal MSS to include full set of of TCP/IP options - plus 8 bytes of data. It corresponds to mtu 128. - */ - if (mss < 88) - mss = 88; - } + tp->lsndtime = tcp_time_stamp; +} - return (__u16)mss; +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tp->last_ack_sent = tp->rcv_nxt; + tcp_dec_quickack_mode(tp); + tp->ack.pending = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } /* This routine actually transmits TCP packets queued in by @@ -120,7 +111,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ -void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if(skb != NULL) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -128,6 +119,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) int tcp_header_size = tp->tcp_header_len; struct tcphdr *th; int sysctl_flags; + int err; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 @@ -190,11 +182,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } tp->af_specific->send_check(sk, th, skb->len, skb); - clear_delayed_acks(sk); - tp->last_ack_sent = tp->rcv_nxt; + if (th->ack) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb); + TCP_INC_STATS(TcpOutSegs); - tp->af_specific->queue_xmit(skb); + + err = tp->af_specific->queue_xmit(skb); + if (err <= 0) + return err; + + tcp_enter_cong_avoid(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; } + return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -202,32 +212,33 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* This is the main buffer sending routine. We queue the buffer * and decide whether to queue or transmit now. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. */ -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Advance write_seq and place onto the write_queue. 
*/ - tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); + tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->write_queue, skb); - if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { + if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - /* Queue it, remembering where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) { + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, cur_mss, skb->len); + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return; } } + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; } /* Function to create two new TCP segments. Shrinks the given segment @@ -243,13 +254,13 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Get a new skb... force flag on. */ buff = sock_wmalloc(sk, - (nsize + MAX_HEADER + sk->prot->max_header), + (nsize + MAX_TCP_HEADER + 15), 1, GFP_ATOMIC); if (buff == NULL) - return -1; /* We'll just try again later. */ + return -ENOMEM; /* We'll just try again later. */ /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); /* Correct the sequence numbers. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; @@ -276,8 +287,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(buff)->sacked = 0; /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), - nsize, 0); + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); /* This takes care of the FIN sequence number too. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; @@ -288,6 +299,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK + * + * NOTE: several days after I added this, Dave repaired + * tcp_simple_retransmit() and it should not use ->when + * of never sent skbs more. I am not sure, so that + * this line remains until more careful investigation. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; @@ -335,20 +351,19 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) if (mss_now > tp->mss_clamp) mss_now = tp->mss_clamp; - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - /* Now subtract optional transport overhead */ mss_now -= tp->ext_header_len; - /* It we got too small (or even negative) value, - clamp it by 8 from below. Why 8 ? 
- Well, it could be 1 with the same success, - but if IP accepted segment of length 1, - it would love 8 even more 8) --ANK (980731) - */ - if (mss_now < 8) - mss_now = 8; + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 1); /* And store cached results */ tp->pmtu_cookie = pmtu; @@ -360,27 +375,30 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. */ -void tcp_write_xmit(struct sock *sk) +int tcp_write_xmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int mss_now; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk); - - /* If we are zapped, the bytes will have to remain here. - * In time closedown will empty the write queue and all + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all * will be happy. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct sk_buff *skb; int sent_pkts = 0; + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. 
+ */ + mss_now = tcp_current_mss(sk); + /* Anything on the transmit queue that fits the window can * be added providing we are: * @@ -388,27 +406,36 @@ void tcp_write_xmit(struct sock *sk) * b) not exceeding our congestion window. * c) not retransmitting [Nagle] */ - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + while((skb = tp->send_head) && + tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) { if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; } - /* Advance the send_head. This one is going out. */ - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + /* Advance the send_head. This one is sent out. */ + update_send_head(sk); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, mss_now, skb->len); tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); sent_pkts = 1; } /* If we sent anything, make sure the retransmit * timer is active. */ - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (sent_pkts) { + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; + } + + return !tp->packets_out && tp->send_head; } + return 0; } /* This function returns the amount that we can raise the @@ -471,7 +498,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. 
--SAW 1998/11/1 */ - unsigned int mss = tp->rcv_mss; + unsigned int mss = tp->ack.rcv_mss; int free_space; u32 window; @@ -481,11 +508,19 @@ u32 __tcp_select_window(struct sock *sk) free_space = tp->window_clamp; if (tp->window_clamp < mss) mss = tp->window_clamp; - - if ((free_space < (tcp_full_space(sk) / 2)) && + + if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) && (free_space < ((int) (mss/2)))) { window = 0; - tp->pred_flags = 0; + + /* THIS IS _VERY_ GOOD PLACE to play window clamp. + * if free_space becomes suspiciously low + * verify ratio rmem_alloc/(rcv_nxt - copied_seq), + * and if we predict that when free_space will be lower mss, + * rmem_alloc will run out of rcvbuf*2, shrink window_clamp. + * It will eliminate most of prune events! Very simple, + * it is the next thing to do. --ANK + */ } else { /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. @@ -542,9 +577,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Optimize, actually we could also combine next_skb->csum * to skb->csum using a single add w/carry operation too. */ - skb->csum = csum_partial_copy(next_skb->data, - skb_put(skb, next_skb_size), - next_skb_size, skb->csum); + skb->csum = csum_partial_copy_nocheck(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); } /* Update sequence range on original skb. 
*/ @@ -603,8 +638,10 @@ void tcp_simple_retransmit(struct sock *sk) if (old_next_skb != skb || skb->len > mss) resend_skb = 1; old_next_skb = skb->next; - if (resend_skb != 0) - tcp_retransmit_skb(sk, skb); + if (resend_skb != 0) { + if (tcp_retransmit_skb(sk, skb)) + break; + } } } @@ -629,9 +666,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int cur_mss = tcp_current_mss(sk); +#ifdef TCP_DEBUG + /* It was possible this summer, that retransmit timer + * raced with its deletion and hit socket with packets_out==0. + * I fixed it, but preserved the check in the place, + * where the fault occured. --ANK + */ + if (skb == NULL) { + printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk)); + return -EFAULT; + } +#endif + if(skb->len > cur_mss) { if(tcp_fragment(sk, skb, cur_mss)) - return 1; /* We'll try again later. */ + return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ tp->packets_out++; @@ -646,7 +695,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); if(tp->af_specific->rebuild_header(sk)) - return 1; /* Routing failure or similar. */ + return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a * retransmit when old data is attached. So strip it off @@ -673,13 +722,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) else skb = skb_clone(skb, GFP_ATOMIC); - tcp_transmit_skb(sk, skb); - /* Update global TCP statistics and return success. 
*/ - sk->prot->retransmits++; TCP_INC_STATS(TcpRetransSegs); - return 0; + return tcp_transmit_skb(sk, skb); } /* This gets called after a retransmit timeout, and the initially @@ -774,7 +820,11 @@ void tcp_send_fin(struct sock *sk) */ mss_now = tcp_current_mss(sk); - if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* Please, find seven differences of 2.3.33 and loook + * what I broke here. 8) --ANK + */ + + if(tp->send_head != NULL) { /* tcp_write_xmit() takes care of the rest. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; @@ -783,31 +833,34 @@ void tcp_send_fin(struct sock *sk) /* Special case to avoid Nagle bogosity. If this * segment is the last segment, and it was queued * due to Nagle/SWS-avoidance, send it out now. + * + * Hmm... actually it overrides also congestion + * avoidance (OK for FIN) and retransmit phase + * (not OK? Added.). */ if(tp->send_head == skb && - !sk->nonagle && - skb->len < (tp->rcv_mss >> 1) && - tp->packets_out && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { - update_send_head(sk); + !after(tp->write_seq, tp->snd_una + tp->snd_wnd) && + !tp->retransmits) { TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } else + tcp_check_probe_timer(sk, tp); } } else { /* Socket is locked, keep trying until memory is available. */ do { skb = sock_wmalloc(sk, - (MAX_HEADER + - sk->prot->max_header), + MAX_TCP_HEADER + 15, 1, GFP_KERNEL); } while (skb == NULL); /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; @@ -816,7 +869,8 @@ void tcp_send_fin(struct sock *sk) /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - tcp_send_skb(sk, skb, 0); + tcp_send_skb(sk, skb, 0, mss_now); + __tcp_push_pending_frames(sk, tp, mss_now); } } @@ -831,19 +885,19 @@ void tcp_send_active_reset(struct sock *sk, int priority) struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority); + skb = alloc_skb(MAX_TCP_HEADER + 15, priority); if (!skb) return; /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->urg_ptr = 0; /* Send it off. */ - TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->seq = tp->snd_nxt; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_transmit_skb(sk, skb); @@ -859,13 +913,13 @@ int tcp_send_synack(struct sock *sk) struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); struct sk_buff* skb; - skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); TCP_SKB_CB(skb)->sacked = 0; @@ -877,8 +931,7 @@ int tcp_send_synack(struct sock *sk) __skb_queue_tail(&sk->write_queue, skb); TCP_SKB_CB(skb)->when = tcp_time_stamp; tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - return 0; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); } /* @@ -887,16 +940,17 @@ int tcp_send_synack(struct sock *sk) struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcphdr *th; int tcp_header_size; struct sk_buff *skb; - skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return NULL; /* Reserve space for headers. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->dst = dst_clone(dst); @@ -919,7 +973,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; + req->window_clamp = tp->window_clamp ? : skb->dst->window; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -951,7 +1005,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER + 15); /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 
@@ -962,12 +1016,16 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->user_mss) tp->mss_clamp = tp->user_mss; + tp->max_window = 0; tcp_sync_mss(sk, dst->pmtu); + tcp_initialize_rcv_mss(sk); - tp->window_clamp = dst->window; + if (!tp->window_clamp) + tp->window_clamp = dst->window; + tp->advmss = dst->advmss; tcp_select_initial_window(tcp_full_space(sk), - dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), + tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, @@ -982,10 +1040,12 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) goto err_out; sk->err = 0; + sk->done = 0; tp->snd_wnd = 0; tp->snd_wl1 = 0; tp->snd_wl2 = tp->write_seq; tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -1006,13 +1066,14 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->syn_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->write_queue, buff); tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); TCP_INC_STATS(TcpActiveOpens); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); return 0; err_out: @@ -1025,16 +1086,14 @@ err_out: * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ -void tcp_send_delayed_ack(struct sock *sk, int max_timeout) +void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; unsigned long timeout; /* Stay within the limit we were given */ - timeout = (tp->ato << 1) >> 1; - if (timeout > max_timeout) - timeout = max_timeout; - timeout += jiffies; + timeout = tp->ack.ato; + timeout += jiffies + (timeout>>2); /* Use new timeout only if there wasn't a older one earlier. 
*/ spin_lock_bh(&sk->timer_lock); @@ -1042,18 +1101,46 @@ void tcp_send_delayed_ack(struct sock *sk, int max_timeout) sock_hold(sk); tp->delack_timer.expires = timeout; } else { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) { + spin_unlock_bh(&sk->timer_lock); + + tcp_send_ack(sk); + __sock_put(sk); + return; + } + if (time_before(timeout, tp->delack_timer.expires)) tp->delack_timer.expires = timeout; } add_timer(&tp->delack_timer); spin_unlock_bh(&sk->timer_lock); + +#ifdef TCP_FORMAL_WINDOW + /* Explanation. Header prediction path does not handle + * case of zero window. If we send ACK immediately, pred_flags + * are reset when sending ACK. If rcv_nxt is advanced and + * ack is not sent, than delayed ack is scheduled. + * Hence, it is the best place to check for zero window. + */ + if (tp->pred_flags) { + if (tcp_receive_window(tp) == 0) + tp->pred_flags = 0; + } else { + if (skb_queue_len(&tp->out_of_order_queue) == 0 && + !tp->urg_data) + tcp_fast_path_on(tp); + } +#endif } /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *buff; @@ -1061,29 +1148,15 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (buff == NULL) { - /* Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. - * - * This is the one possible way that we can delay an - * ACK and have tp->ato indicate that we are in - * quick ack mode, so clear it. 
It is also the only - * possible way for ato to be zero, when ACK'ing a - * SYNACK because we've taken no ATO measurement yet. - */ - if (tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - if (!tp->ato) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, HZ/2); + tp->ack.pending = 1; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); return; } /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; @@ -1099,24 +1172,20 @@ void tcp_send_ack(struct sock *sk) /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. */ -void tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk) { - /* After a valid reset we can send no more. */ - if (!sk->zapped) { + if (sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - /* Write data can still be transmitted/retransmitted in the - * following states. If any other state is encountered, return. - * [listen/close will never occur here anyway] + /* Now this function is never called, while + * we have something not ACKed in queue. */ - if ((1 << sk->state) & - ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING)) - return; + BUG_TRAP(tp->snd_una == tp->snd_nxt); - if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && - ((skb = tp->send_head) != NULL)) { + if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una) + && ((skb = tp->send_head) != NULL)) { + int err; unsigned long win_size; /* We are probing the opening of a window @@ -1126,24 +1195,26 @@ void tcp_write_wakeup(struct sock *sk) win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { if (tcp_fragment(sk, skb, win_size)) - return; /* Let a retransmit get it. 
*/ + return -1; } - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + return err; } else { /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, - GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (skb == NULL) - return; + return -1; /* Reserve space for headers and set control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = 0; @@ -1152,13 +1223,18 @@ void tcp_write_wakeup(struct sock *sk) /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. + * + * RED-PEN: logically it should be snd_una-1. + * snd_nxt-1 will not be acked. snd_una==snd_nxt + * in this place however. Right? */ - TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; + TCP_SKB_CB(skb)->seq = tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb); } } + return -1; } /* A window probe timeout has occurred. If window is not closed send @@ -1167,11 +1243,32 @@ void tcp_write_wakeup(struct sock *sk) void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !tp->send_head) { + /* Cancel probe timer, if it is not required. 
*/ + tp->probes_out = 0; + tp->backoff = 0; + return; + } - tcp_write_wakeup(sk); - tp->pending = TIME_PROBE0; - tp->backoff++; - tp->probes_out++; - tcp_reset_xmit_timer (sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (err <= 0) { + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. + */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a38724e42e7d..bff4e872fce0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -23,29 +23,20 @@ #include int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 = TCP_RETR1; int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES; - -static void tcp_sltimer_handler(unsigned long); -static void tcp_syn_recv_timer(unsigned long); +static void tcp_retransmit_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_probe_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); static void tcp_twkill(unsigned long); -struct timer_list tcp_slow_timer = { - NULL, NULL, - 0, 0, - tcp_sltimer_handler, -}; - - -struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { - {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ - {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */ -}; - const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; /* @@ -56,17 +47,25 @@ const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; void tcp_init_xmit_timers(struct sock *sk) { - init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer); - sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer; - sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.delack_timer); - sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer; - sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.probe_timer); - sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer; - sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + spin_lock_init(&sk->timer_lock); + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_retransmit_timer; + 
tp->retransmit_timer.data = (unsigned long) sk; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + + init_timer(&tp->probe_timer); + tp->probe_timer.function=&tcp_probe_timer; + tp->probe_timer.data = (unsigned long) sk; + + init_timer(&sk->timer); + sk->timer.function=&tcp_keepalive_timer; + sk->timer.data = (unsigned long) sk; } /* @@ -79,7 +78,7 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) spin_lock_bh(&sk->timer_lock); switch (what) { - case TIME_RETRANS: + case TCP_TIME_RETRANS: /* When seting the transmit timer the probe timer * should not be set. * The delayed ack timer can be set if we are changing the @@ -89,29 +88,25 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) __sock_put(sk); if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer)) sock_hold(sk); - if (when > 120*HZ) { + if (when > TCP_RTO_MAX) { printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk)); - when = 120*HZ; + when = TCP_RTO_MAX; } mod_timer(&tp->retransmit_timer, jiffies+when); break; - case TIME_DACK: + case TCP_TIME_DACK: if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) sock_hold(sk); mod_timer(&tp->delack_timer, jiffies+when); break; - case TIME_PROBE0: + case TCP_TIME_PROBE0: if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer)) sock_hold(sk); mod_timer(&tp->probe_timer, jiffies+when); break; - case TIME_WRITE: - printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n"); - break; - default: printk(KERN_DEBUG "bug: unknown timer value\n"); }; @@ -127,6 +122,7 @@ void tcp_clear_xmit_timers(struct sock *sk) __sock_put(sk); if(tp->delack_timer.prev && del_timer(&tp->delack_timer)) __sock_put(sk); + tp->ack.blocked = 0; if(tp->probe_timer.prev && del_timer(&tp->probe_timer)) __sock_put(sk); if(sk->timer.prev && del_timer(&sk->timer)) @@ -134,39 +130,33 @@ void 
tcp_clear_xmit_timers(struct sock *sk) spin_unlock_bh(&sk->timer_lock); } -static void tcp_write_err(struct sock *sk, int force) +static void tcp_write_err(struct sock *sk) { - sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT; + sk->err = sk->err_soft ? : ETIMEDOUT; sk->error_report(sk); - tcp_clear_xmit_timers(sk); - - /* Do not time wait the socket. It is timed out and, hence, - * idle for 120*HZ. "force" argument is ignored, delete - * it eventually. - */ - - /* Clean up time. */ - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } /* A write timeout has occurred. Process the after effects. */ -static void tcp_write_timeout(struct sock *sk) +static int tcp_write_timeout(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int retry_until; - /* Look for a 'soft' timeout. */ - if ((sk->state == TCP_ESTABLISHED && - tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || - (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( + if ((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. 
Let me to cite the same draft, which requires for + us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block @@ -177,57 +167,70 @@ static void tcp_write_timeout(struct sock *sk) be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." - Golden words :-). - */ + Golden words :-). + */ - dst_negative_advice(&sk->dst_cache); + dst_negative_advice(&sk->dst_cache); + } + retry_until = sysctl_tcp_retries2; + if (sk->dead) + retry_until = sysctl_tcp_orphan_retries; } - - /* Have we tried to SYN too many times (repent repent 8)) */ - if (sk->state == TCP_SYN_SENT && - ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) || - (tp->syn_retries && tp->retransmits > tp->syn_retries))) { - tcp_write_err(sk, 1); - /* Don't FIN, we got nothing back */ - } else if (tp->retransmits > sysctl_tcp_retries2) { + + if (tp->retransmits >= retry_until) { /* Has it gone just too far? */ - tcp_write_err(sk, 0); + tcp_write_err(sk); + return 1; } + return 0; } -void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. 
*/ - tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5); + tp->ack.blocked = 1; + NET_INC_STATS_BH(DelayedACKLocked); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); goto out_unlock; } - if(!sk->zapped && - sk->tp_pinfo.af_tcp.delayed_acks && - sk->state != TCP_CLOSE) + if (tp->ack.pending) { + /* Delayed ACK missed: inflate ATO, leave pingpong mode */ + tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX); + tp->ack.pingpong = 0; tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKs); + } + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); } -void tcp_probe_timer(unsigned long data) +static void tcp_probe_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - if(sk->zapped) - goto out; + int max_probes; bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5); + goto out_unlock; + } + + if (sk->state == TCP_CLOSE) + goto out_unlock; + + if (tp->packets_out || !tp->send_head) { + tp->probes_out = 0; goto out_unlock; } @@ -246,151 +249,251 @@ void tcp_probe_timer(unsigned long data) * with RFCs, only probe timer combines both retransmission timeout * and probe timeout in one bottle. --ANK */ - if (tp->probes_out > sysctl_tcp_retries2) { - tcp_write_err(sk, 0); + max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2; + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); + TCP_CHECK_TIMER(sk); } out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* Kill off TIME_WAIT sockets once their lifetime has expired. 
*/ -int tcp_tw_death_row_slot = 0; -static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = - { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; -static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static int tcp_tw_death_row_slot = 0; +int tcp_tw_count = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static struct timer_list tcp_tw_timer = { function: tcp_twkill }; static void tcp_twkill(unsigned long data) { struct tcp_tw_bucket *tw; int killed = 0; - /* The death-row tw chains are only ever touched - * in BH context so no BH disabling (for now) is needed. + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. */ spin_lock(&tw_death_lock); - tw = tcp_tw_death_row[tcp_tw_death_row_slot]; - tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - spin_unlock(&tw_death_lock); - while(tw != NULL) { - struct tcp_tw_bucket *next = tw->next_death; + if (tcp_tw_count == 0) + goto out; + + while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { + tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; + tw->pprev_death = NULL; + spin_unlock(&tw_death_lock); tcp_timewait_kill(tw); tcp_tw_put(tw); + killed++; - tw = next; - } - if(killed != 0) { - struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; - atomic_sub(killed, &slt->count); + + spin_lock(&tw_death_lock); } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + net_statistics[smp_processor_id()*2].TimeWaited += killed; +out: + spin_unlock(&tw_death_lock); } /* These are always called from BH context. 
See callers in * tcp_input.c to verify this. */ -void tcp_tw_schedule(struct tcp_tw_bucket *tw) -{ - struct tcp_tw_bucket **tpp; - int slot; +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ spin_lock(&tw_death_lock); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; - if((tw->next_death = *tpp) != NULL) - (*tpp)->pprev_death = &tw->next_death; - *tpp = tw; - tw->pprev_death = tpp; - - tw->death_slot = slot; - atomic_inc(&tw->refcnt); + if (tw->pprev_death) { + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } spin_unlock(&tw_death_lock); - - tcp_inc_slow_timer(TCP_SLT_TWKILL); } -/* Happens rarely if at all, no care about scalability here. */ -void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,}; +static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) { struct tcp_tw_bucket **tpp; int slot; + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. 
following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1<> TCP_TW_RECYCLE_TICK; + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ if (tw->pprev_death) { if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; tw->pprev_death = NULL; + tcp_tw_count--; } else atomic_inc(&tw->refcnt); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + tpp = &tcp_tw_death_row[slot]; + } else { + tw->ttd = jiffies + (slot< (slot<next_death = *tpp) != NULL) (*tpp)->pprev_death = &tw->next_death; *tpp = tw; tw->pprev_death = tpp; - tw->death_slot = slot; + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); spin_unlock(&tw_death_lock); - - /* Timer was incremented when we first entered the table. */ } -/* This is for handling early-kills of TIME_WAIT sockets. 
*/ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_twcal_tick(unsigned long dummy) { + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + spin_lock(&tw_death_lock); - if (tw->pprev_death) { - if(tw->next_death) - tw->next_death->pprev_death = tw->pprev_death; - *tw->pprev_death = tw->next_death; - tw->pprev_death = NULL; - tcp_tw_put(tw); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; nnext_death; + tw->pprev_death = NULL; + + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + tcp_twcal_jiffie = j; + tcp_twcal_hand = slot; + } + + if (tcp_twcal_row[slot] != NULL) { + mod_timer(&tcp_twcal_timer, j); + goto out; + } + } + j += (1<tp_pinfo.af_tcp; - /* We are reset. We will send no more retransmits. */ - if(sk->zapped) - goto out; - bh_lock_sock(sk); if (sk->lock.users) { /* Try again later */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20); goto out_unlock; } - /* Clear delay ack timer. */ - tcp_clear_xmit_timer(sk, TIME_DACK); + if (sk->state == TCP_CLOSE || tp->packets_out == 0) + goto out_unlock; + + BUG_TRAP(!skb_queue_empty(&sk->write_queue)); + + if (tcp_write_timeout(sk)) + goto out_unlock; /* RFC 2018, clear all 'sacked' flags in retransmission queue, * the sender may have dropped out of order frames and we must @@ -426,11 +529,19 @@ void tcp_retransmit_timer(unsigned long data) tp->snd_cwnd = 1; } - tp->retransmits++; - tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. 
+ */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + TCP_CHECK_TIMER(sk); + goto out_unlock; + } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized @@ -448,132 +559,105 @@ void tcp_retransmit_timer(unsigned long data) * the 120 second clamps though! */ tp->backoff++; - tp->rto = min(tp->rto << 1, 120*HZ); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - tcp_write_timeout(sk); + tp->retransmits++; + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* - * Slow timer for SYN-RECV sockets + * Timer for listening sockets */ -static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now) -{ - struct open_request *prev, *req; - - prev = (struct open_request *) &tp->syn_wait_queue; - for(req = tp->syn_wait_queue; req; ) { - struct open_request *next = req->dl_next; - - if (!req->sk && (long)(now - req->expires) >= 0) { - tcp_synq_unlink(tp, req, prev); - if(req->retrans >= sysctl_tcp_retries1) { - (*req->class->destructor)(req); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - tcp_openreq_free(req); - if (! tp->syn_wait_queue) - break; - } else { - unsigned long timeo; - struct open_request *rp; - - (*req->class->rtx_syn_ack)(sk, req); - req->retrans++; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - (120 * HZ)); - req->expires = now + timeo; - rp = prev->dl_next; - tcp_synq_queue(tp, req); - if(rp != prev->dl_next) - prev = prev->dl_next; - } - } else - prev = req; - req = next; - } -} - -/* This now scales very nicely. 
-DaveM */ -static void tcp_syn_recv_timer(unsigned long data) +static void tcp_synack_timer(struct sock *sk) { - struct sock *sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; - int i; - - read_lock(&tcp_lhash_lock); - for(i = 0; i < TCP_LHTABLE_SIZE; i++) { - sk = tcp_listening_hash[i]; - while(sk) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - /* TCP_LISTEN is implied. */ - bh_lock_sock(sk); - if (!sk->lock.users && tp->syn_wait_queue) - tcp_do_syn_queue(sk, tp, now); - bh_unlock_sock(sk); - sk = sk->next; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. 
+ */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; } } - read_unlock(&tcp_lhash_lock); -} - -void tcp_sltimer_handler(unsigned long data) -{ - struct tcp_sl_timer *slt = tcp_slt_array; - unsigned long next = ~0UL; - unsigned long now = jiffies; - int i; - for (i=0; i < TCP_SLT_MAX; i++, slt++) { - if (atomic_read(&slt->count)) { - long trigger; - - trigger = slt->period - ((long)(now - slt->last)); - - if (trigger <= 0) { - (*slt->handler)((unsigned long) slt); - slt->last = now; - trigger = slt->period; + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if ((long)(now - req->expires) >= 0) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); } - - /* Only reschedule if some events remain. */ - if (atomic_read(&slt->count)) - next = min(next, trigger); + reqp = &req->dl_next; } - } - if (next != ~0UL) - mod_timer(&tcp_slow_timer, (now + next)); -} -/* __tcp_inc_slow_timer is called when an slow timer is started - * first time (slt->count was 0). There is race condition between - * timer creation and deletion and if we do not force adding timer here, - * we might lose timer. 
We could avoid it with global spinlock, but - * it is apparently overkill, so that we restart timer ALWAYS when - * this function is entered, it guarantees that timer will not lost. - */ + i = (i+1)&(TCP_SYNQ_HSIZE-1); -void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) -{ - unsigned long now = jiffies; - unsigned long when; + } while (--budget > 0); - slt->last = now; + lopt->clock_hand = i; - when = now + slt->period; - - if (tcp_slow_timer.prev && - (long)(tcp_slow_timer.expires - when) < 0) - when = tcp_slow_timer.expires; - - mod_timer(&tcp_slow_timer, when); + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } void tcp_delete_keepalive_timer (struct sock *sk) @@ -595,6 +679,9 @@ void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) void tcp_set_keepalive(struct sock *sk, int val) { + if ((1<state)&(TCPF_CLOSE|TCPF_LISTEN)) + return; + if (val && !sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp)); else if (!val) @@ -602,7 +689,7 @@ void tcp_set_keepalive(struct sock *sk, int val) } -void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -616,14 +703,31 @@ void tcp_keepalive_timer (unsigned long data) goto out; } - if (sk->state == TCP_FIN_WAIT2 && sk->dead) + if (sk->state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->state == TCP_FIN_WAIT2 && sk->dead) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); goto death; + } - if (!sk->keepopen) + if (!sk->keepopen || sk->state == TCP_CLOSE) goto out; elapsed = keepalive_time_when(tp); - if (!((1<state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2))) + + /* It is alive without keepalive 8) */ + if (tp->packets_out || tp->send_head) goto 
resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; @@ -632,28 +736,30 @@ void tcp_keepalive_timer (unsigned long data) if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_write_err(sk, 1); + tcp_write_err(sk); goto out; } - tp->probes_out++; - tp->pending = TIME_KEEPOPEN; - tcp_write_wakeup(sk); - elapsed = keepalive_intvl_when(tp); + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } } else { /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ - if (keepalive_time_when(tp) > elapsed) - elapsed = keepalive_time_when(tp) - elapsed; - else - elapsed = 0; + elapsed = keepalive_time_when(tp) - elapsed; } + TCP_CHECK_TIMER(sk); + resched: tcp_reset_keepalive_timer (sk, elapsed); goto out; death: - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); tcp_done(sk); out: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9ace56abdbc9..c052d2eb8e59 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.77 2000/01/09 02:19:44 davem Exp $ + * Version: $Id: udp.c,v 1.79 2000/01/18 08:24:20 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -368,31 +368,16 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) break; } - /* - * Various people wanted BSD UDP semantics. Well they've come - * back out because they slow down response to stuff like dead - * or unreachable name servers and they screw term users something - * chronic. Oh and it violates RFC1122. So basically fix your - * client code people. - */ - /* * RFC1122: OK. Passes ICMP errors back to application, as per - * 4.1.3.3. After the comment above, that should be no surprise. 
- */ - - if (!harderr && !sk->protinfo.af_inet.recverr) - goto out; - - /* - * 4.x BSD compatibility item. Break RFC1122 to - * get BSD socket semantics. + * 4.1.3.3. */ - if(sk->bsdism && sk->state!=TCP_ESTABLISHED && !sk->protinfo.af_inet.recverr) - goto out; - - if (sk->protinfo.af_inet.recverr) + if (!sk->protinfo.af_inet.recverr) { + if (!harderr || sk->state != TCP_ESTABLISHED) + goto out; + } else { ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + } sk->err = err; sk->error_report(sk); out: @@ -629,15 +614,13 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch(cmd) { - case TIOCOUTQ: + case SIOCOUTQ: { - unsigned long amount; - - amount = sock_wspace(sk); + int amount = atomic_read(&sk->wmem_alloc); return put_user(amount, (int *)arg); } - case TIOCINQ: + case SIOCINQ: { struct sk_buff *skb; unsigned long amount; @@ -663,6 +646,17 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return(0); } +static __inline__ int __udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)); +} + +static __inline__ int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. 
@@ -699,31 +693,21 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; } -#ifndef CONFIG_UDP_DELAY_CSUM - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, - copied); -#else if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); - } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) + } else if (msg->msg_flags&MSG_TRUNC) { + if (__udp_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else { - unsigned int csum; + err = copy_and_csum_toiovec(msg->msg_iov, skb, sizeof(struct udphdr)); - err = 0; - csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); - csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, - copied, csum, &err); if (err) - goto out_free; - if ((unsigned short)csum_fold(csum)) goto csum_copy_err; } -#endif + if (err) goto out_free; sk->stamp=skb->stamp; @@ -744,7 +728,6 @@ out_free: out: return err; -#ifdef CONFIG_UDP_DELAY_CSUM csum_copy_err: UDP_INC_STATS_BH(UdpInErrors); @@ -768,7 +751,6 @@ csum_copy_err: * as some normal condition. */ return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; -#endif } int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -831,9 +813,9 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) * Charge it to the socket, dropping if the queue is full. 
*/ -#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM) +#if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { + if (__udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UdpInErrors); IP_INC_STATS_BH(IpInDiscards); ip_statistics[smp_processor_id()*2].IpInDelivers--; @@ -855,12 +837,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return 0; } - -static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) -{ - udp_queue_rcv_skb(sk, skb); -} - /* * Multicasts and broadcasts go to each listener. * @@ -889,7 +865,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, skb1 = skb_clone(skb, GFP_ATOMIC); if(skb1) - udp_deliver(sk, skb1); + udp_queue_rcv_skb(sk, skb1); sk = sknext; } while(sknext); } else @@ -898,30 +874,25 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, return 0; } -static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, u32 saddr, u32 daddr, - int full_csum_deferred) +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. 
+ */ +static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) { - if (!full_csum_deferred) { - if (uh->check) { - if (skb->ip_summed == CHECKSUM_HW && - udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - if (skb->ip_summed == CHECKSUM_NONE && - udp_check(uh, ulen, saddr, daddr, - csum_partial((char *)uh, ulen, 0))) - return -1; - } - } else { - if (uh->check == 0) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else if (skb->ip_summed == CHECKSUM_HW) { - if (udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - } + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + if (udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ return 0; } @@ -961,50 +932,33 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } skb_trim(skb, ulen); - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { - int defer; + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto csum_error; -#ifdef CONFIG_UDP_DELAY_CSUM - defer = 1; -#else - defer = 0; -#endif - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer)) - goto csum_error; + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); - } sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); - - if (sk == NULL) { - /* No socket. 
Drop packet silently, if checksum is wrong */ - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, 0)) - goto csum_error; - - UDP_INC_STATS_BH(UdpNoPorts); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - /* - * Hmm. We got an UDP packet to a port to which we - * don't wanna listen. Ignore it. - */ - kfree_skb(skb); - return(0); - } - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, -#ifdef CONFIG_UDP_DELAY_CSUM - 1 -#else - (sk->no_check & UDP_CSUM_NORCV) != 0 -#endif - )) { + if (sk != NULL) { + udp_queue_rcv_skb(sk, skb); sock_put(sk); - goto csum_error; + return 0; } - udp_deliver(sk, skb); - __sock_put(sk); - return 0; + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(UdpNoPorts); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); csum_error: /* @@ -1090,10 +1044,6 @@ struct proto udp_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ NULL, /* destroy */ @@ -1107,7 +1057,5 @@ struct proto udp_prot = { udp_v4_hash, /* hash */ udp_v4_unhash, /* unhash */ udp_v4_get_port, /* good_socknum */ - 128, /* max_header */ - 0, /* retransmits */ "UDP", /* name */ }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8430729e5e09..11a435ab33a5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6,7 +6,7 @@ * Pedro Roque * Alexey Kuznetsov * - * $Id: addrconf.c,v 1.55 1999/12/15 22:39:40 davem Exp $ + * $Id: addrconf.c,v 1.57 2000/01/18 08:24:21 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -682,6 +682,23 @@ static int ipv6_generate_eui64(u8 
*eui, struct net_device *dev) } return -1; } + +static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) +{ + int err = -1; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + memcpy(eui, ifp->addr.s6_addr+8, 8); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + return err; +} #endif /* @@ -859,7 +876,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) #ifdef CONFIG_IPV6_EUI64 if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); - if (ipv6_generate_eui64(addr.s6_addr + 8, dev)) { + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); return; } @@ -1519,7 +1537,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) */ if (ifp->idev->cnf.forwarding == 0 && - (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) == 0 && + (dev->flags&IFF_LOOPBACK) == 0 && (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { struct in6_addr all_routers; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 68badee52c2a..a8d396ba3653 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.49 1999/12/15 22:39:43 davem Exp $ + * $Id: af_inet6.c,v 1.52 2000/01/18 08:24:21 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -85,13 +85,17 @@ extern void ipv6_sysctl_register(void); extern void ipv6_sysctl_unregister(void); #endif +#ifdef INET_REFCNT_DEBUG atomic_t inet6_sock_nr; +#endif static void inet6_sock_destruct(struct sock *sk) { inet_sock_destruct(sk); +#ifdef INET_REFCNT_DEBUG atomic_dec(&inet6_sock_nr); +#endif MOD_DEC_USE_COUNT; } @@ -140,9 +144,6 @@ static int inet6_create(struct socket *sock, int protocol) sk->prot = prot; sk->backlog_rcv = prot->backlog_rcv; - 
sk->timer.data = (unsigned long)sk; - sk->timer.function = &tcp_keepalive_timer; - sk->net_pinfo.af_inet6.hop_limit = -1; sk->net_pinfo.af_inet6.mcast_hops = -1; sk->net_pinfo.af_inet6.mc_loop = 1; @@ -158,8 +159,16 @@ static int inet6_create(struct socket *sock, int protocol) sk->protinfo.af_inet.mc_index = 0; sk->protinfo.af_inet.mc_list = NULL; + if (ipv4_config.no_pmtu_disc) + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; + else + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; + + +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet6_sock_nr); atomic_inc(&inet_sock_nr); +#endif MOD_INC_USE_COUNT; if (sk->type==SOCK_RAW && protocol==IPPROTO_RAW) @@ -421,7 +430,7 @@ struct proto_ops inet6_stream_ops = { sock_no_socketpair, /* a do nothing */ inet_accept, /* ok */ inet6_getname, - inet_poll, /* ok */ + tcp_poll, /* ok */ inet6_ioctl, /* must change */ inet_listen, /* ok */ inet_shutdown, /* ok */ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index cfa18eee80d1..f1c211532dec 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: icmp.c,v 1.25 2000/01/09 02:19:54 davem Exp $ + * $Id: icmp.c,v 1.26 2000/01/19 04:06:19 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -146,19 +146,19 @@ static int icmpv6_getfrag(const void *data, struct in6_addr *saddr, */ if (offset) { - csum = csum_partial_copy((void *) msg->data + - offset - sizeof(struct icmp6hdr), - buff, len, msg->csum); + csum = csum_partial_copy_nocheck((void *) msg->data + + offset - sizeof(struct icmp6hdr), + buff, len, msg->csum); msg->csum = csum; return 0; } - csum = csum_partial_copy((void *) &msg->icmph, buff, - sizeof(struct icmp6hdr), msg->csum); + csum = csum_partial_copy_nocheck((void *) &msg->icmph, buff, + sizeof(struct icmp6hdr), msg->csum); - csum = csum_partial_copy((void *) msg->data, - buff + sizeof(struct icmp6hdr), - len - sizeof(struct icmp6hdr), csum); + csum = csum_partial_copy_nocheck((void *) msg->data, + buff + sizeof(struct icmp6hdr), + len - 
sizeof(struct icmp6hdr), csum); icmph = (struct icmp6hdr *) buff; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 099953e5350c..d458adc934dc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: ip6_fib.c,v 1.19 1999/08/31 07:04:00 davem Exp $ + * $Id: ip6_fib.c,v 1.20 2000/01/16 05:11:37 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -926,8 +926,8 @@ int fib6_del(struct rt6_info *rt) #if RT6_DEBUG >= 2 if (rt->u.dst.obsolete>0) { - BUG_TRAP(rt->u.dst.obsolete<=0); - return -EFAULT; + BUG_TRAP(fn==NULL || rt->u.dst.obsolete<=0); + return -ENOENT; } #endif if (fn == NULL || rt == &ip6_null_entry) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index eddf935a087e..873d22c3de01 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.30 2000/01/09 02:19:49 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.31 2000/01/16 05:11:38 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -192,7 +192,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, kfree_skb(pktopt); sk->destruct = inet_sock_destruct; +#ifdef INET_REFCNT_DEBUG atomic_dec(&inet6_sock_nr); +#endif MOD_DEC_USE_COUNT; retv = 0; break; @@ -271,7 +273,7 @@ update: if (sk->type == SOCK_STREAM) { if (opt) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - if ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + if (!((1<state)&(TCPF_LISTEN|TCPF_CLOSE)) && sk->daddr != LOOPBACK4_IPV6) { tp->ext_header_len = opt->opt_flen + opt->opt_nflen; tcp_sync_mss(sk, tp->pmtu_cookie); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index ce9f17adcc67..412b0b5e6e86 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * 
Pedro Roque * - * $Id: mcast.c,v 1.28 2000/01/09 02:19:50 davem Exp $ + * $Id: mcast.c,v 1.29 2000/01/18 08:24:21 davem Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -500,7 +500,8 @@ void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) if (dev->hard_header) { unsigned char ha[MAX_ADDR_LEN]; ndisc_mc_map(snd_addr, ha, dev, 1); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len); + if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0) + goto out; } if (ipv6_get_lladdr(dev, &addr_buf)) { @@ -508,7 +509,7 @@ void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) printk(KERN_DEBUG "igmp6: %s no linklocal address\n", dev->name); #endif - return; + goto out; } ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len); @@ -532,6 +533,10 @@ void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) else ICMP6_INC_STATS(Icmp6OutGroupMembResponses); ICMP6_INC_STATS(Icmp6OutMsgs); + return; + +out: + kfree_skb(skb); } static void igmp6_join_group(struct ifmcaddr6 *ma) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 04ecdea9c0ed..574bc165cce5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.31 2000/01/09 02:19:50 davem Exp $ + * $Id: raw.c,v 1.33 2000/01/18 08:24:22 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -763,10 +763,6 @@ struct proto rawv6_prot = { udpv6_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ NULL, /* ioctl */ rawv6_init_sk, /* init */ inet6_destroy_sock, /* destroy */ @@ -780,7 +776,5 @@ struct proto rawv6_prot = { raw_v6_hash, /* hash */ raw_v6_unhash, /* unhash */ NULL, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "RAW", /* 
name */ }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 668f61bfba24..dc6020c33ffa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: route.c,v 1.44 2000/01/09 02:19:51 davem Exp $ + * $Id: route.c,v 1.45 2000/01/16 05:11:38 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -93,7 +93,7 @@ struct dst_ops ip6_dst_ops = { struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(1), 1, &loopback_dev, - -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, #ifdef CONFIG_NET_CLS_ROUTE @@ -296,6 +296,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; + rt->u.dst.flags |= DST_HOST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -687,6 +688,8 @@ int ip6_route_add(struct in6_rtmsg *rtmsg) ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst); rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->u.dst.flags = DST_HOST; ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -940,6 +943,7 @@ source_ok: ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); nrt->rt6i_nexthop = neigh_clone(neigh); @@ -1025,6 +1029,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, goto out; ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr); nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES; @@ -1045,7 +1050,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) struct rt6_info *rt; rt = 
dst_alloc(&ip6_dst_ops); - + if (rt) { rt->u.dst.input = ort->u.dst.input; rt->u.dst.output = ort->u.dst.output; @@ -1193,7 +1198,8 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev) rt = dst_alloc(&ip6_dst_ops); if (rt == NULL) return -ENOMEM; - + + rt->u.dst.flags = DST_HOST; rt->u.dst.input = ip6_input; rt->u.dst.output = ip6_output; rt->rt6i_dev = dev_get_by_name("lo"); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e87ef0c3ef04..420b81f4a5e7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque * - * $Id: tcp_ipv6.c,v 1.116 2000/01/09 02:19:52 davem Exp $ + * $Id: tcp_ipv6.c,v 1.118 2000/01/18 08:24:22 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -46,11 +46,6 @@ #include -extern int sysctl_max_syn_backlog; -extern int sysctl_tcp_tw_recycle; -extern __u32 sysctl_wmem_max; -extern __u32 sysctl_rmem_max; - static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req); static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, @@ -58,11 +53,6 @@ static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static int tcp_v6_xmit(struct sk_buff *skb); -static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, - struct ipv6hdr *ip6h, - struct tcphdr *th, - int iif, - struct open_request **prevp); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; @@ -282,9 +272,10 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor * * The sockhash lock must be held as a reader here. 
*/ -static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 hnum, - int dif) + +static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) { struct tcp_ehash_bucket *head; struct sock *sk; @@ -314,8 +305,7 @@ static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, } } read_unlock(&head->lock); - - return tcp_v6_lookup_listener(daddr, hnum, dif); + return NULL; hit: sock_hold(sk); @@ -323,6 +313,21 @@ hit: return sk; } + +static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) +{ + struct sock *sk; + + sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v6_lookup_listener(daddr, hnum, dif); +} + #define tcp_v6_lookup(sa, sp, da, dp, dif) \ ({ struct sock *___sk; \ local_bh_disable(); \ @@ -331,6 +336,46 @@ hit: ___sk; \ }) + +/* + * Open request hash tables. 
+ */ + +static __inline__ unsigned tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport) +{ + unsigned h = raddr->s6_addr32[3] ^ rport; + h ^= h>>16; + h ^= h>>8; + return h&(TCP_SYNQ_HSIZE-1); +} + +static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, + struct ipv6hdr *ip6h, + struct tcphdr *th, + int iif, + struct open_request ***prevp) +{ + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + __u16 rport = th->source; + + for (prev = &lopt->syn_table[tcp_v6_synq_hash(&ip6h->saddr, rport)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->class->family == AF_INET6 && + !ipv6_addr_cmp(&req->af.v6_req.rmt_addr, &ip6h->saddr) && + !ipv6_addr_cmp(&req->af.v6_req.loc_addr, &ip6h->daddr) && + (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -375,10 +420,10 @@ static int tcp_v6_check_established(struct sock *sk) !ipv6_addr_cmp(&tw->v6_daddr, saddr) && !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr) && sk2->bound_dev_if == sk->bound_dev_if) { -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { + if (tw->substate == TCP_TIME_WAIT && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { /* See comment in tcp_ipv4.c */ if ((tp->write_seq = tw->snd_nxt + 2) == 0) tp->write_seq = 1; @@ -388,8 +433,7 @@ static int tcp_v6_check_established(struct sock *sk) skp = &head->chain; goto unique; } else -#endif - goto not_unique; + goto not_unique; } } tw = NULL; @@ -399,9 +443,7 @@ static int tcp_v6_check_established(struct sock *sk) goto not_unique; } -#ifdef CONFIG_TCP_TW_RECYCLE unique: -#endif BUG_TRAP(sk->pprev==NULL); if ((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -411,17 +453,16 @@ unique: 
sock_prot_inc_use(sk->prot); write_unlock_bh(&head->lock); -#ifdef CONFIG_TCP_TW_RECYCLE if (tw) { /* Silly. Should hash-dance instead... */ local_bh_disable(); tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + NET_INC_STATS_BH(TimeWaitRecycled); local_bh_enable(); tcp_tw_put(tw); } -#endif return 0; not_unique: @@ -467,9 +508,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_type; int err; - if (sk->state != TCP_CLOSE) - return(-EISCONN); - if (addr_len < sizeof(struct sockaddr_in6)) return(-EINVAL); @@ -501,18 +539,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if(addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; - /* We may need to bind the socket. */ - if (sk->num==0 && sk->prot->get_port(sk, 0)) - return -EAGAIN; - sk->sport = htons(sk->num); - -#ifdef CONFIG_TCP_TW_RECYCLE if (tp->ts_recent_stamp && ipv6_addr_cmp(&np->daddr, &usin->sin6_addr)) { tp->ts_recent = 0; tp->ts_recent_stamp = 0; tp->write_seq = 0; } -#endif ipv6_addr_copy(&np->daddr, &usin->sin6_addr); np->flow_label = fl.fl6_flowlabel; @@ -602,8 +633,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); err = -ENOBUFS; - buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), - 0, GFP_KERNEL); + buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL); if (buff == NULL) goto failure; @@ -629,46 +659,6 @@ failure: return err; } -static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) -{ - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - int retval = -EINVAL; - - lock_sock(sk); - /* - * Do sanity checking for sendmsg/sendto/send - */ - - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out; - if (msg->msg_name) { - struct sockaddr_in6 *addr=(struct sockaddr_in6 *)msg->msg_name; - - if (msg->msg_namelen < sizeof(*addr)) - goto out; - - if (addr->sin6_family && addr->sin6_family != AF_INET6) - 
goto out; - retval = -ENOTCONN; - - if(sk->state == TCP_CLOSE) - goto out; - retval = -EISCONN; - if (addr->sin6_port != sk->dport) - goto out; - if (ipv6_addr_cmp(&addr->sin6_addr, &np->daddr)) - goto out; - if (np->sndflow && np->flow_label != (addr->sin6_flowinfo&IPV6_FLOWINFO_MASK)) - goto out; - } - - retval = tcp_do_sendmsg(sk, msg); - -out: - release_sock(sk); - return retval; -} - void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, struct inet6_skb_parm *opt, int type, int code, unsigned char *header, __u32 info) @@ -701,6 +691,9 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, if (sk->lock.users) NET_INC_STATS_BH(LockDroppedIcmps); + if (sk->state == TCP_CLOSE) + goto out; + tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { @@ -719,7 +712,7 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, goto out; /* icmp should have updated the destination cache entry */ - dst = sk_dst_check(sk, np->dst_cookie); + dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct flowi fl; @@ -736,7 +729,8 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); - } + } else + dst_clone(dst); if (dst->error) { sk->err_soft = -dst->error; @@ -752,7 +746,7 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, /* Might be for an open_request */ switch (sk->state) { - struct open_request *req, *prev; + struct open_request *req, **prev; struct ipv6hdr hd; case TCP_LISTEN: if (sk->lock.users) @@ -765,35 +759,19 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, if (!req) goto out; - if (req->sk) { - struct sock *nsk = req->sk; - - sock_hold(nsk); - bh_unlock_sock(sk); - sock_put(sk); - sk = nsk; - - BUG_TRAP(sk->lock.users==0); - - tp = &sk->tp_pinfo.af_tcp; - if (!between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(OutOfWindowIcmps); - goto out; - } - } else { - if (seq != 
req->snt_isn) { - NET_INC_STATS_BH(OutOfWindowIcmps); - goto out; - } + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); - tp->syn_backlog--; - tcp_synq_unlink(tp, req, prev); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->class->destructor(req); - tcp_openreq_free(req); + if (seq != req->snt_isn) { + NET_INC_STATS_BH(OutOfWindowIcmps); goto out; } - break; + + tcp_synq_drop(sk, req, prev); + goto out; + case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can, it SYNs are crossed. --ANK */ @@ -802,7 +780,6 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, sk->err = err; sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } else { sk->err_soft = err; @@ -823,12 +800,13 @@ out: } -static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) +static int tcp_v6_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) { struct sk_buff * skb; - struct dst_entry *dst; struct ipv6_txoptions *opt = NULL; struct flowi fl; + int err = -1; fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; @@ -838,24 +816,26 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) fl.uli_u.ports.dport = req->rmt_port; fl.uli_u.ports.sport = sk->sport; - opt = sk->net_pinfo.af_inet6.opt; - if (opt == NULL && - sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && - req->af.v6_req.pktopts) { - struct sk_buff *pktopts = req->af.v6_req.pktopts; - struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)pktopts->cb; - if (rxopt->srcrt) - opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); - } + if (dst == NULL) { + opt = sk->net_pinfo.af_inet6.opt; + if (opt == NULL && + sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && + req->af.v6_req.pktopts) { + struct sk_buff *pktopts = req->af.v6_req.pktopts; + struct inet6_skb_parm *rxopt = (struct 
inet6_skb_parm *)pktopts->cb; + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); + } - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - fl.nl_u.ip6_u.daddr = rt0->addr; - } + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } - dst = ip6_route_output(sk, &fl); - if (dst->error) - goto done; + dst = ip6_route_output(sk, &fl); + if (dst->error) + goto done; + } skb = tcp_make_synack(sk, dst, req); if (skb) { @@ -866,21 +846,22 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) csum_partial((char *)th, skb->len, skb->csum)); fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; - ip6_xmit(sk, skb, &fl, opt); + err = ip6_xmit(sk, skb, &fl, opt); + if (err == NET_XMIT_CN) + err = 0; } done: dst_release(dst); if (opt && opt != sk->net_pinfo.af_inet6.opt) sock_kfree_s(sk, opt, opt->tot_len); + return err; } static void tcp_v6_or_free(struct open_request *req) { - if (req->af.v6_req.pktopts) { + if (req->af.v6_req.pktopts) kfree_skb(req->af.v6_req.pktopts); - req->af.v6_req.pktopts = NULL; - } } static struct or_calltable or_ipv6 = { @@ -907,125 +888,320 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) } -#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ -#define BACKLOGMAX(sk) sysctl_max_syn_backlog +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + th->check = 0; + + th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, + csum_partial((char *)th, th->doff<<2, + skb->csum)); +} -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? 
-- erics - */ -static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) + +static void tcp_v6_send_reset(struct sk_buff *skb) { - struct tcp_opt tp; - struct open_request *req; - __u32 isn = TCP_SKB_CB(skb)->when; + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; + struct flowi fl; - if (skb->protocol == __constant_htons(ETH_P_IP)) - return tcp_v4_conn_request(sk, skb); + if (th->rst) + return; - /* FIXME: do the same check for anycast */ if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) - goto drop; - - if (isn == 0) - isn = tcp_v6_init_sequence(sk,skb); + return; /* - * There are no SYN attacks on IPv6, yet... + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. */ - if (BACKLOG(sk) >= BACKLOGMAX(sk)) { - (void)(net_ratelimit() && - printk(KERN_INFO "droping syn ack:%d max:%d\n", - BACKLOG(sk), BACKLOGMAX(sk))); - goto drop; - } - req = tcp_openreq_alloc(); - if (req == NULL) { - goto drop; - } - - BACKLOG(sk)++; - - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - - req->rcv_isn = TCP_SKB_CB(skb)->seq; - req->snt_isn = isn; - tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr), GFP_ATOMIC); + if (buff == NULL) + return; - tp.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr)); - tcp_parse_options(NULL, skb->h.th, &tp, 0); + t1 = (struct tcphdr *) skb_push(buff,sizeof(struct tcphdr)); - req->mss = tp.mss_clamp; - req->ts_recent = tp.saw_tstamp ? 
tp.rcv_tsval : 0; - req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; - req->snd_wscale = tp.snd_wscale; - req->wscale_ok = tp.wscale_ok; - req->rmt_port = skb->h.th->source; - ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); - ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); - req->af.v6_req.pktopts = NULL; - if (ipv6_opt_accepted(sk, skb) || - sk->net_pinfo.af_inet6.rxopt.bits.rxinfo || - sk->net_pinfo.af_inet6.rxopt.bits.rxhlim) { - atomic_inc(&skb->users); - req->af.v6_req.pktopts = skb; + /* Swap the send and the receive. */ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = sizeof(*t1)/4; + t1->rst = 1; + + if(th->ack) { + t1->seq = th->ack_seq; + } else { + t1->ack = 1; + t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff<<2)); } - req->af.v6_req.iif = sk->bound_dev_if; - /* So that link locals have meaning */ - if (!sk->bound_dev_if && ipv6_addr_type(&req->af.v6_req.rmt_addr)&IPV6_ADDR_LINKLOCAL) - req->af.v6_req.iif = tcp_v6_iif(skb); + buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - req->class = &or_ipv6; - req->retrans = 0; - req->sk = NULL; + fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; + fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; + fl.fl6_flowlabel = 0; - tcp_v6_send_synack(sk, req); + t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, + fl.nl_u.ip6_u.daddr, + sizeof(*t1), IPPROTO_TCP, + buff->csum); - req->expires = jiffies + TCP_TIMEOUT_INIT; - tcp_inc_slow_timer(TCP_SLT_SYNACK); - tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.uli_u.ports.dport = t1->dest; + fl.uli_u.ports.sport = t1->source; - return 0; + /* sk = NULL, but it is safe for now. RST socket required. 
*/ + buff->dst = ip6_route_output(NULL, &fl); -drop: - TCP_INC_STATS_BH(TcpAttemptFails); - return 0; /* don't send reset */ -} + if (buff->dst->error == 0) { + ip6_xmit(NULL, buff, &fl, NULL); + TCP_INC_STATS_BH(TcpOutSegs); + TCP_INC_STATS_BH(TcpOutRsts); + return; + } -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) -{ - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - th->check = 0; - - th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, - csum_partial((char *)th, th->doff<<2, - skb->csum)); + kfree_skb(buff); } -static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - struct open_request *req, - struct dst_entry *dst) +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) { - struct ipv6_pinfo *np; + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; struct flowi fl; - struct tcp_opt *newtp; - struct sock *newsk; - struct ipv6_txoptions *opt; + int tot_len = sizeof(struct tcphdr); - if (skb->protocol == __constant_htons(ETH_P_IP)) { - /* - * v6 mapped - */ + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr), GFP_ATOMIC); + if (buff == NULL) + return; - newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr)); - if (newsk == NULL) - return NULL; + if (ts) + tot_len += 3*4; + + t1 = (struct tcphdr *) skb_push(buff,tot_len); + + /* Swap the send and the receive. 
*/ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len/4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); + + if (ts) { + u32 *ptr = (u32*)(t1 + 1); + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tcp_time_stamp); + *ptr = htonl(ts); + } + + buff->csum = csum_partial((char *)t1, tot_len, 0); + + fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; + fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; + fl.fl6_flowlabel = 0; + + t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, + fl.nl_u.ip6_u.daddr, + tot_len, IPPROTO_TCP, + buff->csum); + + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.uli_u.ports.dport = t1->dest; + fl.uli_u.ports.sport = t1->source; + + buff->dst = ip6_route_output(NULL, &fl); + + if (buff->dst->error == 0) { + ip6_xmit(NULL, buff, &fl, NULL); + TCP_INC_STATS_BH(TcpOutSegs); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v6_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, + tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); + + tcp_tw_put(tw); +} + +static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ + tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); +} + + +static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct open_request *req, **prev; + struct tcphdr *th = skb->h.th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Find possible connection requests. 
*/ + req = tcp_v6_search_req(tp, skb->nh.ipv6h, th, tcp_v6_iif(skb), &prev); + if (req) + return tcp_check_req(sk, skb, req, prev); + + if (tp->accept_queue) { + struct sock *nsk; + + nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, + th->source, + &skb->nh.ipv6h->daddr, + ntohs(th->dest), + tcp_v6_iif(skb)); + + if (nsk) { + if (nsk->state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)sk); + return NULL; + } + } + +#if 0 /*def CONFIG_SYN_COOKIES*/ + if (!th->rst && (th->syn || th->ack)) + sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static void tcp_v6_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_listen_opt *lopt = tp->listen_opt; + unsigned h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port); + + req->sk = NULL; + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->index = h; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +} + + +/* FIXME: this is substantially similar to the ipv4 code. + * Can some kind of merge be done? -- erics + */ +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt tp; + struct open_request *req = NULL; + __u32 isn = TCP_SKB_CB(skb)->when; + + if (skb->protocol == __constant_htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); + + /* FIXME: do the same check for anycast */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + goto drop; + + /* + * There are no SYN attacks on IPv6, yet... 
+ */ + if (tcp_synq_is_full(sk) && !isn) { + if (net_ratelimit()) + printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); + goto drop; + } + + if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; + + tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; + + tcp_parse_options(NULL, skb->h.th, &tp, 0); + + tcp_openreq_init(req, &tp, skb); + + req->class = &or_ipv6; + ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); + req->af.v6_req.pktopts = NULL; + if (ipv6_opt_accepted(sk, skb) || + sk->net_pinfo.af_inet6.rxopt.bits.rxinfo || + sk->net_pinfo.af_inet6.rxopt.bits.rxhlim) { + atomic_inc(&skb->users); + req->af.v6_req.pktopts = skb; + } + req->af.v6_req.iif = sk->bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->bound_dev_if && ipv6_addr_type(&req->af.v6_req.rmt_addr)&IPV6_ADDR_LINKLOCAL) + req->af.v6_req.iif = tcp_v6_iif(skb); + + if (isn == 0) + isn = tcp_v6_init_sequence(sk,skb); + + req->snt_isn = isn; + + if (tcp_v6_send_synack(sk, req, NULL)) + goto drop; + + tcp_v6_synq_add(sk, req); + + return 0; + +drop: + if (req) + tcp_openreq_free(req); + + TCP_INC_STATS_BH(TcpAttemptFails); + return 0; /* don't send reset */ +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct ipv6_pinfo *np; + struct flowi fl; + struct tcp_opt *newtp; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + if (newsk == NULL) + return NULL; np = &newsk->net_pinfo.af_inet6; @@ -1047,7 +1223,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct 
sock *sk, struct sk_buff *skb, /* Charge newly allocated IPv6 socket. Though it is mapped, * it is IPv6 yet. */ +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet6_sock_nr); +#endif MOD_INC_USE_COUNT; /* It is tricky place. Until this moment IPv4 tcp @@ -1061,8 +1239,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, opt = sk->net_pinfo.af_inet6.opt; - if (sk->ack_backlog > sk->max_ack_backlog) - goto out; + if (tcp_acceptq_is_full(sk)) + goto out_overflow; if (sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && opt == NULL && req->af.v6_req.pktopts) { @@ -1090,15 +1268,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (dst->error) goto out; - sk->tp_pinfo.af_tcp.syn_backlog--; - sk->ack_backlog++; - newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) goto out; /* Charge newly allocated IPv6 socket */ +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet6_sock_nr); +#endif MOD_INC_USE_COUNT; ip6_dst_store(newsk, dst, NULL); @@ -1124,6 +1301,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, np->pktoptions = NULL; if (req->af.v6_req.pktopts) { np->pktoptions = skb_clone(req->af.v6_req.pktopts, GFP_ATOMIC); + kfree_skb(req->af.v6_req.pktopts); + req->af.v6_req.pktopts = NULL; if (np->pktoptions) skb_set_owner_r(np->pktoptions, newsk); } @@ -1149,250 +1328,49 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(newsk, dst->pmtu); tcp_initialize_rcv_mss(newsk); + newtp->advmss = dst->advmss; - if (newsk->rcvbuf < (3 * (dst->advmss+60+MAX_HEADER+15))) - newsk->rcvbuf = min ((3 * (dst->advmss+60+MAX_HEADER+15)), sysctl_rmem_max); - if (newsk->sndbuf < (3 * (newtp->mss_clamp+60+MAX_HEADER+15))) - newsk->sndbuf = min ((3 * (newtp->mss_clamp+60+MAX_HEADER+15)), sysctl_wmem_max); + tcp_init_buffer_space(newsk); newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; newsk->rcv_saddr= LOOPBACK4_IPV6; - bh_lock_sock(newsk); - 
__tcp_v6_hash(newsk); tcp_inherit_port(sk, newsk); return newsk; +out_overflow: + NET_INC_STATS_BH(ListenOverflows); out: + NET_INC_STATS_BH(ListenDrops); if (opt && opt != sk->net_pinfo.af_inet6.opt) sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); return NULL; } -static void tcp_v6_send_reset(struct sk_buff *skb) +static int tcp_v6_checksum_init(struct sk_buff *skb) { - struct tcphdr *th = skb->h.th, *t1; - struct sk_buff *buff; - struct flowi fl; - - if (th->rst) - return; - - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) - return; - - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr), GFP_ATOMIC); - if (buff == NULL) - return; - - skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr)); - - t1 = (struct tcphdr *) skb_push(buff,sizeof(struct tcphdr)); - - /* Swap the send and the receive. */ - memset(t1, 0, sizeof(*t1)); - t1->dest = th->source; - t1->source = th->dest; - t1->doff = sizeof(*t1)/4; - t1->rst = 1; - - if(th->ack) { - t1->seq = th->ack_seq; - } else { - t1->ack = 1; - t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin - + skb->len - (th->doff<<2)); - } - - buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - - fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; - fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; - fl.fl6_flowlabel = 0; - - t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, - fl.nl_u.ip6_u.daddr, - sizeof(*t1), IPPROTO_TCP, - buff->csum); - - fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); - fl.uli_u.ports.dport = t1->dest; - fl.uli_u.ports.sport = t1->source; - - /* sk = NULL, but it is safe for now. RST socket required. 
*/ - buff->dst = ip6_route_output(NULL, &fl); - - if (buff->dst->error == 0) { - ip6_xmit(NULL, buff, &fl, NULL); - TCP_INC_STATS_BH(TcpOutSegs); - TCP_INC_STATS_BH(TcpOutRsts); - return; - } - - kfree_skb(buff); -} - -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) -{ - struct tcphdr *th = skb->h.th, *t1; - struct sk_buff *buff; - struct flowi fl; - int tot_len = sizeof(struct tcphdr); - - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr), GFP_ATOMIC); - if (buff == NULL) - return; - - skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr)); - - if (ts) - tot_len += 3*4; - - t1 = (struct tcphdr *) skb_push(buff,tot_len); - - /* Swap the send and the receive. */ - memset(t1, 0, sizeof(*t1)); - t1->dest = th->source; - t1->source = th->dest; - t1->doff = tot_len/4; - t1->seq = htonl(seq); - t1->ack_seq = htonl(ack); - t1->ack = 1; - t1->window = htons(win); - - if (ts) { - u32 *ptr = (u32*)(t1 + 1); - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tcp_time_stamp); - *ptr = htonl(ts); - } - - buff->csum = csum_partial((char *)t1, tot_len, 0); - - fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr; - fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr; - fl.fl6_flowlabel = 0; - - t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, - fl.nl_u.ip6_u.daddr, - tot_len, IPPROTO_TCP, - buff->csum); - - fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); - fl.uli_u.ports.dport = t1->dest; - fl.uli_u.ports.sport = t1->source; - - buff->dst = ip6_route_output(NULL, &fl); - - if (buff->dst->error == 0) { - ip6_xmit(NULL, buff, &fl, NULL); - TCP_INC_STATS_BH(TcpOutSegs); - return; - } - - kfree_skb(buff); -} - -static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; - - tcp_v6_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent); - - tcp_tw_put(tw); -} - -static void tcp_v6_or_send_ack(struct 
sk_buff *skb, struct open_request *req) -{ - tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); -} - -static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, - struct ipv6hdr *ip6h, - struct tcphdr *th, - int iif, - struct open_request **prevp) -{ - struct open_request *req, *prev; - __u16 rport = th->source; - - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. - */ - prev = (struct open_request *) (&tp->syn_wait_queue); - for (req = prev->dl_next; req; req = req->dl_next) { - if (req->rmt_port == rport && - req->class->family == AF_INET6 && - !ipv6_addr_cmp(&req->af.v6_req.rmt_addr, &ip6h->saddr) && - !ipv6_addr_cmp(&req->af.v6_req.loc_addr, &ip6h->daddr) && - (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { - if (req->sk) { - bh_lock_sock(req->sk); - BUG_TRAP(req->sk->lock.users==0); - if (req->sk->state == TCP_CLOSE) { - bh_unlock_sock(req->sk); - prev = req; - continue; - } - } - *prevp = prev; - return req; - } - prev = req; - } - return NULL; -} - - -static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - struct open_request *req, *prev; - struct tcphdr *th = skb->h.th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Find possible connection requests. 
*/ - req = tcp_v6_search_req(tp, skb->nh.ipv6h, th, tcp_v6_iif(skb), &prev); - if (req) - return tcp_check_req(sk, skb, req, prev); - -#if 0 /*def CONFIG_SYN_COOKIES*/ - if (!th->rst && (th->syn || th->ack)) - sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt)); -#endif - return sk; -} - - -static int tcp_v6_csum_verify(struct sk_buff *skb) -{ - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)skb->h.th, skb->len, 0); - case CHECKSUM_HW: + if (skb->ip_summed == CHECKSUM_HW) { if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,skb->csum)) { - printk(KERN_DEBUG "tcp v6 csum failed\n"); - return 1; + NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); + return -1; } skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - /* CHECKSUM_UNNECESSARY */ - }; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->len <= 68) { + if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,csum_partial((char *)skb->h.th, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,0); + } + } return 0; } @@ -1435,13 +1413,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) IP6_INC_STATS_BH(Ip6InDelivers); - /* - * This doesn't check if the socket has enough room for the packet. - * Either process the packet _without_ queueing it and then free it, - * or do the check later. - */ - skb_set_owner_r(skb, sk); - /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we @@ -1461,23 +1432,20 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } if (sk->state == TCP_ESTABLISHED) { /* Fast path */ - /* Ready to move deeper ... 
*/ - if (tcp_v6_csum_verify(skb)) - goto csum_err; + TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); if (users) goto ipv6_pktoptions; return 0; } - if (tcp_v6_csum_verify(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->state == TCP_LISTEN) { - struct sock *nsk; - - nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_hnd_req(sk, skb); if (!nsk) goto discard; @@ -1486,21 +1454,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if(nsk != sk) { - int ret; - int state = nsk->state; - - skb_orphan(skb); - BUG_TRAP(nsk->lock.users == 0); - skb_set_owner_r(skb, nsk); - ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == TCP_SYN_RECV && nsk->state != state) - sk->data_ready(sk, 0); - bh_unlock_sock(nsk); - - if (ret) + if(nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) goto reset; if (users) kfree_skb(skb); @@ -1508,8 +1463,10 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); if (users) goto ipv6_pktoptions; return 0; @@ -1588,6 +1545,9 @@ int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) if (len < sizeof(struct tcphdr)) goto bad_packet; + if (tcp_v6_checksum_init(skb) < 0) + goto bad_packet; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + len - th->doff*4); @@ -1608,9 +1568,10 @@ process: bh_lock_sock(sk); ret = 0; - if (!sk->lock.users) - ret = tcp_v6_do_rcv(sk, skb); - else + if (!sk->lock.users) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1618,7 +1579,7 @@ process: return ret; no_tcp_socket: - if (tcp_v6_csum_verify(skb)) { + if 
(tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); } else { @@ -1639,7 +1600,7 @@ discard_and_relse: goto discard_it; do_time_wait: - if (tcp_v6_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); sock_put(sk); goto discard_it; @@ -1677,7 +1638,7 @@ static int tcp_v6_rebuild_header(struct sock *sk) struct dst_entry *dst; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - dst = sk_dst_check(sk, np->dst_cookie); + dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct flowi fl; @@ -1704,12 +1665,9 @@ static int tcp_v6_rebuild_header(struct sock *sk) } ip6_dst_store(sk, dst, NULL); - return 0; } - err = dst->error; - dst_release(dst); - return err; + return 0; } static int tcp_v6_xmit(struct sk_buff *skb) @@ -1732,7 +1690,7 @@ static int tcp_v6_xmit(struct sk_buff *skb) fl.nl_u.ip6_u.daddr = rt0->addr; } - dst = sk_dst_check(sk, np->dst_cookie); + dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { dst = ip6_route_output(sk, &fl); @@ -1743,11 +1701,10 @@ static int tcp_v6_xmit(struct sk_buff *skb) return -sk->err_soft; } - dst_clone(dst); ip6_dst_store(sk, dst, NULL); } - skb->dst = dst; + skb->dst = dst_clone(dst); /* Restore final destination back after routing done */ fl.nl_u.ip6_u.daddr = &np->daddr; @@ -1767,6 +1724,12 @@ static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_flowinfo = 0; } +static int tcp_v6_remember_stamp(struct sock *sk) +{ + /* Alas, not yet... 
*/ + return 0; +} + static struct tcp_func ipv6_specific = { tcp_v6_xmit, tcp_v6_send_check, @@ -1774,6 +1737,7 @@ static struct tcp_func ipv6_specific = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v6_hash_connecting, + tcp_v6_remember_stamp, sizeof(struct ipv6hdr), ipv6_setsockopt, @@ -1793,6 +1757,7 @@ static struct tcp_func ipv6_mapped = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v4_hash_connecting, + tcp_v4_remember_stamp, sizeof(struct iphdr), ipv6_setsockopt, @@ -1812,6 +1777,7 @@ static int tcp_v6_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); tp->rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -1826,16 +1792,11 @@ static int tcp_v6_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; - - /* Init SYN queue. */ - tcp_synq_init(tp); sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; @@ -1847,27 +1808,19 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; tcp_clear_xmit_timers(sk); - /* - * Cleanup up the write buffer. - */ - - while((skb = __skb_dequeue(&sk->write_queue)) != NULL) - kfree_skb(skb); + /* Cleanup up the write buffer. */ + __skb_queue_purge(&sk->write_queue); - /* - * Cleans up our, hopefuly empty, out_of_order_queue - */ + /* Cleans up our, hopefuly empty, out_of_order_queue. */ + __skb_queue_purge(&tp->out_of_order_queue); - while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) - kfree_skb(skb); + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a locked TCP bind bucket, this only happens if a - * port is allocated for a socket, but it never fully connects. 
- */ + /* Clean up a referenced TCP bind bucket. */ if(sk->prev != NULL) tcp_put_port(sk); @@ -1878,12 +1831,16 @@ static int tcp_v6_destroy_sock(struct sock *sk) static void get_openreq6(struct sock *sk, struct open_request *req, char *tmpbuf, int i) { struct in6_addr *dest, *src; + int ttd = req->expires - jiffies; + + if (ttd < 0) + ttd = 0; src = &req->af.v6_req.loc_addr; dest = &req->af.v6_req.rmt_addr; sprintf(tmpbuf, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p", + "%02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], @@ -1894,7 +1851,7 @@ static void get_openreq6(struct sock *sk, struct open_request *req, char *tmpbuf TCP_SYN_RECV, 0,0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ - (unsigned long)(req->expires - jiffies), + ttd, req->retrans, sk->socket ? sk->socket->inode->i_uid : 0, 0, /* non standard timer */ @@ -1906,7 +1863,7 @@ static void get_tcp6_sock(struct sock *sp, char *tmpbuf, int i) { struct in6_addr *dest, *src; __u16 destp, srcp; - int timer_active, timer_active1, timer_active2; + int timer_active; unsigned long timer_expires; struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; @@ -1914,15 +1871,16 @@ static void get_tcp6_sock(struct sock *sp, char *tmpbuf, int i) src = &sp->net_pinfo.af_inet6.rcv_saddr; destp = ntohs(sp->dport); srcp = ntohs(sp->sport); - timer_active1 = tp->retransmit_timer.prev != NULL; - timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; - if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { + if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) { timer_active = 1; timer_expires = tp->retransmit_timer.expires; + } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) { + timer_active = 4; + timer_expires = 
tp->probe_timer.expires; } - if (timer_active2 && sp->timer.expires < timer_expires) { + if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) { timer_active = 2; timer_expires = sp->timer.expires; } @@ -1931,7 +1889,7 @@ static void get_tcp6_sock(struct sock *sp, char *tmpbuf, int i) sprintf(tmpbuf, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p", + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -1942,28 +1900,27 @@ static void get_tcp6_sock(struct sock *sp, char *tmpbuf, int i) timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? sp->socket->inode->i_uid : 0, - 0, + tp->probes_out, sp->socket ? sp->socket->inode->i_ino : 0, - atomic_read(&sp->refcnt), sp); + atomic_read(&sp->refcnt), sp, + tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong + ); } static void get_timewait6_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) { struct in6_addr *dest, *src; __u16 destp, srcp; - int slot_dist; + int ttd = tw->ttd - jiffies; + + if (ttd < 0) + ttd = 0; dest = &tw->v6_daddr; src = &tw->v6_rcv_saddr; destp = ntohs(tw->dport); srcp = ntohs(tw->sport); - slot_dist = tw->death_slot; - if(slot_dist > tcp_tw_death_row_slot) - slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; - else - slot_dist = tcp_tw_death_row_slot - slot_dist; - sprintf(tmpbuf, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", @@ -1972,8 +1929,8 @@ static void get_timewait6_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - TCP_TIME_WAIT, 0, 0, - 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0, + tw->substate, 0, 0, + 3, ttd, 0, 0, 0, 0, atomic_read(&tw->refcnt), tw); } @@ -2002,6 +1959,8 @@ int tcp6_get_info(char *buffer, 
char **start, off_t offset, int length) tcp_listen_lock(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { struct sock *sk = tcp_listening_hash[i]; + struct tcp_listen_opt *lopt; + int k; for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { struct open_request *req; @@ -2019,24 +1978,29 @@ int tcp6_get_info(char *buffer, char **start, off_t offset, int length) } } - lock_sock(sk); - for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) { - if (req->sk) - continue; - if (req->class->family != PF_INET6) - continue; - pos += LINE_LEN+1; - if (pos < offset) - continue; - get_openreq6(sk, req, tmpbuf, num); - len += sprintf(buffer+len, LINE_FMT, tmpbuf); - if(len >= length) { - release_sock(sk); - tcp_listen_unlock(); - goto out_no_bh; + read_lock_bh(&tp->syn_wait_lock); + lopt = tp->listen_opt; + if (lopt && lopt->qlen != 0) { + for (k=0; k < TCP_SYNQ_HSIZE; k++) { + for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) { + if (req->class->family != PF_INET6) + continue; + pos += LINE_LEN+1; + if (pos < offset) + continue; + get_openreq6(sk, req, tmpbuf, num); + len += sprintf(buffer+len, LINE_FMT, tmpbuf); + if(len >= length) { + read_unlock_bh(&tp->syn_wait_lock); + tcp_listen_unlock(); + goto out_no_bh; + } + } } } - release_sock(sk); + read_unlock_bh(&tp->syn_wait_lock); + + /* Completed requests are in normal socket hash table */ } } tcp_listen_unlock(); @@ -2100,25 +2064,19 @@ struct proto tcpv6_prot = { tcp_v6_connect, /* connect */ tcp_disconnect, /* disconnect */ tcp_accept, /* accept */ - NULL, /* retransmit */ - tcp_write_wakeup, /* write_wakeup */ - tcp_read_wakeup, /* read_wakeup */ - tcp_poll, /* poll */ tcp_ioctl, /* ioctl */ tcp_v6_init_sock, /* init */ tcp_v6_destroy_sock, /* destroy */ tcp_shutdown, /* shutdown */ tcp_setsockopt, /* setsockopt */ tcp_getsockopt, /* getsockopt */ - tcp_v6_sendmsg, /* sendmsg */ + tcp_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ tcp_v6_do_rcv, /* backlog_rcv */ tcp_v6_hash, /* hash */ tcp_unhash, /* unhash */ 
tcp_v6_get_port, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "TCPv6", /* name */ }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3ecc55030a69..a5984354b5a4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.48 2000/01/09 02:19:53 davem Exp $ + * $Id: udp.c,v 1.50 2000/01/18 08:24:24 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -366,54 +366,19 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; } -#ifndef CONFIG_UDP_DELAY_CSUM - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); -#else if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); - } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_irq(&sk->receive_queue.lock); - if (skb == skb_peek(&sk->receive_queue)) { - __skb_unlink(skb, &sk->receive_queue); - clear = 1; - } - spin_unlock_irq(&sk->receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - /* Error for blocking case is chosen to masquerade - as some normal condition. - */ - err = (flags&MSG_DONTWAIT) ? 
-EAGAIN : -EHOSTUNREACH; - udp_stats_in6.UdpInErrors++; - goto out_free; - } + } else if (msg->msg_flags&MSG_TRUNC) { + if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) + goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else { - unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); - - err = 0; - csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err); + err = copy_and_csum_toiovec(msg->msg_iov, skb, sizeof(struct udphdr)); if (err) - goto out_free; - if ((unsigned short)csum_fold(csum)) { - /* Error for blocking case is chosen to masquerade - as some normal condition. - */ - err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; - udp_stats_in6.UdpInErrors++; - goto out_free; - } + goto csum_copy_err; } -#endif if (err) goto out_free; @@ -447,6 +412,27 @@ out_free: skb_free_datagram(sk, skb); out: return err; + +csum_copy_err: + /* Clear queue. */ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->receive_queue.lock); + if (skb == skb_peek(&sk->receive_queue)) { + __skb_unlink(skb, &sk->receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (flags&MSG_DONTWAIT) ? 
-EAGAIN : -EHOSTUNREACH; + UDP6_INC_STATS_USER(UdpInErrors); + goto out_free; } void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr, @@ -474,7 +460,7 @@ void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr, !sk->net_pinfo.af_inet6.recverr) goto out; - if (sk->bsdism && sk->state!=TCP_ESTABLISHED && + if (sk->state!=TCP_ESTABLISHED && !sk->net_pinfo.af_inet6.recverr) goto out; @@ -489,7 +475,7 @@ out: static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { -#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM) +#if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { UDP6_INC_STATS_BH(UdpInErrors); @@ -621,24 +607,12 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len) skb_trim(skb, ulen); -#ifndef CONFIG_UDP_DELAY_CSUM - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char*)uh, ulen, 0); - case CHECKSUM_HW: - if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - printk(KERN_DEBUG "IPv6: udp checksum error\n"); - goto discard; - } - }; -#else if (skb->ip_summed==CHECKSUM_HW) { if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) goto discard; skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); -#endif len = ulen; @@ -651,7 +625,7 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len) } /* Unicast */ - + /* * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. 
@@ -660,11 +634,9 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len) sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); if (sk == NULL) { -#ifdef CONFIG_UDP_DELAY_CSUM if (skb->ip_summed != CHECKSUM_UNNECESSARY && (unsigned short)csum_fold(csum_partial((char*)uh, len, skb->csum))) goto discard; -#endif UDP6_INC_STATS_BH(UdpNoPorts); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); @@ -672,12 +644,6 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len) kfree_skb(skb); return(0); } - if (0/*sk->user_callback && - sk->user_callback(sk->user_data, skb) == 0*/) { - UDP6_INC_STATS_BH(UdpInDatagrams); - sock_put(sk); - return(0); - } /* deliver */ @@ -980,10 +946,6 @@ struct proto udpv6_prot = { udpv6_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ inet6_destroy_sock, /* destroy */ @@ -997,8 +959,6 @@ struct proto udpv6_prot = { udp_v6_hash, /* hash */ udp_v6_unhash, /* unhash */ udp_v6_get_port, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "UDP", /* name */ }; diff --git a/net/khttpd/accept.c b/net/khttpd/accept.c index 9c1912c68f38..97dd21709823 100644 --- a/net/khttpd/accept.c +++ b/net/khttpd/accept.c @@ -63,7 +63,7 @@ int AcceptConnections(const int CPUNR, struct socket *Socket) the allocation of a new socket. 
(Which doesn't seem to be used anyway) */ - if (Socket->sk->tp_pinfo.af_tcp.syn_wait_queue==NULL) + if (Socket->sk->tp_pinfo.af_tcp.accept_queue==NULL) { return 0; } diff --git a/net/khttpd/datasending.c b/net/khttpd/datasending.c index 7a1afc1dfef1..058b308dcfce 100644 --- a/net/khttpd/datasending.c +++ b/net/khttpd/datasending.c @@ -172,7 +172,7 @@ int DataSending(const int CPUNR) if (CurrentRequest->sock->sk->state == TCP_ESTABLISHED || CurrentRequest->sock->sk->state == TCP_CLOSE_WAIT) { - CurrentRequest->sock->sk->nonagle = 0; + CurrentRequest->sock->sk->tp_pinfo.af_tcp.nonagle = 0; tcp_push_pending_frames(CurrentRequest->sock->sk,&(CurrentRequest->sock->sk->tp_pinfo.af_tcp)); } release_sock(CurrentRequest->sock->sk); diff --git a/net/khttpd/sockets.c b/net/khttpd/sockets.c index 8f8b5d250032..0d575abdfb5f 100644 --- a/net/khttpd/sockets.c +++ b/net/khttpd/sockets.c @@ -68,9 +68,10 @@ int StartListening(const int Port) (void)printk(KERN_ERR " daemon is (or was a short time ago) using port %i.\n",Port); return 0; } - + + /* Grrr... setsockopt() does this. */ sock->sk->reuse = 1; - sock->sk->nonagle = 0; + /* Wow!!! 
*/ sock->sk->linger = 1; /* Now, start listening on the socket */ diff --git a/net/khttpd/userspace.c b/net/khttpd/userspace.c index 2acb27ff1856..948d770feb74 100644 --- a/net/khttpd/userspace.c +++ b/net/khttpd/userspace.c @@ -181,8 +181,7 @@ static struct or_calltable Dummy = static int AddSocketToAcceptQueue(struct socket *sock,const int Port) { struct open_request *req; - struct sock *sk; - struct tcp_opt *tp; + struct sock *sk, *nsk; EnterFunction("AddSocketToAcceptQueue"); @@ -196,8 +195,7 @@ static int AddSocketToAcceptQueue(struct socket *sock,const int Port) lock_sock(sk); - if (sk->state != TCP_LISTEN || - sk->ack_backlog > sk->max_ack_backlog) /* To many pending requests */ + if (sk->state != TCP_LISTEN || tcp_acceptq_is_full(sk)) { release_sock(sk); sock_put(sk); @@ -213,20 +211,17 @@ static int AddSocketToAcceptQueue(struct socket *sock,const int Port) return -1; } - req->sk = sock->sk; + nsk = sock->sk; sock->sk = NULL; sock->state = SS_UNCONNECTED; req->class = &Dummy; - write_lock_irq(&req->sk->callback_lock); - req->sk->socket = NULL; - req->sk->sleep = NULL; - write_unlock_irq(&req->sk->callback_lock); - - tp =&(sk->tp_pinfo.af_tcp); - sk->ack_backlog++; + write_lock_irq(&nsk->callback_lock); + nsk->socket = NULL; + nsk->sleep = NULL; + write_unlock_irq(&nsk->callback_lock); - tcp_synq_queue(tp,req); + tcp_acceptq_queue(sk, req, nsk); sk->data_ready(sk, 0); diff --git a/net/khttpd/waitheaders.c b/net/khttpd/waitheaders.c index a7d4b82e0745..47fa1581d557 100644 --- a/net/khttpd/waitheaders.c +++ b/net/khttpd/waitheaders.c @@ -256,7 +256,7 @@ static int DecodeHeader(const int CPUNR, struct http_request *Request) } else /* Normal Case */ { - Request->sock->sk->nonagle = 2; /* this is TCP_CORK */ + Request->sock->sk->tp_pinfo.af_tcp.nonagle = 2; /* this is TCP_CORK */ if (Request->HTTPVER!=9) /* HTTP/0.9 doesn't allow a header */ SendHTTPHeader(Request); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 
9c472a93792d..0136d15c2825 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -48,9 +48,9 @@ struct netlink_opt { - pid_t pid; + u32 pid; unsigned groups; - pid_t dst_pid; + u32 dst_pid; unsigned dst_groups; unsigned long state; int (*handler)(int unit, struct sk_buff *skb); @@ -95,6 +95,12 @@ static void netlink_sock_destruct(struct sock *sk) #endif } +/* This lock without TASK_EXCLUSIVE is good on UP and it is _very_ bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines. + */ + static void netlink_table_grab(void) { write_lock_bh(&nl_table_lock); @@ -102,9 +108,9 @@ static void netlink_table_grab(void) if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&nl_table_wait, &wait); + add_wait_queue_exclusive(&nl_table_wait, &wait); for(;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_bh(&nl_table_lock); @@ -120,6 +126,7 @@ static void netlink_table_grab(void) static __inline__ void netlink_table_ungrab(void) { write_unlock_bh(&nl_table_lock); + wake_up(&nl_table_wait); } static __inline__ void @@ -254,14 +261,9 @@ static int netlink_release(struct socket *sock) /* OK. 
Socket is unlinked, and, therefore, no new packets will arrive */ - write_lock_irq(&sk->callback_lock); - sk->dead = 1; - sk->socket = NULL; + sock_orphan(sk); sock->sk = NULL; - wake_up_interruptible(sk->sleep); - sk->sleep = NULL; - wake_up_interruptible(&sk->protinfo.af_netlink->wait); - write_unlock_irq(&sk->callback_lock); + wake_up_interruptible_all(&sk->protinfo.af_netlink->wait); skb_queue_purge(&sk->write_queue); @@ -391,8 +393,11 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock struct sock *sk; int len = skb->len; int protocol = ssk->protocol; + long timeo; DECLARE_WAITQUEUE(wait, current); + timeo = sock_sndtimeo(ssk, nonblock); + retry: sk = netlink_lookup(protocol, pid); if (sk == NULL) @@ -409,7 +414,7 @@ retry: if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf || test_bit(0, &sk->protinfo.af_netlink->state)) { - if (nonblock) { + if (!timeo) { if (ssk->protinfo.af_netlink->pid == 0) netlink_overrun(sk); sock_put(sk); @@ -422,9 +427,8 @@ retry: if ((atomic_read(&sk->rmem_alloc) > sk->rcvbuf || test_bit(0, &sk->protinfo.af_netlink->state)) && - !signal_pending(current) && !sk->dead) - schedule(); + timeo = schedule_timeout(timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&sk->protinfo.af_netlink->wait, &wait); @@ -554,9 +558,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; - if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE)) - return -EINVAL; - if (msg->msg_namelen) { if (addr->nl_family != AF_NETLINK) return -EINVAL; diff --git a/net/netsyms.c b/net/netsyms.c index 9891d5cb0775..993f728f84e6 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -61,7 +61,6 @@ extern struct net_proto_family inet_family_ops; #include #include -extern int tcp_tw_death_row_slot; extern int sysctl_local_port_range[2]; extern int tcp_port_rover; extern int udp_port_rover; @@ -277,14 +276,15 @@ EXPORT_SYMBOL(inet_release); 
EXPORT_SYMBOL(inet_stream_connect); EXPORT_SYMBOL(inet_dgram_connect); EXPORT_SYMBOL(inet_accept); -EXPORT_SYMBOL(inet_poll); EXPORT_SYMBOL(inet_listen); EXPORT_SYMBOL(inet_shutdown); EXPORT_SYMBOL(inet_setsockopt); EXPORT_SYMBOL(inet_getsockopt); EXPORT_SYMBOL(inet_sendmsg); EXPORT_SYMBOL(inet_recvmsg); +#ifdef INET_REFCNT_DEBUG EXPORT_SYMBOL(inet_sock_nr); +#endif EXPORT_SYMBOL(inet_sock_destruct); EXPORT_SYMBOL(inet_sock_release); @@ -307,7 +307,6 @@ EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(copy_and_csum_toiovec); -EXPORT_SYMBOL(tcp_keepalive_timer); EXPORT_SYMBOL(tcp_v4_lookup_listener); /* UDP/TCP exported functions for TCPv6 */ EXPORT_SYMBOL(udp_ioctl); @@ -318,7 +317,6 @@ EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_accept); EXPORT_SYMBOL(tcp_write_wakeup); -EXPORT_SYMBOL(tcp_read_wakeup); EXPORT_SYMBOL(tcp_write_space); EXPORT_SYMBOL(tcp_poll); EXPORT_SYMBOL(tcp_ioctl); @@ -328,19 +326,18 @@ EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_send_synack); EXPORT_SYMBOL(tcp_check_req); +EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_reset_xmit_timer); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_init_xmit_timers); EXPORT_SYMBOL(tcp_clear_xmit_timers); -EXPORT_SYMBOL(tcp_slt_array); -EXPORT_SYMBOL(__tcp_inc_slow_timer); EXPORT_SYMBOL(tcp_statistics); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_timewait_state_process); EXPORT_SYMBOL(tcp_timewait_cachep); EXPORT_SYMBOL(tcp_timewait_kill); -EXPORT_SYMBOL(tcp_do_sendmsg); +EXPORT_SYMBOL(tcp_sendmsg); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); @@ -362,8 +359,9 @@ EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_transmit_skb); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); -EXPORT_SYMBOL(tcp_tw_death_row_slot); 
EXPORT_SYMBOL(tcp_tw_deschedule); +EXPORT_SYMBOL(tcp_delete_keepalive_timer); +EXPORT_SYMBOL(tcp_reset_keepalive_timer); EXPORT_SYMBOL(sysctl_local_port_range); EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(udp_port_rover); @@ -375,7 +373,12 @@ EXPORT_SYMBOL(xrlim_allow); EXPORT_SYMBOL(tcp_write_xmit); EXPORT_SYMBOL(dev_loopback_xmit); +EXPORT_SYMBOL(tcp_v4_remember_stamp); + +extern int sysctl_tcp_tw_recycle; + #ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_tcp_tw_recycle); EXPORT_SYMBOL(sysctl_max_syn_backlog); #endif @@ -489,7 +492,9 @@ EXPORT_SYMBOL(eth_type_trans); EXPORT_SYMBOL(fddi_type_trans); EXPORT_SYMBOL(fddi_setup); #endif /* CONFIG_FDDI */ +#if 0 EXPORT_SYMBOL(eth_copy_and_sum); +#endif EXPORT_SYMBOL(alloc_skb); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(skb_clone); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index eec4d92d752c..b18f0f86ebe7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -5,7 +5,7 @@ * * PACKET - implements raw packet sockets. * - * Version: $Id: af_packet.c,v 1.26 1999/12/20 05:20:02 davem Exp $ + * Version: $Id: af_packet.c,v 1.27 2000/01/18 08:24:27 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -789,13 +789,8 @@ static int packet_release(struct socket *sock) * Now the socket is dead. No more input will appear. */ - write_lock_irq(&sk->callback_lock); + sock_orphan(sk); sock->sk = NULL; - sk->socket = NULL; - sk->dead = 1; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); - /* Purge queues */ diff --git a/net/socket.c b/net/socket.c index 4b4bc45b964b..5d91fb355e0b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -176,7 +176,10 @@ static __inline__ void net_family_read_unlock(void) * Statistics counters of the socket lists */ -static int sockets_in_use = 0; +static union { + int counter; + char __pad[SMP_CACHE_BYTES]; +} sockets_in_use[NR_CPUS] __cacheline_aligned = {{0}}; /* * Support routines. 
Move socket addresses back and forth across the kernel/user @@ -261,23 +264,14 @@ static int sock_map_fd(struct socket *sock) goto out; } - lock_kernel(); file->f_dentry = d_alloc_root(sock->inode); if (!file->f_dentry) { - unlock_kernel(); put_filp(file); put_unused_fd(fd); fd = -ENOMEM; goto out; } - /* - * The socket maintains a reference to the inode, so we - * have to increment the count. - */ - sock->inode->i_count++; - unlock_kernel(); - file->f_op = &socket_file_ops; file->f_mode = 3; file->f_flags = O_RDWR; @@ -360,7 +354,7 @@ struct socket *sock_alloc(void) sock->sk = NULL; sock->file = NULL; - sockets_in_use++; + sockets_in_use[smp_processor_id()].counter++; return sock; } @@ -383,9 +377,8 @@ void sock_release(struct socket *sock) if (sock->fasync_list) printk(KERN_ERR "sock_release: fasync list not empty!\n"); - --sockets_in_use; /* Bookkeeping.. */ + sockets_in_use[smp_processor_id()].counter--; sock->file=NULL; - iput(sock->inode); } int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size) @@ -889,8 +882,6 @@ asmlinkage long sys_listen(int fd, int backlog) int err; if ((sock = sockfd_lookup(fd, &err)) != NULL) { - if ((unsigned) backlog == 0) /* BSDism */ - backlog = 1; if ((unsigned) backlog > SOMAXCONN) backlog = SOMAXCONN; err=sock->ops->listen(sock, backlog); @@ -943,6 +934,9 @@ asmlinkage long sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_a goto out_release; } + /* File flags are inherited via accept(). It looks silly, but we + * have to be compatible with other OSes. 
+ */ if ((err = sock_map_fd(newsock)) < 0) goto out_release; @@ -1119,7 +1113,7 @@ asmlinkage long sys_recvfrom(int fd, void * ubuf, size_t size, unsigned flags, flags |= MSG_DONTWAIT; err=sock_recvmsg(sock, &msg, size, flags); - if(err >= 0 && addr != NULL) + if(err >= 0 && addr != NULL && msg.msg_namelen) { err2=move_addr_to_user(address, msg.msg_namelen, addr, addr_len); if(err2<0) @@ -1341,7 +1335,7 @@ asmlinkage long sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) goto out_freeiov; len = err; - if (uaddr != NULL) { + if (uaddr != NULL && msg_sys.msg_namelen) { err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, uaddr_len); if (err < 0) goto out_freeiov; @@ -1595,7 +1589,17 @@ void __init sock_init(void) int socket_get_info(char *buffer, char **start, off_t offset, int length) { - int len = sprintf(buffer, "sockets: used %d\n", sockets_in_use); + int len, cpu; + int counter = 0; + + for (cpu=0; cpu= len) { *start = buffer; @@ -1605,5 +1609,7 @@ int socket_get_info(char *buffer, char **start, off_t offset, int length) len -= offset; if (len > length) len = length; + if (len < 0) + len = 0; return len; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e6b0eb50ce4d..7b3c63e87d28 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: af_unix.c,v 1.87 1999/12/09 00:54:25 davem Exp $ + * Version: $Id: af_unix.c,v 1.88 2000/01/18 08:24:28 davem Exp $ * * Fixes: * Linus Torvalds : Assorted bug cures. 
@@ -337,10 +337,7 @@ static int unix_release_sock (unix_socket *sk, int embrion) /* Clear state */ unix_state_wlock(sk); - write_lock(&sk->callback_lock); - sk->dead = 1; - sk->socket = NULL; - write_unlock(&sk->callback_lock); + sock_orphan(sk); sk->shutdown = SHUTDOWN_MASK; dentry = sk->protinfo.af_unix.dentry; sk->protinfo.af_unix.dentry=NULL; @@ -348,8 +345,7 @@ static int unix_release_sock (unix_socket *sk, int embrion) sk->state = TCP_CLOSE; unix_state_wunlock(sk); - wake_up_interruptible(sk->sleep); - wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); skpair=unix_peer(sk); @@ -360,7 +356,8 @@ static int unix_release_sock (unix_socket *sk, int embrion) if (!skb_queue_empty(&sk->receive_queue) || embrion) skpair->err = ECONNRESET; unix_state_wunlock(skpair); - sk->data_ready(skpair,0); + sk->state_change(skpair); + sock_wake_async(sk->socket,1,POLL_HUP); } sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; @@ -418,7 +415,7 @@ static int unix_listen(struct socket *sock, int backlog) if (sk->state != TCP_CLOSE && sk->state != TCP_LISTEN) goto out_unlock; if (backlog > sk->max_ack_backlog) - wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); sk->max_ack_backlog=backlog; sk->state=TCP_LISTEN; sock->flags |= SO_ACCEPTCON; @@ -740,26 +737,26 @@ out: return err; } -static void unix_wait_for_peer(unix_socket *other) +static long unix_wait_for_peer(unix_socket *other, long timeo) { int sched; DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&other->protinfo.af_unix.peer_wait, &wait); + __set_current_state(TASK_INTERRUPTIBLE|TASK_EXCLUSIVE); + add_wait_queue_exclusive(&other->protinfo.af_unix.peer_wait, &wait); sched = (!other->dead && !(other->shutdown&RCV_SHUTDOWN) && - !signal_pending(current) && - skb_queue_len(&other->receive_queue) >= other->max_ack_backlog); + 
skb_queue_len(&other->receive_queue) > other->max_ack_backlog); unix_state_runlock(other); if (sched) - schedule(); + timeo = schedule_timeout(timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&other->protinfo.af_unix.peer_wait, &wait); + return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, @@ -773,6 +770,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, unsigned hash; int st; int err; + long timeo; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -783,6 +781,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, (err = unix_autobind(sock)) != 0) goto out; + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + /* First of all allocate resources. If we will make it after state is locked, we will have to recheck all again in any case. @@ -820,12 +820,12 @@ restart: if (other->state != TCP_LISTEN) goto out_unlock; - if (skb_queue_len(&other->receive_queue) >= other->max_ack_backlog) { + if (skb_queue_len(&other->receive_queue) > other->max_ack_backlog) { err = -EAGAIN; - if (flags & O_NONBLOCK) + if (!timeo) goto out_unlock; - unix_wait_for_peer(other); + timeo = unix_wait_for_peer(other, timeo); err = -ERESTARTSYS; if (signal_pending(current)) @@ -959,8 +959,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) if (sk->state!=TCP_LISTEN) goto out; - /* If socket state is TCP_LISTEN it cannot change, - so that no locks are necessary. + /* If socket state is TCP_LISTEN it cannot change (for now...), + * so that no locks are necessary. 
*/ skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); @@ -968,16 +968,13 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) goto out; tsk = skb->sk; - if (skb_queue_len(&sk->receive_queue) <= sk->max_ack_backlog/2) - wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); skb_free_datagram(sk, skb); + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); /* attach accepted sock to socket */ unix_state_wlock(tsk); newsock->state = SS_CONNECTED; - newsock->sk = tsk; - tsk->sleep = &newsock->wait; - tsk->socket = newsock; + sock_graft(tsk, newsock); unix_state_wunlock(tsk); return 0; @@ -1069,15 +1066,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, int err; unsigned hash; struct sk_buff *skb; + long timeo; err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; - err = -EINVAL; - if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out; - if (msg->msg_namelen) { err = unix_mkname(sunaddr, msg->msg_namelen, &hash); if (err < 0) @@ -1095,6 +1089,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, (err = unix_autobind(sock)) != 0) goto out; + skb = sock_alloc_send_skb(sk, len, 0, msg->msg_flags&MSG_DONTWAIT, &err); if (skb==NULL) goto out; @@ -1108,6 +1103,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (err) goto out_free; + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + restart: if (!other) { err = -ECONNRESET; @@ -1151,20 +1148,13 @@ restart: if (other->shutdown&RCV_SHUTDOWN) goto out_unlock; - if (0/*other->user_callback && - other->user_callback(other->user_data, skb) == 0*/) { - unix_state_runlock(other); - sock_put(other); - return len; - } - - if (skb_queue_len(&other->receive_queue) >= other->max_ack_backlog) { - if (msg->msg_flags & MSG_DONTWAIT) { + if (skb_queue_len(&other->receive_queue) > other->max_ack_backlog) { + if (!timeo) { err = -EAGAIN; goto out_unlock; } - unix_wait_for_peer(other); + 
timeo = unix_wait_for_peer(other, timeo); err = -ERESTARTSYS; if (signal_pending(current)) @@ -1205,10 +1195,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, if (msg->msg_flags&MSG_OOB) goto out_err; - err = -EINVAL; - if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out_err; - if (msg->msg_namelen) { err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP); goto out_err; @@ -1329,8 +1315,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, if (!skb) goto out; - if (skb_queue_len(&sk->receive_queue) <= sk->max_ack_backlog/2) - wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); if (msg->msg_name) unix_copy_addr(msg, skb->sk); @@ -1380,7 +1365,7 @@ out: * Sleep until data has arrive. But check for races.. */ -static void unix_stream_data_wait(unix_socket * sk) +static long unix_stream_data_wait(unix_socket * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); @@ -1394,12 +1379,13 @@ static void unix_stream_data_wait(unix_socket * sk) if (skb_queue_len(&sk->receive_queue) || sk->err || (sk->shutdown & RCV_SHUTDOWN) || - signal_pending(current)) + signal_pending(current) || + !timeo) break; sk->socket->flags |= SO_WAITDATA; unix_state_runlock(sk); - schedule(); + timeo = schedule_timeout(timeo); unix_state_rlock(sk); sk->socket->flags &= ~SO_WAITDATA; } @@ -1407,6 +1393,7 @@ static void unix_stream_data_wait(unix_socket * sk) __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); unix_state_runlock(sk); + return timeo; } @@ -1415,12 +1402,12 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size int flags, struct scm_cookie *scm) { struct sock *sk = sock->sk; - int noblock = flags & MSG_DONTWAIT; struct sockaddr_un *sunaddr=msg->msg_name; int copied = 0; int check_creds = 0; - int target = 1; + int target; int err = 0; + long timeo; err = -EINVAL; if (sk->state != TCP_ESTABLISHED) @@ 
-1430,9 +1417,8 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size if (flags&MSG_OOB) goto out; - if (flags&MSG_WAITALL) - target = size; - + target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); msg->msg_namelen = 0; @@ -1462,11 +1448,11 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size if (sk->shutdown & RCV_SHUTDOWN) break; err = -EAGAIN; - if (noblock) + if (!timeo) break; up(&sk->protinfo.af_unix.readsem); - unix_stream_data_wait(sk); + timeo = unix_stream_data_wait(sk, timeo); if (signal_pending(current)) { err = -ERESTARTSYS; @@ -1569,10 +1555,9 @@ static int unix_shutdown(struct socket *sock, int mode) unix_state_wlock(other); other->shutdown |= peer_mode; unix_state_wunlock(other); + other->state_change(other); if (peer_mode&RCV_SHUTDOWN) - other->data_ready(other,0); - else - other->state_change(other); + sock_wake_async(sk->socket,1,POLL_HUP); } if (other) sock_put(other); @@ -1589,14 +1574,11 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) switch(cmd) { - - case TIOCOUTQ: - amount = sk->sndbuf - atomic_read(&sk->wmem_alloc); - if(amount<0) - amount=0; + case SIOCOUTQ: + amount = atomic_read(&sk->wmem_alloc); err = put_user(amount, (int *)arg); break; - case TIOCINQ: + case SIOCINQ: { struct sk_buff *skb; if (sk->state==TCP_LISTEN) { @@ -1630,11 +1612,11 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl /* exceptional events? */ if (sk->err) mask |= POLLERR; - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? 
*/ - if (!skb_queue_empty(&sk->receive_queue)) + if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index ae365edaa81e..af7191563064 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -9,15 +9,16 @@ * o Logical connection management (switched virtual circuits) * o Protocol encapsulation/decapsulation * -* Author: Gene Kozin +* Author: Gideon Hack * -* Copyright: (c) 1995-1997 Sangoma Technologies Inc. +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * ============================================================================ +* Oct 01, 1999 Gideon Hack Update for s514 PCI card * Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) * Jan 16, 1997 Gene Kozin router_devlist made public * Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 @@ -31,8 +32,10 @@ * kernel memory and copy configuration data to * kernel space (for big firmwares) * May 19, 1999 Arnaldo Melo __init in wanrouter_init +* Jun 02, 1999 Gideon Hack Updates for Linux 2.0.X and 2.2.X kernels. *****************************************************************************/ +#include #include #include /* offsetof(), etc. */ #include /* return codes */ @@ -48,7 +51,11 @@ #include /* WAN router API definitions */ #include /* __init et al. 
*/ -/****** Defines and Macros **************************************************/ + + +/* + * Defines and Macros + */ #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) @@ -57,48 +64,49 @@ #define max(a,b) (((a)>(b))?(a):(b)) #endif -/****** Function Prototypes *************************************************/ - /* - * Kernel loadable module interface. + * Function Prototypes */ +/* + * Kernel loadable module interface. + */ #ifdef MODULE int init_module (void); void cleanup_module (void); #endif -/* +/* * WAN device IOCTL handlers */ -static int device_setup (wan_device_t* wandev, wandev_conf_t* u_conf); -static int device_stat (wan_device_t* wandev, wandev_stat_t* u_stat); -static int device_shutdown (wan_device_t* wandev); -static int device_new_if (wan_device_t* wandev, wanif_conf_t* u_conf); -static int device_del_if (wan_device_t* wandev, char* u_name); - -/* +static int device_setup(wan_device_t *wandev, wandev_conf_t *u_conf); +static int device_stat(wan_device_t *wandev, wandev_stat_t *u_stat); +static int device_shutdown(wan_device_t *wandev); +static int device_new_if(wan_device_t *wandev, wanif_conf_t *u_conf); +static int device_del_if(wan_device_t *wandev, char *u_name); + +/* * Miscellaneous */ -static wan_device_t* find_device (char* name); -static int delete_interface (wan_device_t* wandev, char* name, int forse); +static wan_device_t *find_device (char *name); +static int delete_interface (wan_device_t *wandev, char *name, int force); /* * Global Data */ static char fullname[] = "WAN Router"; -static char copyright[] = "(c) 1995-1997 Sangoma Technologies Inc."; +static char copyright[] = "(c) 1995-1999 Sangoma Technologies Inc."; static char modname[] = ROUTER_NAME; /* short module name */ -wan_device_t * router_devlist = NULL; /* list of registered devices */ -static int devcnt = 0; +wan_device_t* router_devlist = NULL; /* list of registered devices */ +static int devcnt = 0; -/* - * Organizationally Unique Identifiers for 
encapsulation/decapsulation +/* + * Organizationally Unique Identifiers for encapsulation/decapsulation */ - + static unsigned char oui_ether[] = { 0x00, 0x00, 0x00 }; #if 0 static unsigned char oui_802_2[] = { 0x00, 0x80, 0xC2 }; @@ -115,8 +123,7 @@ int __init wanrouter_init(void) fullname, ROUTER_VERSION, ROUTER_RELEASE, copyright); err = wanrouter_proc_init(); if (err) - printk(KERN_ERR "%s: can't create entry in proc filesystem!\n", - modname); + printk(KERN_ERR "%s: can't create entry in proc filesystem!\n", modname); /* * Initialise compiled in boards @@ -138,14 +145,14 @@ int __init wanrouter_init(void) */ /* - * Module 'insert' entry point. - * o print announcement - * o initialize static data - * o create /proc/net/router directory and static entries + * Module 'insert' entry point. + * o print announcement + * o initialize static data + * o create /proc/net/router directory and static entries * - * Return: 0 Ok + * Return: 0 Ok * < 0 error. - * Context: process + * Context: process */ int init_module (void) @@ -161,10 +168,10 @@ int init_module (void) } /* - * Module 'remove' entry point. - * o delete /proc/net/router directory and static entries. + * Module 'remove' entry point. + * o delete /proc/net/router directory and static entries. */ - + void cleanup_module (void) { wanrouter_proc_cleanup(); @@ -173,33 +180,34 @@ void cleanup_module (void) #endif /* - * Kernel APIs + * Kernel APIs */ /* - * Register WAN device. - * o verify device credentials - * o create an entry for the device in the /proc/net/router directory - * o initialize internally maintained fields of the wan_device structure - * o link device data space to a singly-linked list - * o if it's the first device, then start kernel 'thread' - * o increment module use count + * Register WAN device. 
+ * o verify device credentials + * o create an entry for the device in the /proc/net/router directory + * o initialize internally maintained fields of the wan_device structure + * o link device data space to a singly-linked list + * o if it's the first device, then start kernel 'thread' + * o increment module use count * - * Return: - * 0 Ok - * < 0 error. + * Return: + * 0 Ok + * < 0 error. * - * Context: process + * Context: process */ -int register_wan_device(wan_device_t* wandev) + +int register_wan_device(wan_device_t *wandev) { int err, namelen; if ((wandev == NULL) || (wandev->magic != ROUTER_MAGIC) || (wandev->name == NULL)) return -EINVAL; - + namelen = strlen(wandev->name); if (!namelen || (namelen > WAN_DRVNAME_SZ)) return -EINVAL; @@ -215,12 +223,10 @@ int register_wan_device(wan_device_t* wandev) * Register /proc directory entry */ err = wanrouter_proc_add(wandev); - if (err) - { + if (err) { printk(KERN_ERR "%s: can't create /proc/net/router/%s entry!\n", - modname, wandev->name) - ; + modname, wandev->name); return err; } @@ -250,8 +256,8 @@ int register_wan_device(wan_device_t* wandev) * Context: process */ - -int unregister_wan_device(char* name) + +int unregister_wan_device(char *name) { wan_device_t *wandev, *prev; @@ -269,8 +275,7 @@ int unregister_wan_device(char* name) printk(KERN_INFO "%s: unregistering WAN device %s\n", modname, name); #endif - if (wandev->state != WAN_UNCONFIGURED) - { + if (wandev->state != WAN_UNCONFIGURED) { while(wandev->dev) delete_interface(wandev, wandev->dev->name, 1); if (wandev->shutdown) @@ -359,7 +364,6 @@ unsigned short wanrouter_type_trans (struct sk_buff* skb, struct net_device* dev "on interface %s!\n", modname, skb->data[cnt+1], skb->data[cnt+2], skb->data[cnt+3], dev->name); - ; return 0; } ethertype = *((unsigned short*)&skb->data[cnt+4]); @@ -371,8 +375,7 @@ unsigned short wanrouter_type_trans (struct sk_buff* skb, struct net_device* dev default: printk(KERN_INFO "%s: unsupported NLPID 0x%02X on 
interface %s!\n", - modname, skb->data[cnt], dev->name) - ; + modname, skb->data[cnt], dev->name); return 0; } skb->protocol = ethertype; @@ -382,18 +385,19 @@ unsigned short wanrouter_type_trans (struct sk_buff* skb, struct net_device* dev return ethertype; } + /* * WAN device IOCTL. * o find WAN device associated with this node * o execute requested action or pass command to the device driver */ -int wanrouter_ioctl(struct inode* inode, struct file* file, +int wanrouter_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { int err = 0; - struct proc_dir_entry* dent; - wan_device_t* wandev; + struct proc_dir_entry *dent; + wan_device_t *wandev; if (!capable(CAP_NET_ADMIN)){ return -EPERM; @@ -410,8 +414,7 @@ int wanrouter_ioctl(struct inode* inode, struct file* file, if (wandev->magic != ROUTER_MAGIC) return -EINVAL; - switch (cmd) - { + switch (cmd) { case ROUTER_SETUP: err = device_setup(wandev, (void*)arg); break; @@ -439,8 +442,7 @@ int wanrouter_ioctl(struct inode* inode, struct file* file, if ((cmd >= ROUTER_USER) && (cmd <= ROUTER_USER_MAX) && wandev->ioctl) - err = wandev->ioctl(wandev, cmd, arg) - ; + err = wandev->ioctl(wandev, cmd, arg); else err = -EINVAL; } return err; @@ -458,51 +460,52 @@ int wanrouter_ioctl(struct inode* inode, struct file* file, * o call driver's setup() entry point */ -static int device_setup (wan_device_t* wandev, wandev_conf_t* u_conf) +static int device_setup (wan_device_t *wandev, wandev_conf_t *u_conf) { - void* data; + void *data = NULL; wandev_conf_t *conf; - int err= -EINVAL; + int err = -EINVAL; if (wandev->setup == NULL) /* Nothing to do ? 
*/ return 0; - + conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL); if (conf == NULL) return -ENOBUFS; - - if(copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) - { + + if(copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) { kfree(conf); return -EFAULT; } - if (conf->magic != ROUTER_MAGIC) - goto bail; + if (conf->magic != ROUTER_MAGIC) { + kfree(conf); + return -EINVAL; + } - if (conf->data_size && conf->data) - { - if(conf->data_size > 128000 || conf->data_size < 0){ - goto bail; + if (conf->data_size && conf->data) { + if(conf->data_size > 128000 || conf->data_size < 0) { + kfree(conf); + return -EINVAL;; } + data = vmalloc(conf->data_size); - if (data) - { - if(!copy_from_user(data, conf->data, conf->data_size)) - { + if (data) { + if(!copy_from_user(data, conf->data, conf->data_size)){ conf->data=data; err = wandev->setup(wandev,conf); } else err = -EFAULT; } - else + else err = -ENOBUFS; - + if (data) vfree(data); + } -bail: + kfree(conf); return err; } @@ -537,7 +540,7 @@ static int device_shutdown (wan_device_t* wandev) * Get WAN device status & statistics. */ -static int device_stat (wan_device_t* wandev, wandev_stat_t* u_stat) +static int device_stat (wan_device_t *wandev, wandev_stat_t *u_stat) { wandev_stat_t stat; @@ -553,6 +556,7 @@ static int device_stat (wan_device_t* wandev, wandev_stat_t* u_stat) if(copy_to_user(u_stat, &stat, sizeof(stat))) return -EFAULT; + return 0; } @@ -569,7 +573,7 @@ static int device_stat (wan_device_t* wandev, wandev_stat_t* u_stat) static int device_new_if (wan_device_t* wandev, wanif_conf_t* u_conf) { wanif_conf_t conf; - struct net_device* dev; + struct net_device *dev; int err; if ((wandev->state == WAN_UNCONFIGURED) || (wandev->new_if == NULL)) @@ -587,8 +591,7 @@ static int device_new_if (wan_device_t* wandev, wanif_conf_t* u_conf) memset(dev, 0, sizeof(struct net_device)); err = wandev->new_if(wandev, dev, &conf); - if (!err) - { + if (!err) { /* Register network interface. 
This will invoke init() * function supplied by the driver. If device registered * successfully, add it to the interface list. @@ -598,15 +601,13 @@ static int device_new_if (wan_device_t* wandev, wanif_conf_t* u_conf) else if (dev_get(dev->name)) err = -EEXIST; /* name already exists */ - else - { + else { #ifdef WANDEBUG printk(KERN_INFO "%s: registering interface %s...\n", modname, dev->name); #endif err = register_netdev(dev); - if (!err) - { + if (!err) { cli(); /***** critical section start *****/ dev->slave = wandev->dev; wandev->dev = dev; @@ -622,25 +623,28 @@ static int device_new_if (wan_device_t* wandev, wanif_conf_t* u_conf) return err; } + /* * Delete WAN logical channel. * o verify user address space * o copy configuration data to kernel address space */ -static int device_del_if (wan_device_t* wandev, char* u_name) +static int device_del_if (wan_device_t *wandev, char *u_name) { char name[WAN_IFNAME_SZ + 1]; if (wandev->state == WAN_UNCONFIGURED) return -ENODEV; - + memset(name, 0, sizeof(name)); + if(copy_from_user(name, u_name, WAN_IFNAME_SZ)) return -EFAULT; return delete_interface(wandev, name, 0); } + /* * Miscellaneous Functions */ @@ -650,9 +654,9 @@ static int device_del_if (wan_device_t* wandev, char* u_name) * Return pointer to the WAN device data space or NULL if device not found. */ -static wan_device_t* find_device (char* name) +static wan_device_t *find_device(char *name) { - wan_device_t* wandev; + wan_device_t *wandev; for (wandev = router_devlist;wandev && strcmp(wandev->name, name); wandev = wandev->next); @@ -676,7 +680,7 @@ static wan_device_t* find_device (char* name) * sure that opened interfaces are not removed! 
*/ -static int delete_interface (wan_device_t* wandev, char* name, int force) +static int delete_interface (wan_device_t *wandev, char *name, int force) { struct net_device *dev, *prev; @@ -687,16 +691,16 @@ static int delete_interface (wan_device_t* wandev, char* name, int force) if (dev == NULL) return -ENODEV; /* interface not found */ - if (dev->start) - { - if (force) - { + if (dev->start) { + if (force) { printk(KERN_WARNING - "%s: deleting opened interface %s!\n",modname, name); + "%s: deleting opened interface %s!\n", + modname, name); } else return -EBUSY; /* interface in use */ } + if (wandev->del_if) wandev->del_if(wandev, dev); @@ -708,7 +712,7 @@ static int delete_interface (wan_device_t* wandev, char* name, int force) --wandev->ndev; sti(); /****** critical section end ******/ - printk("Unregistering '%s'\n", dev->name); + printk("Unregistering '%s'\n", dev->name); unregister_netdev(dev); kfree(dev); return 0; @@ -722,4 +726,3 @@ EXPORT_SYMBOL(wanrouter_type_trans); /* * End */ - diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index f895fc58b846..91696d57e925 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -4,21 +4,23 @@ * This module is completely hardware-independent and provides * access to the router using Linux /proc filesystem. * -* Author: Gene Kozin +* Author: Gideon Hack * -* Copyright: (c) 1995-1997 Sangoma Technologies Inc. +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * ============================================================================ +* Jun 02, 1999 Gideon Hack Updates for Linux 2.2.X kernels. * Jun 29, 1997 Alan Cox Merged with 1.0.3 vendor code * Jan 29, 1997 Gene Kozin v1.0.1. 
Implemented /proc read routines * Jan 30, 1997 Alan Cox Hacked around for 2.1 * Dec 13, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) *****************************************************************************/ +#include #include #include /* offsetof(), etc. */ #include /* return codes */ @@ -36,6 +38,8 @@ /****** Defines and Macros **************************************************/ +#define PROC_STATS_FORMAT "%30s: %12lu\n" + #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif @@ -45,11 +49,12 @@ #define PROC_BUFSZ 4000 /* buffer size for printing proc info */ + /****** Data Types **********************************************************/ typedef struct wan_stat_entry { - struct wan_stat_entry * next; + struct wan_stat_entry *next; char *description; /* description string */ void *data; /* -> data */ unsigned data_type; /* data type */ @@ -83,7 +88,6 @@ static int wandev_get_info(char* buf, char** start, off_t offs, int len); /* * Generic /proc/net/router/ file and inode operations */ - static struct file_operations router_fops = { NULL, /* lseek */ @@ -266,7 +270,7 @@ static int router_proc_perms (struct inode* inode, int op) /* * Read router proc directory entry. - * This is universal routine for reading all entries in /proc/net/router + * This is universal routine for reading all entries in /proc/net/wanrouter * directory. Each directory entry contains a pointer to the 'method' for * preparing data for that entry. 
* o verify arguments @@ -300,8 +304,7 @@ static ssize_t router_proc_read(struct file* file, char* buf, size_t count, pos = dent->get_info(page, dent->data, 0, 0); offs = file->f_pos; - if (offs < pos) - { + if (offs < pos) { len = min(pos - offs, count); if(copy_to_user(buf, (page + offs), len)) return -EFAULT; @@ -325,15 +328,14 @@ static int config_get_info(char* buf, char** start, off_t offs, int len) strcpy(buf, conf_hdr); for (wandev = router_devlist; wandev && (cnt < (PROC_BUFSZ - 120)); - wandev = wandev->next) - { + wandev = wandev->next) { if (wandev->state) cnt += sprintf(&buf[cnt], "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n", wandev->name, wandev->ioport, wandev->irq, wandev->dma, - virt_to_phys(wandev->maddr), + wandev->maddr, wandev->msize, wandev->hw_opt[0], wandev->hw_opt[1], @@ -351,13 +353,16 @@ static int config_get_info(char* buf, char** start, off_t offs, int len) static int status_get_info(char* buf, char** start, off_t offs, int len) { - int cnt = sizeof(stat_hdr) - 1; + int cnt = 0; wan_device_t* wandev; - strcpy(buf, stat_hdr); + + cnt += sprintf(&buf[cnt], "\nSTATUS FOR PORT 0\n\n"); + strcpy(&buf[cnt], stat_hdr); + cnt += sizeof(stat_hdr) - 1; + for (wandev = router_devlist; wandev && (cnt < (PROC_BUFSZ - 80)); - wandev = wandev->next) - { + wandev = wandev->next) { if (!wandev->state) continue; cnt += sprintf(&buf[cnt], "%-15s|%-7s|%-9s|%-8s|%9u|%5u|%3u |", @@ -367,10 +372,10 @@ static int status_get_info(char* buf, char** start, off_t offs, int len) wandev->clocking ? 
"internal" : "external", wandev->bps, wandev->mtu, - wandev->ndev) - ; - switch (wandev->state) - { + wandev->ndev); + + switch (wandev->state) { + case WAN_UNCONFIGURED: cnt += sprintf(&buf[cnt], "%-12s\n", "unconfigured"); break; @@ -407,56 +412,64 @@ static int wandev_get_info(char* buf, char** start, off_t offs, int len) { wan_device_t* wandev = (void*)start; int cnt = 0; + int rslt = 0; if ((wandev == NULL) || (wandev->magic != ROUTER_MAGIC)) return 0; if (!wandev->state) - return sprintf(&buf[cnt], "device is not configured!\n") - ; + return sprintf(&buf[cnt], "device is not configured!\n"); /* Update device statistics */ - if (wandev->update) wandev->update(wandev); - - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "total frames received", wandev->stats.rx_packets) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "receiver overrun errors", wandev->stats.rx_over_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "CRC errors", wandev->stats.rx_crc_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "frame length errors", wandev->stats.rx_length_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "frame format errors", wandev->stats.rx_frame_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "aborted frames received", wandev->stats.rx_missed_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "reveived frames dropped", wandev->stats.rx_dropped) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "other receive errors", wandev->stats.rx_errors) - ; - cnt += sprintf(&buf[cnt], "\n%30s: %12lu\n", - "total frames transmitted", wandev->stats.tx_packets) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "aborted frames transmitted", wandev->stats.tx_aborted_errors) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "transmit frames dropped", wandev->stats.tx_dropped) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "transmit collisions", wandev->stats.collisions) - ; - cnt += sprintf(&buf[cnt], "%30s: %12lu\n", - "other 
transmit errors", wandev->stats.tx_errors) - ; - return cnt; + if (wandev->update) { + + rslt = wandev->update(wandev); + if(rslt) { + switch (rslt) { + case -EAGAIN: + return sprintf(&buf[cnt], "Device is busy!\n"); + + default: + return sprintf(&buf[cnt], + "Device is not configured!\n"); + } + } + } + + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "total packets received", wandev->stats.rx_packets); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "total packets transmitted", wandev->stats.tx_packets); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "total bytes received", wandev->stats.rx_bytes); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "total bytes transmitted", wandev->stats.tx_bytes); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "bad packets received", wandev->stats.rx_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "packet transmit problems", wandev->stats.tx_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "received frames dropped", wandev->stats.rx_dropped); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "transmit frames dropped", wandev->stats.tx_dropped); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "multicast packets received", wandev->stats.multicast); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "transmit collisions", wandev->stats.collisions); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "receive length errors", wandev->stats.rx_length_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "receiver overrun errors", wandev->stats.rx_over_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "CRC errors", wandev->stats.rx_crc_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "frame format errors (aborts)", wandev->stats.rx_frame_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "receiver fifo overrun", wandev->stats.rx_fifo_errors); + cnt += sprintf(&buf[cnt], PROC_STATS_FORMAT, + "receiver missed packet", wandev->stats.rx_missed_errors); + cnt += sprintf(&buf[cnt], 
PROC_STATS_FORMAT, + "aborted frames transmitted", wandev->stats.tx_aborted_errors); + return cnt; } /* @@ -490,3 +503,8 @@ int wanrouter_proc_delete(wan_device_t *wandev) } #endif + +/* + * End + */ + -- 2.39.5